From 1077ec2d886b9201bca504782a25636bcf904036 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 17:03:49 +0000 Subject: [PATCH 01/33] Deduplicate merger WIP --- smartfeed/examples/example_client.py | 13 +- smartfeed/manager.py | 5 +- smartfeed/schemas.py | 297 +++++++++++++++++++++-- tests/fixtures/configs.py | 30 +++ tests/fixtures/redis.py | 28 ++- tests/test_merger_append.py | 5 +- tests/test_merger_append_distribute.py | 5 +- tests/test_merger_deduplication.py | 271 +++++++++++++++++++++ tests/test_merger_percentage.py | 3 +- tests/test_merger_percentage_gradient.py | 5 +- tests/test_merger_positional.py | 7 +- tests/test_merger_view_session.py | 11 +- tests/test_parsing_config.py | 14 +- tests/test_redis_live.py | 5 +- tests/test_sub_feed.py | 8 +- tests/utils.py | 17 ++ 16 files changed, 670 insertions(+), 54 deletions(-) create mode 100644 tests/test_merger_deduplication.py create mode 100644 tests/utils.py diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index 9a421ff..8b9d6a7 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -2,7 +2,7 @@ import json from typing import Optional, Union -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, ConfigDict, Field, field_validator from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside @@ -18,13 +18,16 @@ class TestClientRequest(BaseModel): base64.urlsafe_b64encode(json.dumps({"data": {}}).encode()).decode() ) - class Config: - validate_all = True + model_config = ConfigDict(validate_default=True) - @validator("next_page") + @field_validator("next_page") + @classmethod def validate_next_page(cls, value: Union[str, FeedResultNextPage]) -> Union[str, FeedResultNextPage]: if isinstance(value, str): - return FeedResultNextPage.parse_obj(json.loads(base64.urlsafe_b64decode(value))) + payload = json.loads(base64.urlsafe_b64decode(value)) + if hasattr(FeedResultNextPage, "model_validate"): + return FeedResultNextPage.model_validate(payload) # type: ignore[attr-defined] + return FeedResultNextPage.parse_obj(payload) return value diff --git a/smartfeed/manager.py b/smartfeed/manager.py index e91bbe9..7ac06f9 100644 --- a/smartfeed/manager.py +++ b/smartfeed/manager.py @@ -20,7 +20,10 @@ def __init__(self, config: Dict, methods_dict: Dict, redis_client: Optional[Unio :param redis_client: объект клиента Redis (для конфигурации с view_session = True). 
""" - self.feed_config = FeedConfig.parse_obj(config) + if hasattr(FeedConfig, "model_validate"): + self.feed_config = FeedConfig.model_validate(config) # type: ignore[attr-defined] + else: + self.feed_config = FeedConfig.parse_obj(config) self.methods_dict = methods_dict self.redis_client = redis_client diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index 45df221..a9ef784 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1,18 +1,21 @@ +import base64 import inspect import json import logging +import zlib from abc import ABC, abstractmethod from collections import defaultdict, deque from random import shuffle from typing import Annotated, Any, Callable, Dict, List, Literal, Optional, Union, no_type_check import redis -from pydantic import BaseModel, Field, root_validator +from pydantic import BaseModel, Field, model_validator from redis.asyncio import Redis as AsyncRedis from redis.asyncio import RedisCluster as AsyncRedisCluster FeedTypes = Annotated[ Union[ + "DeduplicationMerger", "MergerAppend", "MergerAppendDistribute", "MergerPositional", @@ -501,17 +504,17 @@ class MergerPositional(BaseFeedConfigModel): positional: FeedTypes default: FeedTypes - @root_validator(skip_on_failure=True) - def validate_merger_positional(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if not values["positions"] and not all((values["start"], values["end"], values["step"])): + @model_validator(mode="after") + def validate_merger_positional(self) -> "MergerPositional": + if not self.positions and not all((self.start, self.end, self.step)): raise ValueError('Either "positions" or "start", "end", and "step" must be provided') - if values["start"] and values["positions"]: - if isinstance(values["start"], int) and values["start"] <= max(values["positions"]): + if self.start and self.positions: + if isinstance(self.start, int) and self.start <= max(self.positions): raise ValueError('"start" must be bigger than maximum value of "positions"') - if isinstance(values["start"], int) and isinstance(values["end"], int): - if values["end"] <= values["start"]: + if isinstance(self.start, int) and isinstance(self.end, int): + if self.end <= self.start: raise ValueError('"end" must be bigger than "start"') - return values + return self async def get_data( self, @@ -757,13 +760,13 @@ class MergerPercentageGradient(BaseFeedConfigModel): size_to_step: int shuffle: bool = False - @root_validator(skip_on_failure=True) - def validate_merger_percentage_gradient(cls, values: Dict[str, Any]) -> Dict[str, Any]: - if values["step"] < 1 or values["step"] > 100: + @model_validator(mode="after") + def validate_merger_percentage_gradient(self) -> "MergerPercentageGradient": + if self.step < 1 or self.step > 100: raise ValueError('"step" must be in range from 1 to 100') - if values["size_to_step"] < 1: + if self.size_to_step < 1: raise ValueError('"size_to_step" must be bigger than 1') - return values + return self async def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: """ @@ -1018,6 +1021,247 @@ async def get_data( return result +class DeduplicationMergerItem(BaseModel): + """Configuration item for DeduplicationMerger.""" + + priority: int = 0 + data: FeedTypes + + +class DeduplicationMerger(BaseFeedConfigModel): + """Merger that deduplicates items and refills to the requested limit. + + Key properties: + - Always tries to return exactly `limit` unique items if they exist upstream. + - Supports cross-page deduplication using either cursor state or Redis. 
+ - Supports explicit per-source priority; higher priority wins on same dedup key. + """ + + merger_id: str + type: Literal["merger_deduplication"] + items: List[DeduplicationMergerItem] + + dedup_key: Optional[str] = None + missing_key_policy: Literal["error", "keep", "drop"] = "error" + + state_backend: Literal["cursor", "redis"] = "cursor" + state_ttl_seconds: int = 3600 + cursor_compress: bool = True + cursor_max_keys: Optional[int] = None + + overfetch_factor: int = 2 + max_refill_loops: int = 20 + + def _normalize_key(self, value: Any) -> str: + if isinstance(value, (str, int)): + return str(value) + if isinstance(value, (dict, list)): + return json.dumps(value, sort_keys=True, default=str) + return str(value) + + def _extract_dedup_value(self, item: Any) -> Any: + if not self.dedup_key: + return item + + try: + value = item.get(self.dedup_key) + except AttributeError: + value = getattr(item, self.dedup_key, None) + + if value is None and self.missing_key_policy == "error": + raise AssertionError( + f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}" + ) + return value + + def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> List[str]: + entry = next_page.data.get(self.merger_id) + if not entry or entry.after is None: + return [] + + after = entry.after + if isinstance(after, dict) and "z" in after: + payload = base64.urlsafe_b64decode(after["z"].encode()) + raw = zlib.decompress(payload).decode() + return list(json.loads(raw)) + if isinstance(after, dict) and "seen" in after: + return list(after["seen"]) + if isinstance(after, list): + return list(after) + return [] + + def _encode_seen_for_cursor(self, seen_keys_in_order: List[str]) -> Any: + if self.cursor_max_keys is not None: + seen_keys_in_order = seen_keys_in_order[-self.cursor_max_keys :] + + if not self.cursor_compress: + return {"v": 1, "seen": seen_keys_in_order} + + raw = json.dumps(seen_keys_in_order).encode() + compressed = zlib.compress(raw) + return { + "v": 1, + "c": "zlib+base64", + "n": len(seen_keys_in_order), + "z": base64.urlsafe_b64encode(compressed).decode(), + } + + async def _redis_sismember(self, redis_client: Union[redis.Redis, AsyncRedis], key: str, member: str) -> bool: + res = redis_client.sismember(key, member) + if inspect.iscoroutine(res): + res = await res + return bool(res) + + async def _redis_sadd_and_expire( + self, + redis_client: Union[redis.Redis, AsyncRedis], + key: str, + members: List[str], + ) -> None: + if not members: + return + res = redis_client.sadd(key, *members) + if inspect.iscoroutine(res): + await res + await redis_client.expire(key, self.state_ttl_seconds) + else: + redis_client.expire(key, self.state_ttl_seconds) + + def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: + suffix = params.get("custom_deduplication_key") or params.get("custom_view_session_key") + if suffix: + return f"dedup:{self.merger_id}:{user_id}:{suffix}" + return f"dedup:{self.merger_id}:{user_id}" + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + if limit <= 0: + return FeedResult(data=[], next_page=next_page, has_next_page=False) + + # Treat an explicit "page 0" (or missing cursor for this merger) as a fresh session. + # This allows clients to restart the feed (e.g., full reload) without carrying over seen state. 
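+        # Illustrative cursor shapes (merger_id "my_dedup" is a hypothetical example, not from any config above):
+        #   FeedResultNextPage(data={})                                                 -> fresh session
+        #   data={"my_dedup": FeedResultNextPageInside(page=0, after=<previous state>)} -> explicit restart
+        #   data={"my_dedup": FeedResultNextPageInside(page=2, after={"z": "..."})}     -> continue session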
+ requested_page = next_page.data.get(self.merger_id).page if self.merger_id in next_page.data else None + is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) + + if self.state_backend == "redis" and not redis_client: + raise ValueError("Redis client must be provided if using DeduplicationMerger with state_backend=redis") + + if hasattr(next_page, "model_copy"): + working_next_page = next_page.model_copy(deep=True) # type: ignore[attr-defined] + else: + working_next_page = next_page.copy(deep=True) + sorted_items = sorted(self.items, key=lambda x: x.priority, reverse=True) + + seen_keys_in_order: List[str] = [] + seen_cursor_set: set[str] = set() + if self.state_backend == "cursor" and not is_fresh_session: + seen_keys_in_order = self._decode_seen_from_cursor(next_page) + seen_cursor_set = set(seen_keys_in_order) + + redis_state_key = "" + if self.state_backend == "redis" and redis_client: + redis_state_key = self._build_redis_state_key(user_id=user_id, params=params) + if is_fresh_session: + # Drop state for a full restart. + deleted = redis_client.delete(redis_state_key) + if inspect.iscoroutine(deleted): + await deleted + + result_items: List[Any] = [] + accepted: Dict[str, Dict[str, Any]] = {} + redis_new_members: List[str] = [] + any_has_next_page = False + + loops = 0 + while len(result_items) < limit and loops < self.max_refill_loops: + loops += 1 + before_len = len(result_items) + + for item in sorted_items: + remaining = limit - len(result_items) + if remaining <= 0: + break + + request_limit = max(1, remaining * max(1, self.overfetch_factor)) + item_result = await item.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=request_limit, + next_page=working_next_page, + redis_client=redis_client, + **params, + ) + + any_has_next_page = any_has_next_page or item_result.has_next_page + working_next_page.data.update(item_result.next_page.data) + + for entity in item_result.data: + raw_value = self._extract_dedup_value(entity) + if raw_value is None: + if self.missing_key_policy == "drop": + continue + if self.missing_key_policy == "keep": + # Make a unique key per object instance representation. 
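+                            # Note: id() is only stable for the lifetime of this request, so "keep"
+                            # items stay distinct within a page but are not deduplicated across pages.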
+ raw_value = ("__missing__", id(entity)) + + key = self._normalize_key(raw_value) + + if key in accepted: + if item.priority > accepted[key]["priority"]: + result_items[accepted[key]["index"]] = entity + accepted[key]["priority"] = item.priority + continue + + if self.state_backend == "cursor": + if key in seen_cursor_set: + continue + else: + assert redis_client is not None + if await self._redis_sismember(redis_client, redis_state_key, key): + continue + + accepted[key] = {"priority": item.priority, "index": len(result_items)} + result_items.append(entity) + + if self.state_backend == "cursor": + seen_cursor_set.add(key) + seen_keys_in_order.append(key) + else: + redis_new_members.append(key) + + if len(result_items) >= limit: + break + + if len(result_items) >= limit: + break + + if len(result_items) == before_len: + break + + if self.state_backend == "redis" and redis_client: + await self._redis_sadd_and_expire(redis_client, redis_state_key, redis_new_members) + + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + merger_after: Any = None + if self.state_backend == "cursor": + merger_after = self._encode_seen_for_cursor(seen_keys_in_order) + + if hasattr(working_next_page, "model_copy"): + result_next_page = working_next_page.model_copy(deep=True) # type: ignore[attr-defined] + else: + result_next_page = working_next_page.copy(deep=True) + result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) + + return FeedResult(data=result_items, next_page=result_next_page, has_next_page=any_has_next_page) + + class SubFeed(BaseFeedConfigModel): """ Модель субфида. @@ -1122,11 +1366,20 @@ class FeedConfig(BaseModel): # Update Forward Refs -MergerPositional.update_forward_refs() -MergerPercentage.update_forward_refs() -SubFeed.update_forward_refs() -MergerPercentageItem.update_forward_refs() -MergerAppend.update_forward_refs() -MergerAppendDistribute.update_forward_refs() -MergerPercentageGradient.update_forward_refs() -MergerViewSession.update_forward_refs() +def _rebuild_model(model: Any) -> None: + if hasattr(model, "model_rebuild"): + model.model_rebuild() # type: ignore[attr-defined] + else: + model.update_forward_refs() # type: ignore[attr-defined] + + +_rebuild_model(MergerPositional) +_rebuild_model(MergerPercentage) +_rebuild_model(SubFeed) +_rebuild_model(MergerPercentageItem) +_rebuild_model(MergerAppend) +_rebuild_model(MergerAppendDistribute) +_rebuild_model(MergerPercentageGradient) +_rebuild_model(MergerViewSession) +_rebuild_model(DeduplicationMergerItem) +_rebuild_model(DeduplicationMerger) diff --git a/tests/fixtures/configs.py b/tests/fixtures/configs.py index 8c96e4e..a982e3d 100644 --- a/tests/fixtures/configs.py +++ b/tests/fixtures/configs.py @@ -86,3 +86,33 @@ }, }, } + + +PARSING_DEDUP_CONFIG_FIXTURE = { + "version": "1", + "feed": { + "merger_id": "merger_deduplication_parsing_example", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "items": [ + { + "priority": 100, + "data": { + "subfeed_id": "subfeed_dedup_priority_high", + "type": "subfeed", + "method_name": "posted", + }, + }, + { + "priority": 0, + "data": { + "subfeed_id": "subfeed_dedup_priority_low", + "type": "subfeed", + "method_name": "posted", + }, + }, + ], + }, +} diff --git a/tests/fixtures/redis.py b/tests/fixtures/redis.py index b98695e..2c07678 100644 --- a/tests/fixtures/redis.py +++ b/tests/fixtures/redis.py @@ -1,10 +1,30 @@ import pytest +import 
pytest_asyncio import redis from redis.asyncio import Redis as AsyncRedis -@pytest.fixture(scope="function") -def redis_client(request): +@pytest_asyncio.fixture(scope="function") +async def redis_client(request): + """Provide a Redis client for tests. + + If Redis is not available on localhost:6379, skip tests that depend on it. + """ + if request.param == "async": - return AsyncRedis(host="localhost", port=6379) - return redis.Redis(host="localhost", port=6379, db=0) + client = AsyncRedis(host="localhost", port=6379) + try: + await client.ping() + except Exception: # pragma: no cover + pytest.skip("Redis is not available on localhost:6379") + yield client + await client.aclose() + return + + client = redis.Redis(host="localhost", port=6379, db=0) + try: + client.ping() + except Exception: # pragma: no cover + pytest.skip("Redis is not available on localhost:6379") + yield client + client.close() diff --git a/tests/test_merger_append.py b/tests/test_merger_append.py index e9db5c7..309ea82 100644 --- a/tests/test_merger_append.py +++ b/tests/test_merger_append.py @@ -3,6 +3,7 @@ from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerAppend from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_APPEND_CONFIG +from tests.utils import parse_model @pytest.mark.asyncio @@ -11,7 +12,7 @@ async def test_merger_append() -> None: Тест для проверки получения данных из append мерджера. """ - merger_append = MergerAppend.parse_obj(MERGER_APPEND_CONFIG) + merger_append = parse_model(MergerAppend, MERGER_APPEND_CONFIG) merger_append_res = await merger_append.get_data( methods_dict=METHODS_DICT, limit=11, @@ -28,7 +29,7 @@ async def test_merger_append_with_item_1_page_2() -> None: Тест для проверки получения данных из append мерджера с курсором пагинации первого субфида. """ - merger_append = MergerAppend.parse_obj(MERGER_APPEND_CONFIG) + merger_append = parse_model(MergerAppend, MERGER_APPEND_CONFIG) merger_append_res = await merger_append.get_data( methods_dict=METHODS_DICT, limit=11, diff --git a/tests/test_merger_append_distribute.py b/tests/test_merger_append_distribute.py index 6bb1782..bc4878b 100644 --- a/tests/test_merger_append_distribute.py +++ b/tests/test_merger_append_distribute.py @@ -3,6 +3,7 @@ from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerAppendDistribute from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_APPEND_DISTRIBUTE_CONFIG +from tests.utils import parse_model @pytest.mark.asyncio @@ -11,7 +12,7 @@ async def test_merger_disturbed_append() -> None: Тест для проверки получения данных из append мерджера. """ - merger_distributed = MergerAppendDistribute.parse_obj(MERGER_APPEND_DISTRIBUTE_CONFIG) + merger_distributed = parse_model(MergerAppendDistribute, MERGER_APPEND_DISTRIBUTE_CONFIG) merger_distributed_res = await merger_distributed.get_data( methods_dict=METHODS_DICT, limit=20, @@ -31,7 +32,7 @@ async def test_merger_append_with_item_1_page_2() -> None: """ Тест для проверки получения данных из append мерджера с курсором пагинации первого субфида. 
""" - merger_distributed = MergerAppendDistribute.parse_obj(MERGER_APPEND_DISTRIBUTE_CONFIG) + merger_distributed = parse_model(MergerAppendDistribute, MERGER_APPEND_DISTRIBUTE_CONFIG) merger_distributed_res = await merger_distributed.get_data( methods_dict=METHODS_DICT, limit=11, diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py new file mode 100644 index 0000000..5e16c45 --- /dev/null +++ b/tests/test_merger_deduplication.py @@ -0,0 +1,271 @@ +import inspect + +import pytest + +from smartfeed.schemas import ( + DeduplicationMerger, + FeedResultClient, + FeedResultNextPage, + FeedResultNextPageInside, +) + +from tests.fixtures.redis import redis_client # noqa: F401 +from tests.utils import parse_model + + +def make_offset_paged_method(items): + async def _method(user_id, limit, next_page): # pylint: disable=unused-argument + offset = int(next_page.after or 0) + result_data = items[offset : offset + limit] + next_page.after = offset + len(result_data) + next_page.page += 1 + has_next_page = (offset + len(result_data)) < len(items) + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) + + return _method + + +@pytest.mark.asyncio +async def test_deduplication_merger_cursor_priority_and_cross_page() -> None: + low_items = [ + {"id": 1, "src": "low"}, + {"id": 2, "src": "low"}, + {"id": 3, "src": "low"}, + {"id": 4, "src": "low"}, + {"id": 5, "src": "low"}, + # repeats later (cross-page duplicates) + {"id": 3, "src": "low"}, + {"id": 4, "src": "low"}, + {"id": 6, "src": "low"}, + {"id": 7, "src": "low"}, + {"id": 8, "src": "low"}, + {"id": 9, "src": "low"}, + {"id": 10, "src": "low"}, + ] + high_items = [ + {"id": 3, "src": "high"}, + {"id": 4, "src": "high"}, + ] + + methods_dict = { + "low": make_offset_paged_method(low_items), + "high": make_offset_paged_method(high_items), + } + + config = { + "merger_id": "dedup_example", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "overfetch_factor": 3, + "items": [ + { + "priority": 100, + "data": {"subfeed_id": "sf_high", "type": "subfeed", "method_name": "high"}, + }, + { + "priority": 0, + "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low"}, + }, + ], + } + + merger = parse_model(DeduplicationMerger, config) + + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res_1.data) == 5 + ids_1 = [x["id"] for x in res_1.data] + assert len(ids_1) == len(set(ids_1)) + # Priority: id 3 and 4 must come from high + for x in res_1.data: + if x["id"] in {3, 4}: + assert x["src"] == "high" + + # Next page should not repeat 3/4 even though low repeats them later. + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=res_1.next_page, + ) + + ids_2 = [x["id"] for x in res_2.data] + assert not (set(ids_1) & set(ids_2)) + + # Ensure merger stores cursor state (compressed) in its own after. 
+ assert "dedup_example" in res_2.next_page.data + assert isinstance(res_2.next_page.data["dedup_example"].after, dict) + assert "z" in res_2.next_page.data["dedup_example"].after + + +@pytest.mark.asyncio +async def test_deduplication_merger_refill_to_limit() -> None: + dup_items = [ + {"id": 1}, + {"id": 1}, + {"id": 1}, + {"id": 1}, + {"id": 1}, + {"id": 2}, + {"id": 3}, + {"id": 4}, + {"id": 5}, + {"id": 6}, + ] + + methods_dict = { + "dups": make_offset_paged_method(dup_items), + } + + config = { + "merger_id": "dedup_refill", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "overfetch_factor": 4, + "max_refill_loops": 10, + "items": [ + { + "priority": 0, + "data": {"subfeed_id": "sf_dups", "type": "subfeed", "method_name": "dups"}, + } + ], + } + + merger = parse_model(DeduplicationMerger, config) + + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage(data={}), + ) + + assert [x["id"] for x in res.data] == [1, 2, 3, 4, 5] + + +@pytest.mark.asyncio +async def test_deduplication_merger_page_zero_resets_cursor_state() -> None: + items = [{"id": i} for i in range(1, 50)] + methods_dict = {"stream": make_offset_paged_method(items)} + + config = { + "merger_id": "dedup_reset", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "overfetch_factor": 2, + "items": [ + { + "priority": 0, + "data": {"subfeed_id": "sf_stream", "type": "subfeed", "method_name": "stream"}, + } + ], + } + + merger = parse_model(DeduplicationMerger, config) + + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage(data={}), + ) + assert [x["id"] for x in res_1.data] == [1, 2, 3, 4, 5] + + # Simulate a full reload: page 0 requested again. Even if the client mistakenly + # keeps the previous "after" payload, we start a new session. 
+ res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage( + data={ + "dedup_reset": FeedResultNextPageInside(page=0, after=res_1.next_page.data["dedup_reset"].after) + } + ), + ) + + assert [x["id"] for x in res_2.data] == [1, 2, 3, 4, 5] + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_redis_backend(redis_client) -> None: + # This dataset repeats ids across pages (sliding window style) + items = [ + {"id": 1}, + {"id": 2}, + {"id": 3}, + {"id": 2}, + {"id": 3}, + {"id": 4}, + {"id": 5}, + {"id": 6}, + {"id": 4}, + {"id": 7}, + {"id": 8}, + ] + + methods_dict = {"stream": make_offset_paged_method(items)} + + config = { + "merger_id": "dedup_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 4, + "items": [ + { + "priority": 0, + "data": {"subfeed_id": "sf_stream", "type": "subfeed", "method_name": "stream"}, + } + ], + } + + merger = parse_model(DeduplicationMerger, config) + + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=4, + next_page=FeedResultNextPage(data={}), + redis_client=redis_client, + custom_deduplication_key="t1", + ) + + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=4, + next_page=res_1.next_page, + redis_client=redis_client, + custom_deduplication_key="t1", + ) + + ids_1 = [x["id"] for x in res_1.data] + ids_2 = [x["id"] for x in res_2.data] + + assert len(ids_1) == len(set(ids_1)) + assert len(ids_2) == len(set(ids_2)) + assert not (set(ids_1) & set(ids_2)) + + # Redis backend should not store seen ids in cursor after. + assert "dedup_redis" in res_2.next_page.data + assert res_2.next_page.data["dedup_redis"].after is None + + # Ensure fixture works for both sync/async redis. + key = "dedup:dedup_redis:u:t1" + members = redis_client.smembers(key) + if inspect.iscoroutine(members): + members = await members + assert len(members) >= len(set(ids_1 + ids_2)) diff --git a/tests/test_merger_percentage.py b/tests/test_merger_percentage.py index e5ab76e..89e225e 100644 --- a/tests/test_merger_percentage.py +++ b/tests/test_merger_percentage.py @@ -3,6 +3,7 @@ from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerPercentage from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_PERCENTAGE_CONFIG +from tests.utils import parse_model @pytest.mark.asyncio @@ -11,7 +12,7 @@ async def test_merger_percentage() -> None: Тест для проверки получения данных из процентного мерджера. 
""" - merger_percentage = MergerPercentage.parse_obj(MERGER_PERCENTAGE_CONFIG) + merger_percentage = parse_model(MergerPercentage, MERGER_PERCENTAGE_CONFIG) merger_percentage_res = await merger_percentage.get_data( methods_dict=METHODS_DICT, limit=10, diff --git a/tests/test_merger_percentage_gradient.py b/tests/test_merger_percentage_gradient.py index eaaba9c..73bc769 100644 --- a/tests/test_merger_percentage_gradient.py +++ b/tests/test_merger_percentage_gradient.py @@ -3,6 +3,7 @@ from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerPercentageGradient from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_PERCENTAGE_GRADIENT_CONFIG +from tests.utils import parse_model @pytest.mark.asyncio @@ -11,7 +12,7 @@ async def test_merger_percentage_gradient() -> None: Тест для проверки получения данных из процентного мерджера с градиентом. """ - merger_percentage_gradient = MergerPercentageGradient.parse_obj(MERGER_PERCENTAGE_GRADIENT_CONFIG) + merger_percentage_gradient = parse_model(MergerPercentageGradient, MERGER_PERCENTAGE_GRADIENT_CONFIG) merger_percentage_gradient_res = await merger_percentage_gradient.get_data( methods_dict=METHODS_DICT, limit=10, @@ -44,7 +45,7 @@ async def test_merger_percentage_gradient_next_page() -> None: Тест для проверки получения данных из процентного мерджера с градиентом после изменения процента на другой странице. """ - merger_percentage_gradient = MergerPercentageGradient.parse_obj(MERGER_PERCENTAGE_GRADIENT_CONFIG) + merger_percentage_gradient = parse_model(MergerPercentageGradient, MERGER_PERCENTAGE_GRADIENT_CONFIG) merger_percentage_gradient_res = await merger_percentage_gradient.get_data( methods_dict=METHODS_DICT, limit=10, diff --git a/tests/test_merger_positional.py b/tests/test_merger_positional.py index c0f3815..370f770 100644 --- a/tests/test_merger_positional.py +++ b/tests/test_merger_positional.py @@ -3,6 +3,7 @@ from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerPositional from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_POSITIONAL_CONFIG +from tests.utils import parse_model @pytest.mark.asyncio @@ -11,7 +12,7 @@ async def test_merger_positional_with_positions() -> None: Тест для проверки получения данных из позиционного мерджера на основе позиций в конфигурации. """ - merger_positional = MergerPositional.parse_obj(MERGER_POSITIONAL_CONFIG) + merger_positional = parse_model(MergerPositional, MERGER_POSITIONAL_CONFIG) merger_positional_res = await merger_positional.get_data( methods_dict=METHODS_DICT, limit=9, @@ -33,7 +34,7 @@ async def test_merger_positional_with_step() -> None: Тест для проверки получения данных из позиционного мерджера на основе шагов в конфигурации. """ - merger_positional = MergerPositional.parse_obj(MERGER_POSITIONAL_CONFIG) + merger_positional = parse_model(MergerPositional, MERGER_POSITIONAL_CONFIG) merger_positional_res = await merger_positional.get_data( methods_dict=METHODS_DICT, limit=10, @@ -56,7 +57,7 @@ async def test_merger_positional_with_empty_default() -> None: Тест для проверки получения данных из позиционного мерджера на основе шагов в конфигурации. 
""" - merger_positional = MergerPositional.parse_obj(MERGER_POSITIONAL_CONFIG) + merger_positional = parse_model(MergerPositional, MERGER_POSITIONAL_CONFIG) merger_positional.default.method_name = "empty" merger_positional_res = await merger_positional.get_data( methods_dict=METHODS_DICT, diff --git a/tests/test_merger_view_session.py b/tests/test_merger_view_session.py index 78a0566..bd02bfe 100644 --- a/tests/test_merger_view_session.py +++ b/tests/test_merger_view_session.py @@ -7,6 +7,7 @@ from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_VIEW_SESSION_CONFIG, MERGER_VIEW_SESSION_DUPS_CONFIG from tests.fixtures.redis import redis_client +from tests.utils import parse_model @pytest.mark.asyncio @@ -15,7 +16,7 @@ async def test_merger_view_session_no_redis() -> None: Тест для проверки получения данных из мерджера с кэшированием без клиента Redis. """ - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) with pytest.raises(ValueError): await merger_vs.get_data( methods_dict=METHODS_DICT, @@ -32,7 +33,7 @@ async def test_merger_view_session(redis_client) -> None: Тест для проверки получения данных из мерджера с кэшированием. """ - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) merger_vs_res = await merger_vs.get_data( methods_dict=METHODS_DICT, limit=10, @@ -59,7 +60,7 @@ async def test_merger_view_session_custom_key(redis_client) -> None: Тест для проверки получения данных из мерджера с кэшированием по ключу с кастомным постфиксом. """ - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) # Даем дополнительный параметр, который мерджер добавит в ключ кэша. merger_vs_res = await merger_vs.get_data( methods_dict=METHODS_DICT, @@ -88,7 +89,7 @@ async def test_merger_view_session_next_page(redis_client) -> None: Тест для проверки получения данных следующей страницы из мерджера с кэшированием. """ - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) merger_vs_res = await merger_vs.get_data( methods_dict=METHODS_DICT, limit=10, @@ -113,7 +114,7 @@ async def test_merger_view_session_next_page(redis_client) -> None: @pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio async def test_merger_view_session_deduplication(redis_client) -> None: - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_DUPS_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_DUPS_CONFIG) merger_vs_res = await merger_vs.get_data( methods_dict=METHODS_DICT, limit=10, diff --git a/tests/test_parsing_config.py b/tests/test_parsing_config.py index 4d789cb..10cc8f1 100644 --- a/tests/test_parsing_config.py +++ b/tests/test_parsing_config.py @@ -2,6 +2,7 @@ from smartfeed.manager import FeedManager from smartfeed.schemas import ( + DeduplicationMerger, FeedConfig, MergerAppend, MergerPercentage, @@ -11,7 +12,7 @@ MergerViewSession, SubFeed, ) -from tests.fixtures.configs import METHODS_DICT, PARSING_CONFIG_FIXTURE +from tests.fixtures.configs import METHODS_DICT, PARSING_CONFIG_FIXTURE, PARSING_DEDUP_CONFIG_FIXTURE @pytest.mark.asyncio @@ -45,3 +46,14 @@ async def test_parsing_config() -> None: # SubFeed with Raise Exception False. 
assert isinstance(feed_manager.feed_config.feed.default.items[0].data, SubFeed) assert feed_manager.feed_config.feed.default.items[0].data.raise_error is False + + +@pytest.mark.asyncio +async def test_parsing_config_deduplication_merger() -> None: + feed_manager = FeedManager(config=PARSING_DEDUP_CONFIG_FIXTURE, methods_dict=METHODS_DICT) + + assert isinstance(feed_manager.feed_config, FeedConfig) + assert isinstance(feed_manager.feed_config.feed, DeduplicationMerger) + assert len(feed_manager.feed_config.feed.items) == 2 + assert feed_manager.feed_config.feed.items[0].priority == 100 + assert isinstance(feed_manager.feed_config.feed.items[0].data, SubFeed) diff --git a/tests/test_redis_live.py b/tests/test_redis_live.py index 1a23839..85effa2 100644 --- a/tests/test_redis_live.py +++ b/tests/test_redis_live.py @@ -8,6 +8,7 @@ from smartfeed.schemas import FeedResultNextPage, MergerViewSession from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_VIEW_SESSION_CONFIG +from tests.utils import parse_model class RedisReplicationSimulator: @@ -63,7 +64,7 @@ async def test_redis_replication_delay_problem(): # Используем симулятор задержки репликации redis_client = RedisReplicationSimulator(real_client) - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) print("\n=== Демонстрация проблемы с задержкой репликации ===") @@ -117,7 +118,7 @@ async def test_redis_multiple_requests(): real_client.delete(test_key) redis_client = RedisReplicationSimulator(real_client) - merger_vs = MergerViewSession.parse_obj(MERGER_VIEW_SESSION_CONFIG) + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) print("\n=== Тест множественных запросов ===") diff --git a/tests/test_sub_feed.py b/tests/test_sub_feed.py index 7da1924..11645d4 100644 --- a/tests/test_sub_feed.py +++ b/tests/test_sub_feed.py @@ -16,7 +16,7 @@ async def test_sub_feed() -> None: Тест для проверки получения данных из субфида (без параметров). """ - sub_feed = SubFeed.parse_obj(SUBFEED_CONFIG) + sub_feed = SubFeed.model_validate(SUBFEED_CONFIG) sub_feed_data = await sub_feed.get_data( methods_dict=METHODS_DICT, limit=15, @@ -33,7 +33,7 @@ async def test_sub_feed_with_params() -> None: Тест для проверки получения данных из субфида (с параметрами). """ - sub_feed = SubFeed.parse_obj(SUBFEED_WITH_PARAMS_CONFIG) + sub_feed = SubFeed.model_validate(SUBFEED_WITH_PARAMS_CONFIG) sub_feed_data = await sub_feed.get_data( methods_dict=METHODS_DICT, limit=15, @@ -50,7 +50,7 @@ async def test_sub_feed_raise_error() -> None: Тест для проверки получения данных из субфида (без параметров). """ - sub_feed = SubFeed.parse_obj(SUBFEED_CONFIG_RAISE_ERROR) + sub_feed = SubFeed.model_validate(SUBFEED_CONFIG_RAISE_ERROR) with pytest.raises(Exception): await sub_feed.get_data( @@ -67,7 +67,7 @@ async def test_sub_feed_no_raise_error() -> None: Тест для проверки получения данных из субфида (без параметров). 
""" - sub_feed = SubFeed.parse_obj(SUBFEED_CONFIG_NO_RAISE_ERROR) + sub_feed = SubFeed.model_validate(SUBFEED_CONFIG_NO_RAISE_ERROR) sub_feed_data = await sub_feed.get_data( methods_dict=METHODS_DICT, limit=15, diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..f38689f --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from typing import Any, Dict, Type, TypeVar + + +T = TypeVar("T") + + +def parse_model(model_cls: Type[T], obj: Dict[str, Any]) -> T: + """Parse a dict into a Pydantic model. + + Uses Pydantic v2 `model_validate` when available, otherwise falls back to v1 `parse_obj`. + """ + + if hasattr(model_cls, "model_validate"): + return model_cls.model_validate(obj) # type: ignore[attr-defined] + return model_cls.parse_obj(obj) # type: ignore[attr-defined] From 1967175f8dd9bafe6682b89444ac56d3aba840e6 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 17:07:25 +0000 Subject: [PATCH 02/33] Name fix. --- smartfeed/schemas.py | 16 ++++++++-------- tests/test_merger_deduplication.py | 10 +++++----- tests/test_parsing_config.py | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index a9ef784..5a86a3a 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -15,7 +15,7 @@ FeedTypes = Annotated[ Union[ - "DeduplicationMerger", + "MergerDeduplication", "MergerAppend", "MergerAppendDistribute", "MergerPositional", @@ -1021,14 +1021,14 @@ async def get_data( return result -class DeduplicationMergerItem(BaseModel): - """Configuration item for DeduplicationMerger.""" +class MergerDeduplicationItem(BaseModel): + """Configuration item for MergerDeduplication.""" priority: int = 0 data: FeedTypes -class DeduplicationMerger(BaseFeedConfigModel): +class MergerDeduplication(BaseFeedConfigModel): """Merger that deduplicates items and refills to the requested limit. 
Key properties: @@ -1039,7 +1039,7 @@ class DeduplicationMerger(BaseFeedConfigModel): merger_id: str type: Literal["merger_deduplication"] - items: List[DeduplicationMergerItem] + items: List[MergerDeduplicationItem] dedup_key: Optional[str] = None missing_key_policy: Literal["error", "keep", "drop"] = "error" @@ -1151,7 +1151,7 @@ async def get_data( is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) if self.state_backend == "redis" and not redis_client: - raise ValueError("Redis client must be provided if using DeduplicationMerger with state_backend=redis") + raise ValueError("Redis client must be provided if using MergerDeduplication with state_backend=redis") if hasattr(next_page, "model_copy"): working_next_page = next_page.model_copy(deep=True) # type: ignore[attr-defined] @@ -1381,5 +1381,5 @@ def _rebuild_model(model: Any) -> None: _rebuild_model(MergerAppendDistribute) _rebuild_model(MergerPercentageGradient) _rebuild_model(MergerViewSession) -_rebuild_model(DeduplicationMergerItem) -_rebuild_model(DeduplicationMerger) +_rebuild_model(MergerDeduplicationItem) +_rebuild_model(MergerDeduplication) \ No newline at end of file diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 5e16c45..26b4c78 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -3,7 +3,7 @@ import pytest from smartfeed.schemas import ( - DeduplicationMerger, + MergerDeduplication, FeedResultClient, FeedResultNextPage, FeedResultNextPageInside, @@ -71,7 +71,7 @@ async def test_deduplication_merger_cursor_priority_and_cross_page() -> None: ], } - merger = parse_model(DeduplicationMerger, config) + merger = parse_model(MergerDeduplication, config) res_1 = await merger.get_data( methods_dict=methods_dict, @@ -139,7 +139,7 @@ async def test_deduplication_merger_refill_to_limit() -> None: ], } - merger = parse_model(DeduplicationMerger, config) + merger = parse_model(MergerDeduplication, config) res = await merger.get_data( methods_dict=methods_dict, @@ -171,7 +171,7 @@ async def test_deduplication_merger_page_zero_resets_cursor_state() -> None: ], } - merger = parse_model(DeduplicationMerger, config) + merger = parse_model(MergerDeduplication, config) res_1 = await merger.get_data( methods_dict=methods_dict, @@ -232,7 +232,7 @@ async def test_deduplication_merger_redis_backend(redis_client) -> None: ], } - merger = parse_model(DeduplicationMerger, config) + merger = parse_model(MergerDeduplication, config) res_1 = await merger.get_data( methods_dict=methods_dict, diff --git a/tests/test_parsing_config.py b/tests/test_parsing_config.py index 10cc8f1..291971b 100644 --- a/tests/test_parsing_config.py +++ b/tests/test_parsing_config.py @@ -2,7 +2,7 @@ from smartfeed.manager import FeedManager from smartfeed.schemas import ( - DeduplicationMerger, + MergerDeduplication, FeedConfig, MergerAppend, MergerPercentage, @@ -53,7 +53,7 @@ async def test_parsing_config_deduplication_merger() -> None: feed_manager = FeedManager(config=PARSING_DEDUP_CONFIG_FIXTURE, methods_dict=METHODS_DICT) assert isinstance(feed_manager.feed_config, FeedConfig) - assert isinstance(feed_manager.feed_config.feed, DeduplicationMerger) + assert isinstance(feed_manager.feed_config.feed, MergerDeduplication) assert len(feed_manager.feed_config.feed.items) == 2 assert feed_manager.feed_config.feed.items[0].priority == 100 assert isinstance(feed_manager.feed_config.feed.items[0].data, SubFeed) From 
60f1a77f963c4a3b08aacca9b6987bedf5d78ca9 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 17:47:46 +0000 Subject: [PATCH 03/33] More tests. --- smartfeed/schemas.py | 45 ++ tests/test_merger_deduplication.py | 754 ++++++++++++++++++++++++++++- 2 files changed, 795 insertions(+), 4 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index 5a86a3a..c5e58bb 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1052,6 +1052,43 @@ class MergerDeduplication(BaseFeedConfigModel): overfetch_factor: int = 2 max_refill_loops: int = 20 + def _collect_descendant_cursor_keys(self, feed: BaseFeedConfigModel) -> set[str]: + keys: set[str] = set() + + subfeed_id = getattr(feed, "subfeed_id", None) + if isinstance(subfeed_id, str) and subfeed_id: + keys.add(subfeed_id) + + merger_id = getattr(feed, "merger_id", None) + if isinstance(merger_id, str) and merger_id: + keys.add(merger_id) + + # Recurse into known child containers across existing feed types. + child: Any + for attr_name in ("data", "positional", "default"): + child = getattr(feed, attr_name, None) + if isinstance(child, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(child)) + + for attr_name in ("item_from", "item_to"): + child = getattr(feed, attr_name, None) + inner = getattr(child, "data", None) + if isinstance(inner, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(inner)) + + items = getattr(feed, "items", None) + if isinstance(items, list): + for item in items: + if isinstance(item, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(item)) + continue + + inner = getattr(item, "data", None) + if isinstance(inner, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(inner)) + + return keys + def _normalize_key(self, value: Any) -> str: if isinstance(value, (str, int)): return str(value) @@ -1157,6 +1194,14 @@ async def get_data( working_next_page = next_page.model_copy(deep=True) # type: ignore[attr-defined] else: working_next_page = next_page.copy(deep=True) + + if is_fresh_session: + # Reset cursors for all descendants under this merger so upstream nodes also restart. 
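+            # e.g. (illustrative names) an item whose data is a merger_append "inner" over subfeeds
+            # "sf_a" and "sf_b" gets "inner", "sf_a" and "sf_b" removed from the incoming cursor data.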
+ descendant_keys: set[str] = set() + for item in self.items: + descendant_keys.update(self._collect_descendant_cursor_keys(item.data)) + for key in descendant_keys: + working_next_page.data.pop(key, None) sorted_items = sorted(self.items, key=lambda x: x.priority, reverse=True) seen_keys_in_order: List[str] = [] diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 26b4c78..cede8f3 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -13,10 +13,13 @@ from tests.utils import parse_model -def make_offset_paged_method(items): +def make_offset_paged_method(items, *, max_per_call=None): async def _method(user_id, limit, next_page): # pylint: disable=unused-argument offset = int(next_page.after or 0) - result_data = items[offset : offset + limit] + effective_limit = limit + if isinstance(max_per_call, int) and max_per_call > 0: + effective_limit = min(effective_limit, max_per_call) + result_data = items[offset : offset + effective_limit] next_page.after = offset + len(result_data) next_page.page += 1 has_next_page = (offset + len(result_data)) < len(items) @@ -25,6 +28,90 @@ async def _method(user_id, limit, next_page): # pylint: disable=unused-argument return _method +async def _run_two_pages( + *, + config, + methods_dict, + user_id, + limit, + redis_client_instance=None, + **params, +): + merger = parse_model(MergerDeduplication, config) + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=FeedResultNextPage(data={}), + redis_client=redis_client_instance, + **params, + ) + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=res_1.next_page, + redis_client=redis_client_instance, + **params, + ) + return res_1, res_2 + + +def _assert_dedup_backend_state(*, res, merger_id: str, state_backend: str) -> None: + assert merger_id in res.next_page.data + if state_backend == "cursor": + assert isinstance(res.next_page.data[merger_id].after, dict) + else: + assert res.next_page.data[merger_id].after is None + + +def _ids(data): + return [x["id"] for x in data] + + +def _assert_two_pages_no_overlap(res_1, res_2): + ids_1 = set(_ids(res_1.data)) + ids_2 = set(_ids(res_2.data)) + assert len(ids_1) == len(res_1.data) + assert len(ids_2) == len(res_2.data) + assert not (ids_1 & ids_2) + + +def _assert_cursor_monotonic_if_present(res_1, res_2, keys): + """Assert that cursor values monotonically advance for keys that are present. + + MergerDeduplication may stop early once it has enough unique items, so a + descendant might not be called on a given page. This helper only asserts + monotonicity when the cursor key exists in `res_1`. + """ + + for key in keys: + if key not in res_1.next_page.data: + continue + + assert key in res_2.next_page.data + + after_1 = res_1.next_page.data[key].after + after_2 = res_2.next_page.data[key].after + + if after_1 is None or after_2 is None: + continue + + if isinstance(after_1, int) and isinstance(after_2, int): + assert after_2 >= after_1 + continue + + # Merger cursors can be structured (dict), just require presence. + if isinstance(after_1, dict) and isinstance(after_2, dict): + continue + + # If values are comparable, enforce monotonicity; otherwise don't fail. 
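+        # e.g. integer subfeed offsets 3 -> 7 pass; non-comparable cursor shapes are ignored here.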
+ try: + assert after_2 >= after_1 + except TypeError: + pass + + @pytest.mark.asyncio async def test_deduplication_merger_cursor_priority_and_cross_page() -> None: low_items = [ @@ -182,14 +269,15 @@ async def test_deduplication_merger_page_zero_resets_cursor_state() -> None: assert [x["id"] for x in res_1.data] == [1, 2, 3, 4, 5] # Simulate a full reload: page 0 requested again. Even if the client mistakenly - # keeps the previous "after" payload, we start a new session. + # keeps the previous cursor payloads (including subfeed cursors), we start a new session. res_2 = await merger.get_data( methods_dict=methods_dict, user_id="u", limit=5, next_page=FeedResultNextPage( data={ - "dedup_reset": FeedResultNextPageInside(page=0, after=res_1.next_page.data["dedup_reset"].after) + "dedup_reset": FeedResultNextPageInside(page=0, after=res_1.next_page.data["dedup_reset"].after), + "sf_stream": res_1.next_page.data["sf_stream"], } ), ) @@ -269,3 +357,661 @@ async def test_deduplication_merger_redis_backend(redis_client) -> None: if inspect.iscoroutine(members): members = await members assert len(members) >= len(set(ids_1 + ids_2)) + + +@pytest.mark.asyncio +async def test_deduplication_merger_priority_replacement_across_loops_cursor_backend() -> None: + # This test forces the higher-priority source to surface a duplicate only on a later call, + # so we exercise the in-page replacement logic. + # Important: dedup calls sources in descending priority. To ensure we exercise + # replacement, we need a lower-priority source to introduce id=5 *before* + # the high-priority source sees id=5 on a later refill loop. + low_items = [ + {"id": 5, "src": "low"}, + {"id": 6, "src": "low"}, + {"id": 7, "src": "low"}, + {"id": 99, "src": "low"}, + ] + mid_items = [ + {"id": 5, "src": "mid"}, + {"id": 98, "src": "mid"}, + {"id": 8, "src": "mid"}, + {"id": 9, "src": "mid"}, + ] + high_items = [ + {"id": 1, "src": "high"}, + {"id": 5, "src": "high"}, + {"id": 2, "src": "high"}, + {"id": 3, "src": "high"}, + ] + + methods_dict = { + "low": make_offset_paged_method(low_items, max_per_call=1), + "mid": make_offset_paged_method(mid_items, max_per_call=1), + "high": make_offset_paged_method(high_items, max_per_call=1), + } + + config = { + "merger_id": "dedup_priority_cursor", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "overfetch_factor": 1, + "max_refill_loops": 10, + "items": [ + {"priority": 100, "data": {"subfeed_id": "sf_high_p", "type": "subfeed", "method_name": "high"}}, + {"priority": 50, "data": {"subfeed_id": "sf_mid_p", "type": "subfeed", "method_name": "mid"}}, + {"priority": 0, "data": {"subfeed_id": "sf_low_p", "type": "subfeed", "method_name": "low"}}, + ], + } + + res_1 = await parse_model(MergerDeduplication, config).get_data( + methods_dict=methods_dict, + user_id="u", + limit=4, + next_page=FeedResultNextPage(data={}), + ) + + # Ensure id=5 is present and comes from highest priority, even though low/mid can surface it earlier. 
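+    # Expected flow with limit=4, overfetch_factor=1 and max_per_call=1:
+    #   loop 1: high -> id 1, mid -> id 5 (src "mid"), low -> id 5 (skipped, lower priority),
+    #   loop 2: high -> id 5 again and replaces the accepted "mid" entity in place,
+    #           then mid -> id 98 and low -> id 6 fill the remaining slots.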
+ winners = {x["id"]: x["src"] for x in res_1.data} + assert winners[5] == "high" + _assert_dedup_backend_state(res=res_1, merger_id="dedup_priority_cursor", state_backend="cursor") + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_priority_replacement_across_loops_redis_backend(redis_client) -> None: + low_items = [ + {"id": 5, "src": "low"}, + {"id": 6, "src": "low"}, + {"id": 7, "src": "low"}, + {"id": 99, "src": "low"}, + ] + mid_items = [ + {"id": 5, "src": "mid"}, + {"id": 98, "src": "mid"}, + {"id": 8, "src": "mid"}, + {"id": 9, "src": "mid"}, + ] + high_items = [ + {"id": 1, "src": "high"}, + {"id": 5, "src": "high"}, + {"id": 2, "src": "high"}, + {"id": 3, "src": "high"}, + ] + + methods_dict = { + "low": make_offset_paged_method(low_items, max_per_call=1), + "mid": make_offset_paged_method(mid_items, max_per_call=1), + "high": make_offset_paged_method(high_items, max_per_call=1), + } + + config = { + "merger_id": "dedup_priority_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 1, + "max_refill_loops": 10, + "items": [ + {"priority": 100, "data": {"subfeed_id": "sf_high_pr", "type": "subfeed", "method_name": "high"}}, + {"priority": 50, "data": {"subfeed_id": "sf_mid_pr", "type": "subfeed", "method_name": "mid"}}, + {"priority": 0, "data": {"subfeed_id": "sf_low_pr", "type": "subfeed", "method_name": "low"}}, + ], + } + + res_1 = await parse_model(MergerDeduplication, config).get_data( + methods_dict=methods_dict, + user_id="u", + limit=4, + next_page=FeedResultNextPage(data={}), + redis_client=redis_client, + custom_deduplication_key="priority", + ) + + winners = {x["id"]: x["src"] for x in res_1.data} + assert winners[5] == "high" + _assert_dedup_backend_state(res=res_1, merger_id="dedup_priority_redis", state_backend="redis") + + +@pytest.mark.asyncio +async def test_deduplication_merger_with_append_and_three_sources_cursor_backend() -> None: + # Inner MergerAppend (two subfeeds) + two extra subfeeds as separate dedup items. + a_items = [{"id": i, "src": "a"} for i in range(1, 30)] + b_items = [{"id": i, "src": "b"} for i in range(10, 40)] + c_items = [{"id": i, "src": "c"} for i in range(20, 60)] + d_items = [{"id": i, "src": "d"} for i in range(25, 70)] + + # Cap each subfeed to 1 item per call so dedup must invoke all children + # (and therefore exercise nested cursor propagation). 
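+    # e.g. the merger asks each child for up to remaining * overfetch_factor items (15 * 2 = 30 at first),
+    # but every underlying method still yields a single item per call.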
+ methods_dict = { + "a": make_offset_paged_method(a_items, max_per_call=1), + "b": make_offset_paged_method(b_items, max_per_call=1), + "c": make_offset_paged_method(c_items, max_per_call=1), + "d": make_offset_paged_method(d_items, max_per_call=1), + } + + append_config = { + "merger_id": "inner_append_unused", + "type": "merger_append", + "items": [ + {"subfeed_id": "sf_a_append", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b_append", "type": "subfeed", "method_name": "b"}, + ], + } + + config = { + "merger_id": "dedup_with_append_cursor", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": append_config}, + {"priority": 5, "data": {"subfeed_id": "sf_c", "type": "subfeed", "method_name": "c"}}, + {"priority": 0, "data": {"subfeed_id": "sf_d", "type": "subfeed", "method_name": "d"}}, + ], + } + + res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=15) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_with_append_cursor", state_backend="cursor") + + # Cursor correctness: descendant subfeed cursors exist and advance. + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_a_append", "sf_b_append", "sf_c", "sf_d"]) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_with_append_and_three_sources_redis_backend(redis_client) -> None: + a_items = [{"id": i, "src": "a"} for i in range(1, 30)] + b_items = [{"id": i, "src": "b"} for i in range(10, 40)] + c_items = [{"id": i, "src": "c"} for i in range(20, 60)] + d_items = [{"id": i, "src": "d"} for i in range(25, 70)] + + methods_dict = { + "a": make_offset_paged_method(a_items, max_per_call=1), + "b": make_offset_paged_method(b_items, max_per_call=1), + "c": make_offset_paged_method(c_items, max_per_call=1), + "d": make_offset_paged_method(d_items, max_per_call=1), + } + + append_config = { + "merger_id": "inner_append_unused_r", + "type": "merger_append", + "items": [ + {"subfeed_id": "sf_a_append_r", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b_append_r", "type": "subfeed", "method_name": "b"}, + ], + } + + config = { + "merger_id": "dedup_with_append_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": append_config}, + {"priority": 5, "data": {"subfeed_id": "sf_c_r", "type": "subfeed", "method_name": "c"}}, + {"priority": 0, "data": {"subfeed_id": "sf_d_r", "type": "subfeed", "method_name": "d"}}, + ], + } + + res_1, res_2 = await _run_two_pages( + config=config, + methods_dict=methods_dict, + user_id="u", + limit=15, + redis_client_instance=redis_client, + custom_deduplication_key="append", + ) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_with_append_redis", state_backend="redis") + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_a_append_r", "sf_b_append_r", "sf_c_r", "sf_d_r"]) + + +@pytest.mark.asyncio +async def test_deduplication_merger_with_percentage_cursor_backend() -> None: + a_items = [{"id": i, "src": "pa"} for i in range(1, 60)] + b_items = [{"id": i, "src": "pb"} for i in range(30, 90)] + c_items = [{"id": i, "src": "pc"} for i in range(40, 120)] + + methods_dict = { + "pa": 
make_offset_paged_method(a_items, max_per_call=1), + "pb": make_offset_paged_method(b_items, max_per_call=1), + "pc": make_offset_paged_method(c_items, max_per_call=1), + } + + percentage_config = { + "merger_id": "inner_percentage_unused", + "type": "merger_percentage", + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_pa", "type": "subfeed", "method_name": "pa"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_pb", "type": "subfeed", "method_name": "pb"}}, + ], + } + + config = { + "merger_id": "dedup_percentage_cursor", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "overfetch_factor": 2, + "items": [ + {"priority": 0, "data": percentage_config}, + {"priority": 10, "data": {"subfeed_id": "sf_pc", "type": "subfeed", "method_name": "pc"}}, + {"priority": 5, "data": {"subfeed_id": "sf_pd", "type": "subfeed", "method_name": "pa"}}, + ], + } + + res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=20) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_percentage_cursor", state_backend="cursor") + + for key in ("sf_pa", "sf_pb", "sf_pc"): + assert key in res_1.next_page.data + assert isinstance(res_1.next_page.data[key].after, int) + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_pa", "sf_pb", "sf_pc", "sf_pd"]) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_with_percentage_redis_backend(redis_client) -> None: + a_items = [{"id": i, "src": "pa"} for i in range(1, 60)] + b_items = [{"id": i, "src": "pb"} for i in range(30, 90)] + c_items = [{"id": i, "src": "pc"} for i in range(40, 120)] + + methods_dict = { + "pa": make_offset_paged_method(a_items, max_per_call=1), + "pb": make_offset_paged_method(b_items, max_per_call=1), + "pc": make_offset_paged_method(c_items, max_per_call=1), + } + + percentage_config = { + "merger_id": "inner_percentage_unused_r", + "type": "merger_percentage", + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_pa_r", "type": "subfeed", "method_name": "pa"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_pb_r", "type": "subfeed", "method_name": "pb"}}, + ], + } + + config = { + "merger_id": "dedup_percentage_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 2, + "items": [ + {"priority": 0, "data": percentage_config}, + {"priority": 10, "data": {"subfeed_id": "sf_pc_r", "type": "subfeed", "method_name": "pc"}}, + {"priority": 5, "data": {"subfeed_id": "sf_pd_r", "type": "subfeed", "method_name": "pa"}}, + ], + } + + res_1, res_2 = await _run_two_pages( + config=config, + methods_dict=methods_dict, + user_id="u", + limit=20, + redis_client_instance=redis_client, + custom_deduplication_key="percentage", + ) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_percentage_redis", state_backend="redis") + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_pa_r", "sf_pb_r", "sf_pc_r", "sf_pd_r"]) + + +@pytest.mark.asyncio +async def test_deduplication_merger_with_positional_cursor_backend() -> None: + # MergerPositional carries its own merger cursor; verify it survives nesting in dedup. 
+ pos_items = [{"id": i, "src": "pos"} for i in range(1, 100)] + def_items = [{"id": i, "src": "def"} for i in range(50, 140)] + extra_items = [{"id": i, "src": "extra"} for i in range(80, 180)] + + methods_dict = { + "pos": make_offset_paged_method(pos_items, max_per_call=1), + "def": make_offset_paged_method(def_items, max_per_call=1), + "extra": make_offset_paged_method(extra_items, max_per_call=1), + } + + positional_config = { + "merger_id": "inner_positional", + "type": "merger_positional", + "positions": [0, 2, 4, 6, 8], + "positional": {"subfeed_id": "sf_positional", "type": "subfeed", "method_name": "pos"}, + "default": {"subfeed_id": "sf_default", "type": "subfeed", "method_name": "def"}, + } + + config = { + "merger_id": "dedup_positional_cursor", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "overfetch_factor": 2, + "items": [ + # Positional must run; it owns its own merger cursor entry. + {"priority": 10, "data": positional_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra2", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=20) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_positional_cursor", state_backend="cursor") + assert "inner_positional" in res_1.next_page.data + assert "inner_positional" in res_2.next_page.data + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_positional", "sf_default", "sf_extra", "sf_extra2"]) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_with_positional_redis_backend(redis_client) -> None: + pos_items = [{"id": i, "src": "pos"} for i in range(1, 100)] + def_items = [{"id": i, "src": "def"} for i in range(50, 140)] + extra_items = [{"id": i, "src": "extra"} for i in range(80, 180)] + + methods_dict = { + "pos": make_offset_paged_method(pos_items, max_per_call=1), + "def": make_offset_paged_method(def_items, max_per_call=1), + "extra": make_offset_paged_method(extra_items, max_per_call=1), + } + + positional_config = { + "merger_id": "inner_positional_r", + "type": "merger_positional", + "positions": [0, 2, 4, 6, 8], + "positional": {"subfeed_id": "sf_positional_r", "type": "subfeed", "method_name": "pos"}, + "default": {"subfeed_id": "sf_default_r", "type": "subfeed", "method_name": "def"}, + } + + config = { + "merger_id": "dedup_positional_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": positional_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra_r", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra2_r", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages( + config=config, + methods_dict=methods_dict, + user_id="u", + limit=20, + redis_client_instance=redis_client, + custom_deduplication_key="positional", + ) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_positional_redis", state_backend="redis") + assert "inner_positional_r" in res_1.next_page.data + + _assert_cursor_monotonic_if_present( + res_1, + res_2, + ["sf_positional_r", "sf_default_r", "sf_extra_r", 
"sf_extra2_r"], + ) + + +@pytest.mark.asyncio +async def test_deduplication_merger_with_percentage_gradient_cursor_backend() -> None: + from_items = [{"id": i, "src": "from"} for i in range(1, 140)] + to_items = [{"id": i, "src": "to"} for i in range(60, 200)] + extra_items = [{"id": i, "src": "extra"} for i in range(120, 300)] + + methods_dict = { + "from": make_offset_paged_method(from_items, max_per_call=1), + "to": make_offset_paged_method(to_items, max_per_call=1), + "extra": make_offset_paged_method(extra_items, max_per_call=1), + } + + gradient_config = { + "merger_id": "inner_gradient", + "type": "merger_percentage_gradient", + "item_from": {"percentage": 80, "data": {"subfeed_id": "sf_from", "type": "subfeed", "method_name": "from"}}, + "item_to": {"percentage": 20, "data": {"subfeed_id": "sf_to", "type": "subfeed", "method_name": "to"}}, + "step": 10, + "size_to_step": 10, + } + + config = { + "merger_id": "dedup_gradient_cursor", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": gradient_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra_g", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra_g2", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=25) + _assert_two_pages_no_overlap(res_1, res_2) + assert "inner_gradient" in res_1.next_page.data + assert "inner_gradient" in res_2.next_page.data + _assert_dedup_backend_state(res=res_2, merger_id="dedup_gradient_cursor", state_backend="cursor") + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_from", "sf_to", "sf_extra_g", "sf_extra_g2"]) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_with_percentage_gradient_redis_backend(redis_client) -> None: + from_items = [{"id": i, "src": "from"} for i in range(1, 140)] + to_items = [{"id": i, "src": "to"} for i in range(60, 200)] + extra_items = [{"id": i, "src": "extra"} for i in range(120, 300)] + + methods_dict = { + "from": make_offset_paged_method(from_items, max_per_call=1), + "to": make_offset_paged_method(to_items, max_per_call=1), + "extra": make_offset_paged_method(extra_items, max_per_call=1), + } + + gradient_config = { + "merger_id": "inner_gradient_r", + "type": "merger_percentage_gradient", + "item_from": {"percentage": 80, "data": {"subfeed_id": "sf_from_r", "type": "subfeed", "method_name": "from"}}, + "item_to": {"percentage": 20, "data": {"subfeed_id": "sf_to_r", "type": "subfeed", "method_name": "to"}}, + "step": 10, + "size_to_step": 10, + } + + config = { + "merger_id": "dedup_gradient_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": gradient_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra_gr", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra_gr2", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages( + config=config, + methods_dict=methods_dict, + user_id="u", + limit=25, + redis_client_instance=redis_client, + custom_deduplication_key="gradient", + ) + _assert_two_pages_no_overlap(res_1, res_2) + assert "inner_gradient_r" in res_1.next_page.data + _assert_dedup_backend_state(res=res_2, 
merger_id="dedup_gradient_redis", state_backend="redis") + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_from_r", "sf_to_r", "sf_extra_gr", "sf_extra_gr2"]) + + +@pytest.mark.parametrize("state_backend", ["cursor", "redis"]) +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_with_view_session_child(state_backend, redis_client) -> None: + # MergerViewSession always requires Redis, so this test always uses redis_client. + # We still validate both dedup state backends. + base_items = [{"id": i, "src": "vs"} for i in range(1, 200)] + extra_items = [{"id": i, "src": "extra"} for i in range(50, 260)] + + methods_dict = { + "vs": make_offset_paged_method(base_items), + "extra": make_offset_paged_method(extra_items), + } + + view_session_config = { + "merger_id": "inner_view_session", + "type": "merger_view_session", + "session_size": 60, + "session_live_time": 60, + "data": {"subfeed_id": "sf_vs", "type": "subfeed", "method_name": "vs"}, + "deduplicate": True, + "dedup_key": "id", + } + + config = { + "merger_id": f"dedup_vs_{state_backend}", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": state_backend, + "state_ttl_seconds": 60, + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": view_session_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra_vs", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra_vs2", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages( + config=config, + methods_dict=methods_dict, + user_id="u", + limit=20, + redis_client_instance=redis_client, + custom_deduplication_key=f"vs_{state_backend}", + custom_view_session_key=f"vs_{state_backend}", + ) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id=f"dedup_vs_{state_backend}", state_backend=state_backend) + assert "inner_view_session" in res_1.next_page.data + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_vs", "sf_extra_vs", "sf_extra_vs2"]) + + +@pytest.mark.asyncio +async def test_deduplication_merger_with_append_distribute_cursor_backend() -> None: + # MergerAppendDistribute (type merger_distribute) + two extra subfeeds. 
+ s1 = [{"id": i, "src": "s1", "group": "g1" if i % 2 == 0 else "g2"} for i in range(1, 120)] + s2 = [{"id": i, "src": "s2", "group": "g2" if i % 3 == 0 else "g3"} for i in range(60, 200)] + extra = [{"id": i, "src": "extra", "group": "g9"} for i in range(100, 240)] + + methods_dict = { + "s1": make_offset_paged_method(s1, max_per_call=1), + "s2": make_offset_paged_method(s2, max_per_call=1), + "extra": make_offset_paged_method(extra, max_per_call=1), + } + + distribute_config = { + "merger_id": "inner_distribute_unused", + "type": "merger_distribute", + "distribution_key": "group", + "items": [ + {"subfeed_id": "sf_s1", "type": "subfeed", "method_name": "s1"}, + {"subfeed_id": "sf_s2", "type": "subfeed", "method_name": "s2"}, + ], + } + + config = { + "merger_id": "dedup_distribute_cursor", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": distribute_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra_dist", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra_dist2", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=25) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_distribute_cursor", state_backend="cursor") + for key in ("sf_s1", "sf_s2"): + assert key in res_1.next_page.data + + _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_s1", "sf_s2", "sf_extra_dist", "sf_extra_dist2"]) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_deduplication_merger_with_append_distribute_redis_backend(redis_client) -> None: + s1 = [{"id": i, "src": "s1", "group": "g1" if i % 2 == 0 else "g2"} for i in range(1, 120)] + s2 = [{"id": i, "src": "s2", "group": "g2" if i % 3 == 0 else "g3"} for i in range(60, 200)] + extra = [{"id": i, "src": "extra", "group": "g9"} for i in range(100, 240)] + + methods_dict = { + "s1": make_offset_paged_method(s1, max_per_call=1), + "s2": make_offset_paged_method(s2, max_per_call=1), + "extra": make_offset_paged_method(extra, max_per_call=1), + } + + distribute_config = { + "merger_id": "inner_distribute_unused_r", + "type": "merger_distribute", + "distribution_key": "group", + "items": [ + {"subfeed_id": "sf_s1_r", "type": "subfeed", "method_name": "s1"}, + {"subfeed_id": "sf_s2_r", "type": "subfeed", "method_name": "s2"}, + ], + } + + config = { + "merger_id": "dedup_distribute_redis", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "overfetch_factor": 2, + "items": [ + {"priority": 10, "data": distribute_config}, + {"priority": 5, "data": {"subfeed_id": "sf_extra_dist_r", "type": "subfeed", "method_name": "extra"}}, + {"priority": 0, "data": {"subfeed_id": "sf_extra_dist2_r", "type": "subfeed", "method_name": "extra"}}, + ], + } + + res_1, res_2 = await _run_two_pages( + config=config, + methods_dict=methods_dict, + user_id="u", + limit=25, + redis_client_instance=redis_client, + custom_deduplication_key="distribute", + ) + _assert_two_pages_no_overlap(res_1, res_2) + _assert_dedup_backend_state(res=res_2, merger_id="dedup_distribute_redis", state_backend="redis") + + _assert_cursor_monotonic_if_present( + res_1, + res_2, + ["sf_s1_r", "sf_s2_r", "sf_extra_dist_r", "sf_extra_dist2_r"], + ) From 
29bd63607de450b3073c594b5e6a51ed91ad88e7 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 21:12:06 +0000 Subject: [PATCH 04/33] dedup WIP. --- smartfeed/schemas.py | 635 ++++++++++----- tests/fixtures/configs.py | 37 +- tests/test_merger_deduplication.py | 1172 ++++++++++------------------ tests/test_parsing_config.py | 5 +- 4 files changed, 867 insertions(+), 982 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index c5e58bb..3320fb7 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -87,6 +87,10 @@ class BaseFeedConfigModel(ABC, BaseModel): Абстрактный класс для мерджера / субфида конфигурации. """ + # Higher value means the item should "win" deduplication when duplicates exist. + # This is primarily used by MergerDeduplication and by mergers when a dedup wrapper is active. + dedup_priority: int = 0 + @abstractmethod async def get_data( self, @@ -441,37 +445,60 @@ async def get_data( :return: список данных методом append. """ - # Формируем результат append мерджера. + # When a MergerDeduplication wrapper is active, we may need to respect dedup_priority + # across children without changing the append output order. In that mode we fetch in + # priority order, then concatenate in the configured order and trim to `limit`. + dedup_active = bool(params.pop("_sf_dedup_active", False)) + result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - result_limit = limit - for item in self.items: - # Получаем данные из позиции мерджера. - item_result = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=result_limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) + if dedup_active: + indexed_items = list(enumerate(self.items)) + fetch_order = sorted(indexed_items, key=lambda p: (getattr(p[1], "dedup_priority", 0), -p[0]), reverse=True) + fetched: Dict[int, FeedResult] = {} - # Добавляем данные позиции к общему результату процентного мерджера. - result.data.extend(item_result.data) + for idx, item in fetch_order: + fetched[idx] = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) - # Обновляем result_limit - result_limit -= len(item_result.data) + for idx, _item in indexed_items: + item_result = fetched[idx] + result.data.extend(item_result.data) + result.next_page.data.update(item_result.next_page.data) + if item_result.has_next_page: + result.has_next_page = True - # Если has_next_page = False, то проверяем has_next_page у позиции и, если необходимо, обновляем. - if not result.has_next_page and item_result.has_next_page: - result.has_next_page = True + if len(result.data) > limit: + result.data = result.data[:limit] + else: + result_limit = limit + for item in self.items: + item_result = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=result_limit, + next_page=next_page, + redis_client=redis_client, + **params, + ) - # Обновляем next_page. - result.next_page.data.update(item_result.next_page.data) + result.data.extend(item_result.data) + result_limit -= len(item_result.data) + + if not result.has_next_page and item_result.has_next_page: + result.has_next_page = True - # Если полученных данных хватает, то прерываем итерацию и возвращаем результат. - if result_limit <= 0: - break + result.next_page.data.update(item_result.next_page.data) + + if result_limit <= 0: + break # Если в конфигурации указано "смешать" данные. 
if self.shuffle: @@ -537,37 +564,14 @@ async def get_data( :return: список данных в процентном соотношении. """ - # Получаем данные "default". - default_res = await self.default.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) + dedup_active = bool(params.pop("_sf_dedup_active", False)) - # Формируем результат позиционного мерджера. - result = FeedResult( - data=default_res.data, - next_page=FeedResultNextPage( - data={ - self.merger_id: FeedResultNextPageInside( - page=next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1, - after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, - ) - }, - ), - has_next_page=default_res.has_next_page, - ) + # Determine the merger page first (independent of children). + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - # Получаем список позиций с учетом текущей страницы. positional_has_next_page = True - page_positions = [] - available_positions = range( - (result.next_page.data[self.merger_id].page - 1) * limit, - (result.next_page.data[self.merger_id].page * limit) + 1, - ) + page_positions: List[int] = [] + available_positions = range((page - 1) * limit, (page * limit) + 1) for position in self.positions: if position in available_positions: page_positions.append(available_positions.index(position)) @@ -584,14 +588,59 @@ async def get_data( if position in available_positions: page_positions.append(available_positions.index(position)) - # Получаем данные "positional". - pos_res = await self.positional.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=len(page_positions), - next_page=next_page, - redis_client=redis_client, - **params, + default_res: FeedResult + pos_res: FeedResult + + if dedup_active and getattr(self.positional, "dedup_priority", 0) > getattr(self.default, "dedup_priority", 0): + pos_res = await self.positional.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=len(page_positions), + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + default_res = await self.default.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + else: + default_res = await self.default.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) + pos_res = await self.positional.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=len(page_positions), + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) + + result = FeedResult( + data=default_res.data, + next_page=FeedResultNextPage( + data={ + self.merger_id: FeedResultNextPageInside( + page=page, + after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, + ) + }, + ), + has_next_page=default_res.has_next_page, ) # Если has_next_page = False, то проверяем has_next_page у позиции и, если необходимо, обновляем. @@ -706,26 +755,39 @@ async def get_data( # Формируем результат процентного мерджера. result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - items_data: List = [] - for item in self.items: - # Получаем данные из позиций процентного мерджера. 
+ dedup_active = bool(params.pop("_sf_dedup_active", False)) + + items_data: List = [None] * len(self.items) + results: List[Optional[FeedResult]] = [None] * len(self.items) + + indexed_items = list(enumerate(self.items)) + fetch_order = indexed_items + if dedup_active: + fetch_order = sorted( + indexed_items, + key=lambda p: (getattr(p[1].data, "dedup_priority", 0), -p[0]), + reverse=True, + ) + + for idx, item in fetch_order: item_result = await item.data.get_data( methods_dict=methods_dict, user_id=user_id, limit=limit * item.percentage // 100, next_page=next_page, redis_client=redis_client, + _sf_dedup_active=dedup_active, **params, ) - # Добавляем данные позиции в список данных позиций. - items_data.append(item_result.data) + results[idx] = item_result + + for idx, item_result in enumerate(results): + assert item_result is not None + items_data[idx] = item_result.data - # Если has_next_page = False, то проверяем has_next_page у позиции и, если необходимо, обновляем. if not result.has_next_page and item_result.has_next_page: result.has_next_page = True - - # Обновляем next_page. result.next_page.data.update(item_result.next_page.data) # Добавляем данные позиции к общему результату процентного мерджера. @@ -866,23 +928,49 @@ async def get_data( limit=limit, ) - # Получаем данные из позиций в процентном соотношений. - item_from = await self.item_from.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_from"], - next_page=next_page, - redis_client=redis_client, - **params, - ) - item_to = await self.item_to.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_to"], - next_page=next_page, - redis_client=redis_client, - **params, - ) + dedup_active = bool(params.pop("_sf_dedup_active", False)) + + from_priority = getattr(self.item_from.data, "dedup_priority", 0) + to_priority = getattr(self.item_to.data, "dedup_priority", 0) + + if dedup_active and to_priority > from_priority: + item_to = await self.item_to.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_to"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + item_from = await self.item_from.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_from"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + else: + item_from = await self.item_from.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_from"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) + item_to = await self.item_to.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_to"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) from_start_index = 0 to_start_index = 0 @@ -984,62 +1072,75 @@ async def get_data( :return: список данных методом append. """ - # Формируем результат append мерджера. + dedup_active = bool(params.pop("_sf_dedup_active", False)) + result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - result_limit = limit - for item in self.items: - # Получаем данные из позиции мерджера. 
- item_result = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=result_limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) + if dedup_active: + indexed_items = list(enumerate(self.items)) + fetch_order = sorted(indexed_items, key=lambda p: (getattr(p[1], "dedup_priority", 0), -p[0]), reverse=True) + fetched: Dict[int, FeedResult] = {} - # Добавляем данные позиции к общему результату процентного мерджера. - result.data.extend(item_result.data) + for idx, item in fetch_order: + fetched[idx] = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) - # Обновляем result_limit - result_limit -= len(item_result.data) + for idx, _item in indexed_items: + item_result = fetched[idx] + result.data.extend(item_result.data) + result.next_page.data.update(item_result.next_page.data) + if item_result.has_next_page: + result.has_next_page = True - # Если has_next_page = False, то проверяем has_next_page у позиции и, если необходимо, обновляем. - if not result.has_next_page and item_result.has_next_page: - result.has_next_page = True + if len(result.data) > limit: + result.data = result.data[:limit] + else: + result_limit = limit + for item in self.items: + item_result = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=result_limit, + next_page=next_page, + redis_client=redis_client, + **params, + ) - # Обновляем next_page. - result.next_page.data.update(item_result.next_page.data) + result.data.extend(item_result.data) + result_limit -= len(item_result.data) + + if not result.has_next_page and item_result.has_next_page: + result.has_next_page = True + + result.next_page.data.update(item_result.next_page.data) - # Если полученных данных хватает, то прерываем итерацию и возвращаем результат. - if result_limit <= 0: - break + if result_limit <= 0: + break # Распределяем данные равномерно по ключу. result.data = await self._uniform_distribute(result.data) return result -class MergerDeduplicationItem(BaseModel): - """Configuration item for MergerDeduplication.""" - - priority: int = 0 - data: FeedTypes - - class MergerDeduplication(BaseFeedConfigModel): - """Merger that deduplicates items and refills to the requested limit. + """Merger that deduplicates while preserving child mixing/position semantics. - Key properties: - - Always tries to return exactly `limit` unique items if they exist upstream. - - Supports cross-page deduplication using either cursor state or Redis. - - Supports explicit per-source priority; higher priority wins on same dedup key. + This merger acts as a wrapper around exactly one child feed node. + Deduplication is applied at the leaf SubFeed method level with a shared seen-set. + This lets nested mergers (positional/percentage/gradient/etc.) keep their slot rules: + duplicates are skipped by fetching additional items from the *same* leaf source. 
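+
+ A minimal illustrative config (field names as declared on this model; the ids and the
+ child subfeed are invented for this example):
+
+     {
+         "merger_id": "my_dedup",
+         "type": "merger_deduplication",
+         "dedup_key": "id",
+         "state_backend": "cursor",
+         "data": {"subfeed_id": "my_subfeed", "type": "subfeed", "method_name": "my_method"},
+     }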
""" merger_id: str type: Literal["merger_deduplication"] - items: List[MergerDeduplicationItem] + data: FeedTypes dedup_key: Optional[str] = None missing_key_policy: Literal["error", "keep", "drop"] = "error" @@ -1049,9 +1150,18 @@ class MergerDeduplication(BaseFeedConfigModel): cursor_compress: bool = True cursor_max_keys: Optional[int] = None - overfetch_factor: int = 2 + overfetch_factor: int = 1 + max_refill_loops: int = 20 + @model_validator(mode="after") + def validate_merger_deduplication(self) -> "MergerDeduplication": + if self.overfetch_factor < 1: + raise ValueError('"overfetch_factor" must be >= 1') + if self.max_refill_loops < 1: + raise ValueError('"max_refill_loops" must be >= 1') + return self + def _collect_descendant_cursor_keys(self, feed: BaseFeedConfigModel) -> set[str]: keys: set[str] = set() @@ -1111,53 +1221,68 @@ def _extract_dedup_value(self, item: Any) -> Any: ) return value - def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> List[str]: + def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> Dict[str, int]: entry = next_page.data.get(self.merger_id) if not entry or entry.after is None: - return [] + return {} after = entry.after if isinstance(after, dict) and "z" in after: payload = base64.urlsafe_b64decode(after["z"].encode()) raw = zlib.decompress(payload).decode() - return list(json.loads(raw)) + decoded = json.loads(raw) + if isinstance(decoded, dict): + return {str(k): int(v) for k, v in decoded.items()} + if isinstance(decoded, list): + # v2: list of [key, priority] entries + seen_map: Dict[str, int] = {} + for entry_item in decoded: + if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: + seen_map[str(entry_item[0])] = int(entry_item[1]) + else: + seen_map[str(entry_item)] = 0 + return seen_map + return {} if isinstance(after, dict) and "seen" in after: - return list(after["seen"]) + return {str(k): 0 for k in list(after["seen"])} if isinstance(after, list): - return list(after) - return [] + return {str(k): 0 for k in list(after)} + if isinstance(after, dict): + # v2 uncompressed map + return {str(k): int(v) for k, v in after.items() if k not in {"v", "c", "n"}} + return {} - def _encode_seen_for_cursor(self, seen_keys_in_order: List[str]) -> Any: + def _encode_seen_for_cursor(self, seen_updates_in_order: List[tuple[str, int]]) -> Any: if self.cursor_max_keys is not None: - seen_keys_in_order = seen_keys_in_order[-self.cursor_max_keys :] + seen_updates_in_order = seen_updates_in_order[-self.cursor_max_keys :] if not self.cursor_compress: - return {"v": 1, "seen": seen_keys_in_order} + return {"v": 2, "seen": [[k, p] for k, p in seen_updates_in_order]} - raw = json.dumps(seen_keys_in_order).encode() + raw = json.dumps([[k, p] for k, p in seen_updates_in_order]).encode() compressed = zlib.compress(raw) return { - "v": 1, + "v": 2, "c": "zlib+base64", - "n": len(seen_keys_in_order), + "n": len(seen_updates_in_order), "z": base64.urlsafe_b64encode(compressed).decode(), } - async def _redis_sismember(self, redis_client: Union[redis.Redis, AsyncRedis], key: str, member: str) -> bool: - res = redis_client.sismember(key, member) + async def _redis_zscore(self, redis_client: Union[redis.Redis, AsyncRedis], key: str, member: str) -> Optional[float]: + res = redis_client.zscore(key, member) if inspect.iscoroutine(res): res = await res - return bool(res) + return None if res is None else float(res) - async def _redis_sadd_and_expire( + async def _redis_zadd_and_expire( self, redis_client: Union[redis.Redis, AsyncRedis], 
key: str, - members: List[str], + member_scores: Dict[str, int], ) -> None: - if not members: + if not member_scores: return - res = redis_client.sadd(key, *members) + res = redis_client.zadd(key, mapping={m: float(s) for m, s in member_scores.items()}) if inspect.iscoroutine(res): await res await redis_client.expire(key, self.state_ttl_seconds) @@ -1197,18 +1322,18 @@ async def get_data( if is_fresh_session: # Reset cursors for all descendants under this merger so upstream nodes also restart. - descendant_keys: set[str] = set() - for item in self.items: - descendant_keys.update(self._collect_descendant_cursor_keys(item.data)) + descendant_keys = self._collect_descendant_cursor_keys(self.data) for key in descendant_keys: working_next_page.data.pop(key, None) - sorted_items = sorted(self.items, key=lambda x: x.priority, reverse=True) - seen_keys_in_order: List[str] = [] - seen_cursor_set: set[str] = set() + # Shared dedup state (cross-page) + seen_priority_map: Dict[str, int] = {} + seen_updates_in_order: List[tuple[str, int]] = [] if self.state_backend == "cursor" and not is_fresh_session: - seen_keys_in_order = self._decode_seen_from_cursor(next_page) - seen_cursor_set = set(seen_keys_in_order) + seen_priority_map = self._decode_seen_from_cursor(next_page) + + # Always maintain a per-request seen set to prevent duplicates within a single get_data() call. + seen_request_set: set[str] = set(seen_priority_map.keys()) redis_state_key = "" if self.state_backend == "redis" and redis_client: @@ -1219,92 +1344,175 @@ async def get_data( if inspect.iscoroutine(deleted): await deleted - result_items: List[Any] = [] - accepted: Dict[str, Dict[str, Any]] = {} - redis_new_members: List[str] = [] - any_has_next_page = False + redis_new_scores: Dict[str, int] = {} - loops = 0 - while len(result_items) < limit and loops < self.max_refill_loops: - loops += 1 - before_len = len(result_items) + # Preserve inner merger ordering/mixing semantics by deduplicating at the leaf method level + # with a shared seen-set. + original_methods_dict = methods_dict - for item in sorted_items: - remaining = limit - len(result_items) - if remaining <= 0: - break + # Create a deep copy of the child tree and rewrite each SubFeed to call a unique wrapper + # so we can associate a dedup_priority with each leaf. + child = self.data + if hasattr(child, "model_copy"): + child = child.model_copy(deep=True) # type: ignore[attr-defined] + else: + child = child.copy(deep=True) - request_limit = max(1, remaining * max(1, self.overfetch_factor)) - item_result = await item.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=request_limit, - next_page=working_next_page, - redis_client=redis_client, - **params, - ) + def iter_subfeeds(feed: BaseFeedConfigModel) -> List["SubFeed"]: + found: List[SubFeed] = [] - any_has_next_page = any_has_next_page or item_result.has_next_page - working_next_page.data.update(item_result.next_page.data) + if isinstance(feed, SubFeed): + found.append(feed) + return found - for entity in item_result.data: - raw_value = self._extract_dedup_value(entity) - if raw_value is None: - if self.missing_key_policy == "drop": - continue - if self.missing_key_policy == "keep": - # Make a unique key per object instance representation. 
- raw_value = ("__missing__", id(entity)) + for attr_name in ("data", "positional", "default"): + inner = getattr(feed, attr_name, None) + if isinstance(inner, BaseFeedConfigModel): + found.extend(iter_subfeeds(inner)) - key = self._normalize_key(raw_value) + for attr_name in ("item_from", "item_to"): + wrapper = getattr(feed, attr_name, None) + inner = getattr(wrapper, "data", None) + if isinstance(inner, BaseFeedConfigModel): + found.extend(iter_subfeeds(inner)) - if key in accepted: - if item.priority > accepted[key]["priority"]: - result_items[accepted[key]["index"]] = entity - accepted[key]["priority"] = item.priority + items = getattr(feed, "items", None) + if isinstance(items, list): + for item in items: + if isinstance(item, BaseFeedConfigModel): + found.extend(iter_subfeeds(item)) continue - - if self.state_backend == "cursor": - if key in seen_cursor_set: + inner = getattr(item, "data", None) + if isinstance(inner, BaseFeedConfigModel): + found.extend(iter_subfeeds(inner)) + + return found + + rewritten_methods_dict = dict(original_methods_dict) + + def wrap_leaf_method(*, subfeed: "SubFeed") -> None: + original_name = subfeed.method_name + original_method = original_methods_dict[original_name] + unique_name = f"__dedup__{self.merger_id}__{subfeed.subfeed_id}" + # Idempotency: if the same subfeed id appears multiple times, don't re-wrap. + if unique_name in rewritten_methods_dict: + subfeed.method_name = unique_name + return + subfeed.method_name = unique_name + leaf_priority = int(getattr(subfeed, "dedup_priority", 0)) + + async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPageInside, **kw: Any): + collected: List[Any] = [] + local_seen: set[str] = set() + any_has_next_page = False + + loops = 0 + while len(collected) < limit and loops < self.max_refill_loops: + loops += 1 + before_len = len(collected) + + remaining = limit - len(collected) + # Safe oversampling: only when we can rewind integer-offset cursors. 
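+ # Opaque (non-integer) "after" values cannot be rewound reliably, so for them we request
+ # exactly the remaining amount and lean on the refill loop instead.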
+ can_overfetch = isinstance(next_page.after, (int, type(None))) + request_limit = max(1, remaining) + if can_overfetch and self.overfetch_factor > 1: + request_limit = max(1, remaining * self.overfetch_factor) + + start_after = 0 if next_page.after is None else int(next_page.after) + + method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) + if not isinstance(method_result, FeedResultClient): + raise TypeError('SubFeed function must return "FeedResultClient" instance.') + + any_has_next_page = any_has_next_page or method_result.has_next_page + + consumed_in_batch = 0 + + for entity in method_result.data: + consumed_in_batch += 1 + raw_value = self._extract_dedup_value(entity) + if raw_value is None: + if self.missing_key_policy == "drop": + continue + if self.missing_key_policy == "keep": + raw_value = ("__missing__", id(entity)) + + key = self._normalize_key(raw_value) + if key in local_seen: continue - else: - assert redis_client is not None - if await self._redis_sismember(redis_client, redis_state_key, key): + + if key in seen_request_set: continue - accepted[key] = {"priority": item.priority, "index": len(result_items)} - result_items.append(entity) + if self.state_backend == "cursor": + existing_priority = seen_priority_map.get(key) + if existing_priority is not None and leaf_priority <= existing_priority: + continue + else: + assert redis_client is not None + existing_score = await self._redis_zscore(redis_client, redis_state_key, key) + if existing_score is not None and leaf_priority <= int(existing_score): + continue - if self.state_backend == "cursor": - seen_cursor_set.add(key) - seen_keys_in_order.append(key) - else: - redis_new_members.append(key) + local_seen.add(key) + collected.append(entity) - if len(result_items) >= limit: - break + seen_request_set.add(key) - if len(result_items) >= limit: - break + if self.state_backend == "cursor": + seen_priority_map[key] = leaf_priority + seen_updates_in_order.append((key, leaf_priority)) + else: + redis_new_scores[key] = max(redis_new_scores.get(key, 0), leaf_priority) + + if len(collected) >= limit: + break + + if len(collected) == before_len: + # No progress this loop. Stop if upstream is exhausted. + if not method_result.has_next_page: + break + + # If we oversampled with a simple integer cursor, rewind to the point we actually consumed. + # This prevents skipping un-inspected items that were fetched but not needed. 
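+ # Illustrative numbers: start_after=0, request_limit=12, loop stops after inspecting 4 items
+ # -> the cursor is reset to 4 rather than left at 12.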
+ if can_overfetch and request_limit > remaining: + end_after = next_page.after + if isinstance(end_after, int) and end_after == start_after + len(method_result.data): + next_page.after = start_after + consumed_in_batch + + return FeedResultClient(data=collected, next_page=next_page, has_next_page=any_has_next_page) + + setattr(_wrapped_method, "_smartfeed_original", original_method) + rewritten_methods_dict[unique_name] = _wrapped_method - if len(result_items) == before_len: - break + for sf in iter_subfeeds(child): + wrap_leaf_method(subfeed=sf) + + child_result = await child.get_data( + methods_dict=rewritten_methods_dict, + user_id=user_id, + limit=limit, + next_page=working_next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) if self.state_backend == "redis" and redis_client: - await self._redis_sadd_and_expire(redis_client, redis_state_key, redis_new_members) + await self._redis_zadd_and_expire(redis_client, redis_state_key, redis_new_scores) page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 merger_after: Any = None if self.state_backend == "cursor": - merger_after = self._encode_seen_for_cursor(seen_keys_in_order) + merger_after = self._encode_seen_for_cursor(seen_updates_in_order) - if hasattr(working_next_page, "model_copy"): - result_next_page = working_next_page.model_copy(deep=True) # type: ignore[attr-defined] + if hasattr(child_result.next_page, "model_copy"): + result_next_page = child_result.next_page.model_copy(deep=True) # type: ignore[attr-defined] else: - result_next_page = working_next_page.copy(deep=True) + result_next_page = child_result.next_page.copy(deep=True) result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) - return FeedResult(data=result_items, next_page=result_next_page, has_next_page=any_has_next_page) + return FeedResult(data=child_result.data, next_page=result_next_page, has_next_page=child_result.has_next_page) class SubFeed(BaseFeedConfigModel): @@ -1354,7 +1562,9 @@ async def get_data( ) # Формируем params для функции субфида. 
- method_args = inspect.getfullargspec(methods_dict[self.method_name]).args + method = methods_dict[self.method_name] + method_spec = getattr(method, "_smartfeed_original", method) + method_args = inspect.getfullargspec(method_spec).args method_params: Dict[str, Any] = {} for arg in method_args: if arg in params: @@ -1426,5 +1636,4 @@ def _rebuild_model(model: Any) -> None: _rebuild_model(MergerAppendDistribute) _rebuild_model(MergerPercentageGradient) _rebuild_model(MergerViewSession) -_rebuild_model(MergerDeduplicationItem) _rebuild_model(MergerDeduplication) \ No newline at end of file diff --git a/tests/fixtures/configs.py b/tests/fixtures/configs.py index a982e3d..6aff5cb 100644 --- a/tests/fixtures/configs.py +++ b/tests/fixtures/configs.py @@ -96,23 +96,28 @@ "dedup_key": "id", "state_backend": "cursor", "cursor_compress": True, - "items": [ - { - "priority": 100, - "data": { - "subfeed_id": "subfeed_dedup_priority_high", - "type": "subfeed", - "method_name": "posted", + "data": { + "merger_id": "merger_percentage_inside_dedup_parsing_example", + "type": "merger_percentage", + "shuffle": False, + "items": [ + { + "percentage": 50, + "data": { + "subfeed_id": "subfeed_dedup_a", + "type": "subfeed", + "method_name": "posted", + }, }, - }, - { - "priority": 0, - "data": { - "subfeed_id": "subfeed_dedup_priority_low", - "type": "subfeed", - "method_name": "posted", + { + "percentage": 50, + "data": { + "subfeed_id": "subfeed_dedup_b", + "type": "subfeed", + "method_name": "posted", + }, }, - }, - ], + ], + }, }, } diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index cede8f3..5b046a8 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -3,10 +3,10 @@ import pytest from smartfeed.schemas import ( - MergerDeduplication, FeedResultClient, FeedResultNextPage, FeedResultNextPageInside, + MergerDeduplication, ) from tests.fixtures.redis import redis_client # noqa: F401 @@ -14,7 +14,7 @@ def make_offset_paged_method(items, *, max_per_call=None): - async def _method(user_id, limit, next_page): # pylint: disable=unused-argument + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument offset = int(next_page.after or 0) effective_limit = limit if isinstance(max_per_call, int) and max_per_call > 0: @@ -28,63 +28,7 @@ async def _method(user_id, limit, next_page): # pylint: disable=unused-argument return _method -async def _run_two_pages( - *, - config, - methods_dict, - user_id, - limit, - redis_client_instance=None, - **params, -): - merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=FeedResultNextPage(data={}), - redis_client=redis_client_instance, - **params, - ) - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=res_1.next_page, - redis_client=redis_client_instance, - **params, - ) - return res_1, res_2 - - -def _assert_dedup_backend_state(*, res, merger_id: str, state_backend: str) -> None: - assert merger_id in res.next_page.data - if state_backend == "cursor": - assert isinstance(res.next_page.data[merger_id].after, dict) - else: - assert res.next_page.data[merger_id].after is None - - -def _ids(data): - return [x["id"] for x in data] - - -def _assert_two_pages_no_overlap(res_1, res_2): - ids_1 = set(_ids(res_1.data)) - ids_2 = set(_ids(res_2.data)) - assert len(ids_1) == len(res_1.data) - assert len(ids_2) == 
len(res_2.data) - assert not (ids_1 & ids_2) - - def _assert_cursor_monotonic_if_present(res_1, res_2, keys): - """Assert that cursor values monotonically advance for keys that are present. - - MergerDeduplication may stop early once it has enough unique items, so a - descendant might not be called on a given page. This helper only asserts - monotonicity when the cursor key exists in `res_1`. - """ - for key in keys: if key not in res_1.next_page.data: continue @@ -101,61 +45,67 @@ def _assert_cursor_monotonic_if_present(res_1, res_2, keys): assert after_2 >= after_1 continue - # Merger cursors can be structured (dict), just require presence. if isinstance(after_1, dict) and isinstance(after_2, dict): continue - # If values are comparable, enforce monotonicity; otherwise don't fail. try: assert after_2 >= after_1 except TypeError: pass +def _sources(data): + return [x.get("src") for x in data] + + +def _ids(data): + return [x.get("id") for x in data] + + +def _assert_no_dupes_in_page(data): + ids = _ids(data) + assert len(ids) == len(set(ids)) + + +def _assert_pages_no_overlap(res_1, res_2): + assert not (set(_ids(res_1.data)) & set(_ids(res_2.data))) + + @pytest.mark.asyncio -async def test_deduplication_merger_cursor_priority_and_cross_page() -> None: - low_items = [ - {"id": 1, "src": "low"}, - {"id": 2, "src": "low"}, - {"id": 3, "src": "low"}, - {"id": 4, "src": "low"}, - {"id": 5, "src": "low"}, - # repeats later (cross-page duplicates) - {"id": 3, "src": "low"}, - {"id": 4, "src": "low"}, - {"id": 6, "src": "low"}, - {"id": 7, "src": "low"}, - {"id": 8, "src": "low"}, - {"id": 9, "src": "low"}, - {"id": 10, "src": "low"}, - ] - high_items = [ - {"id": 3, "src": "high"}, - {"id": 4, "src": "high"}, - ] +async def test_dedup_positional_slot_ownership_cursor_backend() -> None: + """Positional slots must remain owned by the positional branch. + + Deduplication must not drop items *after* the positional merge (which would shift indices). + Instead, duplicates must be skipped inside the leaf source that owns the slot. + """ + + # Default branch has early ids 1..3, which will be seen first. + default_items = [{"id": i, "src": "default"} for i in range(1, 300)] + + # Positional branch starts with duplicates 1..3; it must skip them and fetch 4.. instead. + positional_items = [{"id": i, "src": "pos"} for i in range(1, 300)] methods_dict = { - "low": make_offset_paged_method(low_items), - "high": make_offset_paged_method(high_items), + "default": make_offset_paged_method(default_items), + "pos": make_offset_paged_method(positional_items), } config = { - "merger_id": "dedup_example", + "merger_id": "dedup_wrapper", "type": "merger_deduplication", "dedup_key": "id", "state_backend": "cursor", "cursor_compress": True, - "overfetch_factor": 3, - "items": [ - { - "priority": 100, - "data": {"subfeed_id": "sf_high", "type": "subfeed", "method_name": "high"}, - }, - { - "priority": 0, - "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low"}, - }, - ], + "max_refill_loops": 20, + "data": { + "merger_id": "positional_mix", + "type": "merger_positional", + # Ensure positional inserts exist on both pages for limit=6: + # page1 uses (1,3,5), page2 uses (7,9,11) which map to the same in-page slots. 
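+ # (with limit=6 the merger's page 1 window covers positions 0..6 and page 2 covers 6..12,
+ # so 7/9/11 fall on the same relative offsets within the page as 1/3/5 did)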
+ "positions": [1, 3, 5, 7, 9, 11], + "positional": {"subfeed_id": "sf_pos", "type": "subfeed", "method_name": "pos"}, + "default": {"subfeed_id": "sf_default", "type": "subfeed", "method_name": "default"}, + }, } merger = parse_model(MergerDeduplication, config) @@ -163,99 +113,68 @@ async def test_deduplication_merger_cursor_priority_and_cross_page() -> None: res_1 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=5, + limit=6, next_page=FeedResultNextPage(data={}), ) - assert len(res_1.data) == 5 - ids_1 = [x["id"] for x in res_1.data] - assert len(ids_1) == len(set(ids_1)) - # Priority: id 3 and 4 must come from high - for x in res_1.data: - if x["id"] in {3, 4}: - assert x["src"] == "high" + assert len(res_1.data) == 6 + _assert_no_dupes_in_page(res_1.data) - # Next page should not repeat 3/4 even though low repeats them later. + # Slot ownership: configured positions [1,3,5] are the positional branch. + assert _sources(res_1.data)[0] == "pos" + assert _sources(res_1.data)[2] == "pos" + assert _sources(res_1.data)[4] == "pos" + + # Next page: still no overlap across pages, and positional slots remain owned. res_2 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=5, + limit=6, next_page=res_1.next_page, ) - ids_2 = [x["id"] for x in res_2.data] - assert not (set(ids_1) & set(ids_2)) - - # Ensure merger stores cursor state (compressed) in its own after. - assert "dedup_example" in res_2.next_page.data - assert isinstance(res_2.next_page.data["dedup_example"].after, dict) - assert "z" in res_2.next_page.data["dedup_example"].after - - -@pytest.mark.asyncio -async def test_deduplication_merger_refill_to_limit() -> None: - dup_items = [ - {"id": 1}, - {"id": 1}, - {"id": 1}, - {"id": 1}, - {"id": 1}, - {"id": 2}, - {"id": 3}, - {"id": 4}, - {"id": 5}, - {"id": 6}, - ] + assert len(res_2.data) == 6 + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) - methods_dict = { - "dups": make_offset_paged_method(dup_items), - } + assert _sources(res_2.data)[0] == "pos" + assert _sources(res_2.data)[2] == "pos" + assert _sources(res_2.data)[4] == "pos" - config = { - "merger_id": "dedup_refill", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "overfetch_factor": 4, - "max_refill_loops": 10, - "items": [ - { - "priority": 0, - "data": {"subfeed_id": "sf_dups", "type": "subfeed", "method_name": "dups"}, - } - ], - } + _assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_pos", "sf_default", "positional_mix", "dedup_wrapper"]) - merger = parse_model(MergerDeduplication, config) - res = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=5, - next_page=FeedResultNextPage(data={}), - ) +@pytest.mark.asyncio +async def test_dedup_percentage_slot_ownership_cursor_backend() -> None: + """Percentage mixing order must be preserved even with duplicates across sources.""" - assert [x["id"] for x in res.data] == [1, 2, 3, 4, 5] + # A is called first by the percentage merger; its ids will be seen before B. + a_items = [{"id": i, "src": "A"} for i in range(1, 300)] + # B starts with duplicates 1..3; it must skip them and fetch unique tail items. + # Same IDs as A to force cross-source duplicates. 
+ b_items = [{"id": i, "src": "B"} for i in range(1, 300)] -@pytest.mark.asyncio -async def test_deduplication_merger_page_zero_resets_cursor_state() -> None: - items = [{"id": i} for i in range(1, 50)] - methods_dict = {"stream": make_offset_paged_method(items)} + methods_dict = { + "a": make_offset_paged_method(a_items), + "b": make_offset_paged_method(b_items), + } config = { - "merger_id": "dedup_reset", + "merger_id": "dedup_wrapper_pct", "type": "merger_deduplication", "dedup_key": "id", "state_backend": "cursor", "cursor_compress": True, - "overfetch_factor": 2, - "items": [ - { - "priority": 0, - "data": {"subfeed_id": "sf_stream", "type": "subfeed", "method_name": "stream"}, - } - ], + "data": { + "merger_id": "pct_mix", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, + ], + }, } merger = parse_model(MergerDeduplication, config) @@ -263,61 +182,76 @@ async def test_deduplication_merger_page_zero_resets_cursor_state() -> None: res_1 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=5, + limit=10, next_page=FeedResultNextPage(data={}), ) - assert [x["id"] for x in res_1.data] == [1, 2, 3, 4, 5] - # Simulate a full reload: page 0 requested again. Even if the client mistakenly - # keeps the previous cursor payloads (including subfeed cursors), we start a new session. + assert len(res_1.data) == 10 + _assert_no_dupes_in_page(res_1.data) + + # Slot ownership: percentage merge alternates when list sizes are equal. + sources_1 = _sources(res_1.data) + assert sources_1[0] == "A" + assert sources_1[1] == "B" + assert sources_1[2] == "A" + assert sources_1[3] == "B" + res_2 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=5, - next_page=FeedResultNextPage( - data={ - "dedup_reset": FeedResultNextPageInside(page=0, after=res_1.next_page.data["dedup_reset"].after), - "sf_stream": res_1.next_page.data["sf_stream"], - } - ), + limit=10, + next_page=res_1.next_page, ) - assert [x["id"] for x in res_2.data] == [1, 2, 3, 4, 5] + assert len(res_2.data) == 10 + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + + sources_2 = _sources(res_2.data) + assert sources_2[0] == "A" + assert sources_2[1] == "B" + + _assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_a", "sf_b", "pct_mix", "dedup_wrapper_pct"]) -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio -async def test_deduplication_merger_redis_backend(redis_client) -> None: - # This dataset repeats ids across pages (sliding window style) - items = [ - {"id": 1}, - {"id": 2}, - {"id": 3}, - {"id": 2}, - {"id": 3}, - {"id": 4}, - {"id": 5}, - {"id": 6}, - {"id": 4}, - {"id": 7}, - {"id": 8}, - ] +async def test_dedup_deep_tree_cursor_backend() -> None: + """Dedup must work through deep merger trees (wrapping leaf methods).""" - methods_dict = {"stream": make_offset_paged_method(items)} + # Leaf sources: intentionally overlapping ids across different leaves. 
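+ # P and D1 deliberately share ids 1..29, while D2 uses the disjoint range 101..129.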
+ p_items = [{"id": i, "src": "P"} for i in range(1, 30)] + d1_items = [{"id": i, "src": "D1"} for i in range(1, 30)] # overlaps P + d2_items = [{"id": 100 + i, "src": "D2"} for i in range(1, 30)] + methods_dict = { + "p": make_offset_paged_method(p_items), + "d1": make_offset_paged_method(d1_items), + "d2": make_offset_paged_method(d2_items), + } + + # Deep tree: Dedup -> Positional(default=Percentage(D1,D2), positional=SubFeed(P)) config = { - "merger_id": "dedup_redis", + "merger_id": "dedup_deep", "type": "merger_deduplication", "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "overfetch_factor": 4, - "items": [ - { - "priority": 0, - "data": {"subfeed_id": "sf_stream", "type": "subfeed", "method_name": "stream"}, - } - ], + "state_backend": "cursor", + "cursor_compress": True, + "data": { + "merger_id": "pos_deep", + "type": "merger_positional", + # Ensure positional positions exist on both page 1 (1,4) and page 2 (9,12) for limit=8. + "positions": [1, 4, 9, 12], + "positional": {"subfeed_id": "sf_p", "type": "subfeed", "method_name": "p"}, + "default": { + "merger_id": "pct_deep", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_d1", "type": "subfeed", "method_name": "d1"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_d2", "type": "subfeed", "method_name": "d2"}}, + ], + }, + }, } merger = parse_model(MergerDeduplication, config) @@ -325,693 +259,431 @@ async def test_deduplication_merger_redis_backend(redis_client) -> None: res_1 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=4, + limit=8, next_page=FeedResultNextPage(data={}), - redis_client=redis_client, - custom_deduplication_key="t1", ) + assert len(res_1.data) == 8 + _assert_no_dupes_in_page(res_1.data) + + # Positional ownership must hold even with deep defaults. + assert _sources(res_1.data)[0] == "P" # position 1 + assert _sources(res_1.data)[3] == "P" # position 4 + res_2 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=4, + limit=8, next_page=res_1.next_page, - redis_client=redis_client, - custom_deduplication_key="t1", ) - ids_1 = [x["id"] for x in res_1.data] - ids_2 = [x["id"] for x in res_2.data] + assert len(res_2.data) == 8 + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) - assert len(ids_1) == len(set(ids_1)) - assert len(ids_2) == len(set(ids_2)) - assert not (set(ids_1) & set(ids_2)) + assert _sources(res_2.data)[0] == "P" + assert _sources(res_2.data)[3] == "P" - # Redis backend should not store seen ids in cursor after. - assert "dedup_redis" in res_2.next_page.data - assert res_2.next_page.data["dedup_redis"].after is None - # Ensure fixture works for both sync/async redis. - key = "dedup:dedup_redis:u:t1" - members = redis_client.smembers(key) - if inspect.iscoroutine(members): - members = await members - assert len(members) >= len(set(ids_1 + ids_2)) +@pytest.mark.asyncio +async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cursors() -> None: + """When overfetch_factor>1, leaf cursors must be rewound to inspected count. + This is a regression test for the "safe overfetch" logic: we may request more + than we need from a leaf source, but we must not advance that leaf cursor past + un-inspected items. In a deep tree, this must hold for all descendant SubFeeds. 
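+
+ Concretely, with limit=8 below the positional leaf should end at after=2 and each 50%
+ percentage leaf at after=4 on page 1, even though internal requests may be up to three
+ times larger (overfetch_factor=3).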
+ """ -@pytest.mark.asyncio -async def test_deduplication_merger_priority_replacement_across_loops_cursor_backend() -> None: - # This test forces the higher-priority source to surface a duplicate only on a later call, - # so we exercise the in-page replacement logic. - # Important: dedup calls sources in descending priority. To ensure we exercise - # replacement, we need a lower-priority source to introduce id=5 *before* - # the high-priority source sees id=5 on a later refill loop. - low_items = [ - {"id": 5, "src": "low"}, - {"id": 6, "src": "low"}, - {"id": 7, "src": "low"}, - {"id": 99, "src": "low"}, - ] - mid_items = [ - {"id": 5, "src": "mid"}, - {"id": 98, "src": "mid"}, - {"id": 8, "src": "mid"}, - {"id": 9, "src": "mid"}, - ] - high_items = [ - {"id": 1, "src": "high"}, - {"id": 5, "src": "high"}, - {"id": 2, "src": "high"}, - {"id": 3, "src": "high"}, - ] + p_items = [{"id": 1000 + i, "src": "P"} for i in range(1, 200)] + d1_items = [{"id": i, "src": "D1"} for i in range(1, 200)] + d2_items = [{"id": 500 + i, "src": "D2"} for i in range(1, 200)] methods_dict = { - "low": make_offset_paged_method(low_items, max_per_call=1), - "mid": make_offset_paged_method(mid_items, max_per_call=1), - "high": make_offset_paged_method(high_items, max_per_call=1), + "p": make_offset_paged_method(p_items), + "d1": make_offset_paged_method(d1_items), + "d2": make_offset_paged_method(d2_items), } config = { - "merger_id": "dedup_priority_cursor", + "merger_id": "dedup_overfetch", "type": "merger_deduplication", "dedup_key": "id", "state_backend": "cursor", "cursor_compress": True, - "overfetch_factor": 1, - "max_refill_loops": 10, - "items": [ - {"priority": 100, "data": {"subfeed_id": "sf_high_p", "type": "subfeed", "method_name": "high"}}, - {"priority": 50, "data": {"subfeed_id": "sf_mid_p", "type": "subfeed", "method_name": "mid"}}, - {"priority": 0, "data": {"subfeed_id": "sf_low_p", "type": "subfeed", "method_name": "low"}}, - ], + "overfetch_factor": 3, + "data": { + "merger_id": "pos_overfetch", + "type": "merger_positional", + "positions": [1, 4, 9, 12], + "positional": {"subfeed_id": "sf_p", "type": "subfeed", "method_name": "p"}, + "default": { + "merger_id": "pct_overfetch", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_d1", "type": "subfeed", "method_name": "d1"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_d2", "type": "subfeed", "method_name": "d2"}}, + ], + }, + }, } - res_1 = await parse_model(MergerDeduplication, config).get_data( + merger = parse_model(MergerDeduplication, config) + + # Page 1 + res_1 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=4, + limit=8, next_page=FeedResultNextPage(data={}), ) - # Ensure id=5 is present and comes from highest priority, even though low/mid can surface it earlier. - winners = {x["id"]: x["src"] for x in res_1.data} - assert winners[5] == "high" - _assert_dedup_backend_state(res=res_1, merger_id="dedup_priority_cursor", state_backend="cursor") + assert len(res_1.data) == 8 + _assert_no_dupes_in_page(res_1.data) + # Deep descendant cursors: positional leaf requests 2 items; percentage leaves request 4 each. + # With overfetch_factor=3, internal calls may request 6/12, but cursor must not advance that far. 
+ assert res_1.next_page.data["sf_p"].after == 2 + assert res_1.next_page.data["sf_d1"].after == 4 + assert res_1.next_page.data["sf_d2"].after == 4 -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) -@pytest.mark.asyncio -async def test_deduplication_merger_priority_replacement_across_loops_redis_backend(redis_client) -> None: - low_items = [ - {"id": 5, "src": "low"}, - {"id": 6, "src": "low"}, - {"id": 7, "src": "low"}, - {"id": 99, "src": "low"}, - ] - mid_items = [ - {"id": 5, "src": "mid"}, - {"id": 98, "src": "mid"}, - {"id": 8, "src": "mid"}, - {"id": 9, "src": "mid"}, - ] - high_items = [ - {"id": 1, "src": "high"}, - {"id": 5, "src": "high"}, - {"id": 2, "src": "high"}, - {"id": 3, "src": "high"}, - ] - - methods_dict = { - "low": make_offset_paged_method(low_items, max_per_call=1), - "mid": make_offset_paged_method(mid_items, max_per_call=1), - "high": make_offset_paged_method(high_items, max_per_call=1), - } - - config = { - "merger_id": "dedup_priority_redis", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "overfetch_factor": 1, - "max_refill_loops": 10, - "items": [ - {"priority": 100, "data": {"subfeed_id": "sf_high_pr", "type": "subfeed", "method_name": "high"}}, - {"priority": 50, "data": {"subfeed_id": "sf_mid_pr", "type": "subfeed", "method_name": "mid"}}, - {"priority": 0, "data": {"subfeed_id": "sf_low_pr", "type": "subfeed", "method_name": "low"}}, - ], - } - - res_1 = await parse_model(MergerDeduplication, config).get_data( + # Page 2 (monotonic advancement, still no over-advancement) + res_2 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=4, - next_page=FeedResultNextPage(data={}), - redis_client=redis_client, - custom_deduplication_key="priority", + limit=8, + next_page=res_1.next_page, ) - winners = {x["id"]: x["src"] for x in res_1.data} - assert winners[5] == "high" - _assert_dedup_backend_state(res=res_1, merger_id="dedup_priority_redis", state_backend="redis") + assert len(res_2.data) == 8 + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + assert res_2.next_page.data["sf_p"].after == 4 + assert res_2.next_page.data["sf_d1"].after == 8 + assert res_2.next_page.data["sf_d2"].after == 8 -@pytest.mark.asyncio -async def test_deduplication_merger_with_append_and_three_sources_cursor_backend() -> None: - # Inner MergerAppend (two subfeeds) + two extra subfeeds as separate dedup items. - a_items = [{"id": i, "src": "a"} for i in range(1, 30)] - b_items = [{"id": i, "src": "b"} for i in range(10, 40)] - c_items = [{"id": i, "src": "c"} for i in range(20, 60)] - d_items = [{"id": i, "src": "d"} for i in range(25, 70)] - - # Cap each subfeed to 1 item per call so dedup must invoke all children - # (and therefore exercise nested cursor propagation). 
- methods_dict = { - "a": make_offset_paged_method(a_items, max_per_call=1), - "b": make_offset_paged_method(b_items, max_per_call=1), - "c": make_offset_paged_method(c_items, max_per_call=1), - "d": make_offset_paged_method(d_items, max_per_call=1), - } + _assert_cursor_monotonic_if_present( + res_1, + res_2, + keys=["sf_p", "sf_d1", "sf_d2", "pos_overfetch", "dedup_overfetch"], + ) - append_config = { - "merger_id": "inner_append_unused", - "type": "merger_append", - "items": [ - {"subfeed_id": "sf_a_append", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b_append", "type": "subfeed", "method_name": "b"}, - ], - } + +@pytest.mark.asyncio +async def test_dedup_page_zero_resets_seen_and_descendant_cursors() -> None: + items = [{"id": i, "src": "S"} for i in range(1, 50)] + methods_dict = {"s": make_offset_paged_method(items)} config = { - "merger_id": "dedup_with_append_cursor", + "merger_id": "dedup_reset", "type": "merger_deduplication", "dedup_key": "id", "state_backend": "cursor", "cursor_compress": True, - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": append_config}, - {"priority": 5, "data": {"subfeed_id": "sf_c", "type": "subfeed", "method_name": "c"}}, - {"priority": 0, "data": {"subfeed_id": "sf_d", "type": "subfeed", "method_name": "d"}}, - ], - } - - res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=15) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_with_append_cursor", state_backend="cursor") - - # Cursor correctness: descendant subfeed cursors exist and advance. - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_a_append", "sf_b_append", "sf_c", "sf_d"]) - - -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) -@pytest.mark.asyncio -async def test_deduplication_merger_with_append_and_three_sources_redis_backend(redis_client) -> None: - a_items = [{"id": i, "src": "a"} for i in range(1, 30)] - b_items = [{"id": i, "src": "b"} for i in range(10, 40)] - c_items = [{"id": i, "src": "c"} for i in range(20, 60)] - d_items = [{"id": i, "src": "d"} for i in range(25, 70)] - - methods_dict = { - "a": make_offset_paged_method(a_items, max_per_call=1), - "b": make_offset_paged_method(b_items, max_per_call=1), - "c": make_offset_paged_method(c_items, max_per_call=1), - "d": make_offset_paged_method(d_items, max_per_call=1), - } - - append_config = { - "merger_id": "inner_append_unused_r", - "type": "merger_append", - "items": [ - {"subfeed_id": "sf_a_append_r", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b_append_r", "type": "subfeed", "method_name": "b"}, - ], + "data": {"subfeed_id": "sf_stream", "type": "subfeed", "method_name": "s"}, } - config = { - "merger_id": "dedup_with_append_redis", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": append_config}, - {"priority": 5, "data": {"subfeed_id": "sf_c_r", "type": "subfeed", "method_name": "c"}}, - {"priority": 0, "data": {"subfeed_id": "sf_d_r", "type": "subfeed", "method_name": "d"}}, - ], - } + merger = parse_model(MergerDeduplication, config) - res_1, res_2 = await _run_two_pages( - config=config, + res_1 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=15, - redis_client_instance=redis_client, - custom_deduplication_key="append", + limit=5, + next_page=FeedResultNextPage(data={}), ) - 
_assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_with_append_redis", state_backend="redis") - - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_a_append_r", "sf_b_append_r", "sf_c_r", "sf_d_r"]) + assert _ids(res_1.data) == [1, 2, 3, 4, 5] + # Simulate client "full reload": page=0 for the dedup merger. + # Also include the stale descendant cursor; dedup should clear it. + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage( + data={ + "dedup_reset": FeedResultNextPageInside(page=0, after=res_1.next_page.data["dedup_reset"].after), + "sf_stream": res_1.next_page.data["sf_stream"], + } + ), + ) -@pytest.mark.asyncio -async def test_deduplication_merger_with_percentage_cursor_backend() -> None: - a_items = [{"id": i, "src": "pa"} for i in range(1, 60)] - b_items = [{"id": i, "src": "pb"} for i in range(30, 90)] - c_items = [{"id": i, "src": "pc"} for i in range(40, 120)] - - methods_dict = { - "pa": make_offset_paged_method(a_items, max_per_call=1), - "pb": make_offset_paged_method(b_items, max_per_call=1), - "pc": make_offset_paged_method(c_items, max_per_call=1), - } - - percentage_config = { - "merger_id": "inner_percentage_unused", - "type": "merger_percentage", - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_pa", "type": "subfeed", "method_name": "pa"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_pb", "type": "subfeed", "method_name": "pb"}}, - ], - } - - config = { - "merger_id": "dedup_percentage_cursor", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "overfetch_factor": 2, - "items": [ - {"priority": 0, "data": percentage_config}, - {"priority": 10, "data": {"subfeed_id": "sf_pc", "type": "subfeed", "method_name": "pc"}}, - {"priority": 5, "data": {"subfeed_id": "sf_pd", "type": "subfeed", "method_name": "pa"}}, - ], - } - - res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=20) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_percentage_cursor", state_backend="cursor") - - for key in ("sf_pa", "sf_pb", "sf_pc"): - assert key in res_1.next_page.data - assert isinstance(res_1.next_page.data[key].after, int) - - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_pa", "sf_pb", "sf_pc", "sf_pd"]) + # Must restart from the beginning. + assert _ids(res_2.data) == [1, 2, 3, 4, 5] @pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio -async def test_deduplication_merger_with_percentage_redis_backend(redis_client) -> None: - a_items = [{"id": i, "src": "pa"} for i in range(1, 60)] - b_items = [{"id": i, "src": "pb"} for i in range(30, 90)] - c_items = [{"id": i, "src": "pc"} for i in range(40, 120)] +async def test_dedup_redis_backend_cross_page(redis_client) -> None: + items_a = [{"id": i, "src": "A"} for i in range(1, 300)] + # Same IDs as A to force cross-source duplicates. 
+ items_b = [{"id": i, "src": "B"} for i in range(1, 300)] methods_dict = { - "pa": make_offset_paged_method(a_items, max_per_call=1), - "pb": make_offset_paged_method(b_items, max_per_call=1), - "pc": make_offset_paged_method(c_items, max_per_call=1), - } - - percentage_config = { - "merger_id": "inner_percentage_unused_r", - "type": "merger_percentage", - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_pa_r", "type": "subfeed", "method_name": "pa"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_pb_r", "type": "subfeed", "method_name": "pb"}}, - ], + "a": make_offset_paged_method(items_a), + "b": make_offset_paged_method(items_b), } config = { - "merger_id": "dedup_percentage_redis", + "merger_id": "dedup_redis", "type": "merger_deduplication", "dedup_key": "id", "state_backend": "redis", "state_ttl_seconds": 60, - "overfetch_factor": 2, - "items": [ - {"priority": 0, "data": percentage_config}, - {"priority": 10, "data": {"subfeed_id": "sf_pc_r", "type": "subfeed", "method_name": "pc"}}, - {"priority": 5, "data": {"subfeed_id": "sf_pd_r", "type": "subfeed", "method_name": "pa"}}, - ], + "data": { + "merger_id": "pct_mix", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, + ], + }, } - res_1, res_2 = await _run_two_pages( - config=config, + merger = parse_model(MergerDeduplication, config) + + res_1 = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=20, - redis_client_instance=redis_client, - custom_deduplication_key="percentage", + limit=10, + next_page=FeedResultNextPage(data={}), + redis_client=redis_client, + custom_deduplication_key="t1", ) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_percentage_redis", state_backend="redis") - - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_pa_r", "sf_pb_r", "sf_pc_r", "sf_pd_r"]) - - -@pytest.mark.asyncio -async def test_deduplication_merger_with_positional_cursor_backend() -> None: - # MergerPositional carries its own merger cursor; verify it survives nesting in dedup. - pos_items = [{"id": i, "src": "pos"} for i in range(1, 100)] - def_items = [{"id": i, "src": "def"} for i in range(50, 140)] - extra_items = [{"id": i, "src": "extra"} for i in range(80, 180)] - - methods_dict = { - "pos": make_offset_paged_method(pos_items, max_per_call=1), - "def": make_offset_paged_method(def_items, max_per_call=1), - "extra": make_offset_paged_method(extra_items, max_per_call=1), - } - positional_config = { - "merger_id": "inner_positional", - "type": "merger_positional", - "positions": [0, 2, 4, 6, 8], - "positional": {"subfeed_id": "sf_positional", "type": "subfeed", "method_name": "pos"}, - "default": {"subfeed_id": "sf_default", "type": "subfeed", "method_name": "def"}, - } + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=res_1.next_page, + redis_client=redis_client, + custom_deduplication_key="t1", + ) - config = { - "merger_id": "dedup_positional_cursor", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "overfetch_factor": 2, - "items": [ - # Positional must run; it owns its own merger cursor entry. 
- {"priority": 10, "data": positional_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra2", "type": "subfeed", "method_name": "extra"}}, - ], - } + _assert_no_dupes_in_page(res_1.data) + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) - res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=20) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_positional_cursor", state_backend="cursor") - assert "inner_positional" in res_1.next_page.data - assert "inner_positional" in res_2.next_page.data + # Redis backend should not store seen ids in cursor after. + assert "dedup_redis" in res_2.next_page.data + assert res_2.next_page.data["dedup_redis"].after is None - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_positional", "sf_default", "sf_extra", "sf_extra2"]) + # Ensure state is persisted in Redis. + key = "dedup:dedup_redis:u:t1" + members = redis_client.zrange(key, 0, -1) + if inspect.iscoroutine(members): + members = await members + assert len(members) >= len(set(_ids(res_1.data) + _ids(res_2.data))) -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio -async def test_deduplication_merger_with_positional_redis_backend(redis_client) -> None: - pos_items = [{"id": i, "src": "pos"} for i in range(1, 100)] - def_items = [{"id": i, "src": "def"} for i in range(50, 140)] - extra_items = [{"id": i, "src": "extra"} for i in range(80, 180)] +async def test_dedup_append_distribute_cursor_backend_no_dupes() -> None: + items_a = [{"id": i, "user_id": f"u{i%3}", "src": "A"} for i in range(1, 200)] + items_b = [{"id": i, "user_id": f"u{i%3}", "src": "B"} for i in range(1, 200)] methods_dict = { - "pos": make_offset_paged_method(pos_items, max_per_call=1), - "def": make_offset_paged_method(def_items, max_per_call=1), - "extra": make_offset_paged_method(extra_items, max_per_call=1), - } - - positional_config = { - "merger_id": "inner_positional_r", - "type": "merger_positional", - "positions": [0, 2, 4, 6, 8], - "positional": {"subfeed_id": "sf_positional_r", "type": "subfeed", "method_name": "pos"}, - "default": {"subfeed_id": "sf_default_r", "type": "subfeed", "method_name": "def"}, + "a": make_offset_paged_method(items_a), + "b": make_offset_paged_method(items_b), } config = { - "merger_id": "dedup_positional_redis", + "merger_id": "dedup_dist", "type": "merger_deduplication", "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": positional_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra_r", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra2_r", "type": "subfeed", "method_name": "extra"}}, - ], + "state_backend": "cursor", + "cursor_compress": True, + "data": { + "merger_id": "dist", + "type": "merger_distribute", + "distribution_key": "user_id", + "items": [ + {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + ], + }, } - res_1, res_2 = await _run_two_pages( - config=config, + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=20, - redis_client_instance=redis_client, - custom_deduplication_key="positional", + limit=30, + 
next_page=FeedResultNextPage(data={}), ) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_positional_redis", state_backend="redis") - assert "inner_positional_r" in res_1.next_page.data - _assert_cursor_monotonic_if_present( - res_1, - res_2, - ["sf_positional_r", "sf_default_r", "sf_extra_r", "sf_extra2_r"], - ) + assert len(res.data) == 30 + _assert_no_dupes_in_page(res.data) @pytest.mark.asyncio -async def test_deduplication_merger_with_percentage_gradient_cursor_backend() -> None: - from_items = [{"id": i, "src": "from"} for i in range(1, 140)] - to_items = [{"id": i, "src": "to"} for i in range(60, 200)] - extra_items = [{"id": i, "src": "extra"} for i in range(120, 300)] - - methods_dict = { - "from": make_offset_paged_method(from_items, max_per_call=1), - "to": make_offset_paged_method(to_items, max_per_call=1), - "extra": make_offset_paged_method(extra_items, max_per_call=1), - } - - gradient_config = { - "merger_id": "inner_gradient", - "type": "merger_percentage_gradient", - "item_from": {"percentage": 80, "data": {"subfeed_id": "sf_from", "type": "subfeed", "method_name": "from"}}, - "item_to": {"percentage": 20, "data": {"subfeed_id": "sf_to", "type": "subfeed", "method_name": "to"}}, - "step": 10, - "size_to_step": 10, - } - - config = { - "merger_id": "dedup_gradient_cursor", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": gradient_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra_g", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra_g2", "type": "subfeed", "method_name": "extra"}}, - ], - } - - res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=25) - _assert_two_pages_no_overlap(res_1, res_2) - assert "inner_gradient" in res_1.next_page.data - assert "inner_gradient" in res_2.next_page.data - _assert_dedup_backend_state(res=res_2, merger_id="dedup_gradient_cursor", state_backend="cursor") - - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_from", "sf_to", "sf_extra_g", "sf_extra_g2"]) +async def test_dedup_in_page_deletion_priority_keeps_high_priority_even_if_config_order_is_low_first() -> None: + """High dedup_priority source must not be deleted even if called later in config order. + We use a percentage merger where both branches have overlapping ids. + The "high" branch is second in config, but has higher dedup_priority. 
+ """ -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) -@pytest.mark.asyncio -async def test_deduplication_merger_with_percentage_gradient_redis_backend(redis_client) -> None: - from_items = [{"id": i, "src": "from"} for i in range(1, 140)] - to_items = [{"id": i, "src": "to"} for i in range(60, 200)] - extra_items = [{"id": i, "src": "extra"} for i in range(120, 300)] + low_items = [{"id": i, "src": "low"} for i in range(1, 200)] + high_items = [{"id": i, "src": "high"} for i in range(1, 200)] methods_dict = { - "from": make_offset_paged_method(from_items, max_per_call=1), - "to": make_offset_paged_method(to_items, max_per_call=1), - "extra": make_offset_paged_method(extra_items, max_per_call=1), - } - - gradient_config = { - "merger_id": "inner_gradient_r", - "type": "merger_percentage_gradient", - "item_from": {"percentage": 80, "data": {"subfeed_id": "sf_from_r", "type": "subfeed", "method_name": "from"}}, - "item_to": {"percentage": 20, "data": {"subfeed_id": "sf_to_r", "type": "subfeed", "method_name": "to"}}, - "step": 10, - "size_to_step": 10, + "low": make_offset_paged_method(low_items), + "high": make_offset_paged_method(high_items), } config = { - "merger_id": "dedup_gradient_redis", + "merger_id": "dedup_priority", "type": "merger_deduplication", "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": gradient_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra_gr", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra_gr2", "type": "subfeed", "method_name": "extra"}}, - ], + "state_backend": "cursor", + "cursor_compress": True, + "data": { + "merger_id": "pct", + "type": "merger_percentage", + "shuffle": False, + "items": [ + { + "percentage": 50, + "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low", "dedup_priority": 0}, + }, + { + "percentage": 50, + "data": {"subfeed_id": "sf_high", "type": "subfeed", "method_name": "high", "dedup_priority": 100}, + }, + ], + }, } - res_1, res_2 = await _run_two_pages( - config=config, + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=25, - redis_client_instance=redis_client, - custom_deduplication_key="gradient", + limit=10, + next_page=FeedResultNextPage(data={}), ) - _assert_two_pages_no_overlap(res_1, res_2) - assert "inner_gradient_r" in res_1.next_page.data - _assert_dedup_backend_state(res=res_2, merger_id="dedup_gradient_redis", state_backend="redis") - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_from_r", "sf_to_r", "sf_extra_gr", "sf_extra_gr2"]) + _assert_no_dupes_in_page(res.data) + # Priority is about which source "wins" for a given dedup_key, not about output order. + # With 50/50 limits, the high-priority branch should supply ids 1..5, while the low-priority + # branch will be advanced to avoid duplicates. + winning = {item["id"]: item["src"] for item in res.data} + assert all(winning[i] == "high" for i in range(1, 6)) -@pytest.mark.parametrize("state_backend", ["cursor", "redis"]) -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio -async def test_deduplication_merger_with_view_session_child(state_backend, redis_client) -> None: - # MergerViewSession always requires Redis, so this test always uses redis_client. - # We still validate both dedup state backends. 
- base_items = [{"id": i, "src": "vs"} for i in range(1, 200)] - extra_items = [{"id": i, "src": "extra"} for i in range(50, 260)] +async def test_dedup_percentage_gradient_slot_ownership_cursor_backend() -> None: + """Dedup must preserve gradient chunking semantics. - methods_dict = { - "vs": make_offset_paged_method(base_items), - "extra": make_offset_paged_method(extra_items), - } + For limit=10, size_to_step=5, from/to percentages should yield chunks: + - first 5: 3 from A, 2 from B + - next 5: 2 from A, 3 from B + Dedup must refill within each leaf so these chunk sizes remain true. + """ - view_session_config = { - "merger_id": "inner_view_session", - "type": "merger_view_session", - "session_size": 60, - "session_live_time": 60, - "data": {"subfeed_id": "sf_vs", "type": "subfeed", "method_name": "vs"}, - "deduplicate": True, - "dedup_key": "id", + a_items = [{"id": i, "src": "A"} for i in range(1, 300)] + # Start with duplicates, then provide unique tail. + b_items = ([{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)]) + + methods_dict = { + "a": make_offset_paged_method(a_items), + "b": make_offset_paged_method(b_items), } config = { - "merger_id": f"dedup_vs_{state_backend}", + "merger_id": "dedup_gradient", "type": "merger_deduplication", "dedup_key": "id", - "state_backend": state_backend, - "state_ttl_seconds": 60, - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": view_session_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra_vs", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra_vs2", "type": "subfeed", "method_name": "extra"}}, - ], + "state_backend": "cursor", + "cursor_compress": True, + "max_refill_loops": 50, + "data": { + "merger_id": "grad_mix", + "type": "merger_percentage_gradient", + "item_from": { + "percentage": 60, + "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, + }, + "item_to": { + "percentage": 40, + "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + }, + "step": 20, + "size_to_step": 5, + "shuffle": False, + }, } - res_1, res_2 = await _run_two_pages( - config=config, + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=20, - redis_client_instance=redis_client, - custom_deduplication_key=f"vs_{state_backend}", - custom_view_session_key=f"vs_{state_backend}", + limit=10, + next_page=FeedResultNextPage(data={}), ) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id=f"dedup_vs_{state_backend}", state_backend=state_backend) - assert "inner_view_session" in res_1.next_page.data - - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_vs", "sf_extra_vs", "sf_extra_vs2"]) - - -@pytest.mark.asyncio -async def test_deduplication_merger_with_append_distribute_cursor_backend() -> None: - # MergerAppendDistribute (type merger_distribute) + two extra subfeeds. 
- s1 = [{"id": i, "src": "s1", "group": "g1" if i % 2 == 0 else "g2"} for i in range(1, 120)] - s2 = [{"id": i, "src": "s2", "group": "g2" if i % 3 == 0 else "g3"} for i in range(60, 200)] - extra = [{"id": i, "src": "extra", "group": "g9"} for i in range(100, 240)] - methods_dict = { - "s1": make_offset_paged_method(s1, max_per_call=1), - "s2": make_offset_paged_method(s2, max_per_call=1), - "extra": make_offset_paged_method(extra, max_per_call=1), - } - - distribute_config = { - "merger_id": "inner_distribute_unused", - "type": "merger_distribute", - "distribution_key": "group", - "items": [ - {"subfeed_id": "sf_s1", "type": "subfeed", "method_name": "s1"}, - {"subfeed_id": "sf_s2", "type": "subfeed", "method_name": "s2"}, - ], - } + assert len(res.data) == 10 + _assert_no_dupes_in_page(res.data) - config = { - "merger_id": "dedup_distribute_cursor", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": distribute_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra_dist", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra_dist2", "type": "subfeed", "method_name": "extra"}}, - ], - } + sources = _sources(res.data) + assert sources[:3] == ["A", "A", "A"] + assert sources[3:5] == ["B", "B"] + assert sources[5:7] == ["A", "A"] + assert sources[7:10] == ["B", "B", "B"] - res_1, res_2 = await _run_two_pages(config=config, methods_dict=methods_dict, user_id="u", limit=25) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_distribute_cursor", state_backend="cursor") - for key in ("sf_s1", "sf_s2"): - assert key in res_1.next_page.data - _assert_cursor_monotonic_if_present(res_1, res_2, ["sf_s1", "sf_s2", "sf_extra_dist", "sf_extra_dist2"]) +@pytest.mark.asyncio +async def test_dedup_preserves_append_priority_and_advances_cursors_cursor_backend() -> None: + """Append order is the priority signal; dedup must not let later sources win duplicates. + Also asserts that a leaf cursor advances even when items are skipped as duplicates. + """ -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) -@pytest.mark.asyncio -async def test_deduplication_merger_with_append_distribute_redis_backend(redis_client) -> None: - s1 = [{"id": i, "src": "s1", "group": "g1" if i % 2 == 0 else "g2"} for i in range(1, 120)] - s2 = [{"id": i, "src": "s2", "group": "g2" if i % 3 == 0 else "g3"} for i in range(60, 200)] - extra = [{"id": i, "src": "extra", "group": "g9"} for i in range(100, 240)] + a_items = [ + {"id": 1, "src": "A"}, + {"id": 2, "src": "A"}, + ] + # B repeats A's ids first, then continues with unique ids. 
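+    # (range(1, 50) starts at id 1, so B's first two items duplicate A's before unique ids appear.)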
+ b_items = [{"id": i, "src": "B"} for i in range(1, 50)] methods_dict = { - "s1": make_offset_paged_method(s1, max_per_call=1), - "s2": make_offset_paged_method(s2, max_per_call=1), - "extra": make_offset_paged_method(extra, max_per_call=1), - } - - distribute_config = { - "merger_id": "inner_distribute_unused_r", - "type": "merger_distribute", - "distribution_key": "group", - "items": [ - {"subfeed_id": "sf_s1_r", "type": "subfeed", "method_name": "s1"}, - {"subfeed_id": "sf_s2_r", "type": "subfeed", "method_name": "s2"}, - ], + "a": make_offset_paged_method(a_items), + "b": make_offset_paged_method(b_items), } config = { - "merger_id": "dedup_distribute_redis", + "merger_id": "dedup_append", "type": "merger_deduplication", "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "overfetch_factor": 2, - "items": [ - {"priority": 10, "data": distribute_config}, - {"priority": 5, "data": {"subfeed_id": "sf_extra_dist_r", "type": "subfeed", "method_name": "extra"}}, - {"priority": 0, "data": {"subfeed_id": "sf_extra_dist2_r", "type": "subfeed", "method_name": "extra"}}, - ], + "state_backend": "cursor", + "cursor_compress": True, + "max_refill_loops": 20, + "data": { + "merger_id": "append_mix", + "type": "merger_append", + "shuffle": False, + "items": [ + {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + ], + }, } - res_1, res_2 = await _run_two_pages( - config=config, + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( methods_dict=methods_dict, user_id="u", - limit=25, - redis_client_instance=redis_client, - custom_deduplication_key="distribute", + limit=5, + next_page=FeedResultNextPage(data={}), ) - _assert_two_pages_no_overlap(res_1, res_2) - _assert_dedup_backend_state(res=res_2, merger_id="dedup_distribute_redis", state_backend="redis") - _assert_cursor_monotonic_if_present( - res_1, - res_2, - ["sf_s1_r", "sf_s2_r", "sf_extra_dist_r", "sf_extra_dist2_r"], - ) + assert _ids(res.data) == [1, 2, 3, 4, 5] + assert _sources(res.data)[:2] == ["A", "A"] + assert _sources(res.data)[2:] == ["B", "B", "B"] + + # B had to scan past duplicated ids 1 and 2, so its cursor should advance + # farther than the number of items it contributed to the final page. + assert "sf_b" in res.next_page.data + assert isinstance(res.next_page.data["sf_b"].after, int) + b_contributed = sum(1 for x in res.data if x.get("src") == "B") + assert res.next_page.data["sf_b"].after > b_contributed diff --git a/tests/test_parsing_config.py b/tests/test_parsing_config.py index 291971b..66c2387 100644 --- a/tests/test_parsing_config.py +++ b/tests/test_parsing_config.py @@ -54,6 +54,5 @@ async def test_parsing_config_deduplication_merger() -> None: assert isinstance(feed_manager.feed_config, FeedConfig) assert isinstance(feed_manager.feed_config.feed, MergerDeduplication) - assert len(feed_manager.feed_config.feed.items) == 2 - assert feed_manager.feed_config.feed.items[0].priority == 100 - assert isinstance(feed_manager.feed_config.feed.items[0].data, SubFeed) + # Deduplication merger is a wrapper around a single child feed. + assert isinstance(feed_manager.feed_config.feed.data, (MergerPercentage, SubFeed)) From 3ed4a094b70fb659243ece10b4bbc3af3841654e Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 22:36:17 +0000 Subject: [PATCH 05/33] More tests. 
--- tests/test_merger_deduplication.py | 449 +++++++++++++++++++++++++++++ 1 file changed, 449 insertions(+) diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 5b046a8..8347e3d 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -71,6 +71,100 @@ def _assert_pages_no_overlap(res_1, res_2): assert not (set(_ids(res_1.data)) & set(_ids(res_2.data))) +def _inner_append_config(*, merger_id: str, subfeed_id: str, method_name: str, dedup_priority: int): + return { + "merger_id": merger_id, + "type": "merger_append", + # Important: dedup deletion priority must be visible at this node so parent mergers + # can fetch higher-priority subtrees first when a dedup wrapper is active. + "dedup_priority": dedup_priority, + "shuffle": False, + "items": [ + { + "subfeed_id": subfeed_id, + "type": "subfeed", + "method_name": method_name, + "dedup_priority": dedup_priority, + } + ], + } + + +def _build_deep_priority_tree_for_merger_type(*, merger_type: str): + """Return a deep tree config where low/high leaves overlap by id. + + The inner leaves are wrapped into an append merger to ensure a "deep" tree even + when the outer merger is flat. + """ + + low = _inner_append_config(merger_id="inner_low", subfeed_id="sf_low", method_name="low", dedup_priority=0) + high = _inner_append_config(merger_id="inner_high", subfeed_id="sf_high", method_name="high", dedup_priority=100) + + if merger_type == "merger_append": + return { + "merger_id": "outer_append", + "type": "merger_append", + "shuffle": False, + # Put low first intentionally; priority must still make high win for overlapping ids. + "items": [low, high], + } + + if merger_type == "merger_distribute": + return { + "merger_id": "outer_dist", + "type": "merger_distribute", + "distribution_key": "user_id", + # Put low first intentionally. + "items": [low, high], + } + + if merger_type == "merger_percentage": + return { + "merger_id": "outer_pct", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": low}, + {"percentage": 50, "data": high}, + ], + } + + if merger_type == "merger_percentage_gradient": + return { + "merger_id": "outer_grad", + "type": "merger_percentage_gradient", + "item_from": {"percentage": 60, "data": low}, + "item_to": {"percentage": 40, "data": high}, + "step": 20, + "size_to_step": 5, + "shuffle": False, + } + + if merger_type == "merger_positional": + # High priority on positional branch so it must win duplicates. + high_pos = _inner_append_config( + merger_id="inner_pos_high", + subfeed_id="sf_high", + method_name="high", + dedup_priority=100, + ) + low_def = _inner_append_config( + merger_id="inner_def_low", + subfeed_id="sf_low", + method_name="low", + dedup_priority=0, + ) + return { + "merger_id": "outer_pos", + "type": "merger_positional", + "positions": [1, 3, 5, 7, 9, 11], + "positional": high_pos, + "default": low_def, + } + + raise AssertionError(f"Unknown merger_type: {merger_type}") + + @pytest.mark.asyncio async def test_dedup_positional_slot_ownership_cursor_backend() -> None: """Positional slots must remain owned by the positional branch. 
@@ -285,6 +379,79 @@ async def test_dedup_deep_tree_cursor_backend() -> None: assert _sources(res_2.data)[3] == "P" +@pytest.mark.parametrize( + "merger_type", + [ + "merger_append", + "merger_distribute", + "merger_positional", + "merger_percentage", + "merger_percentage_gradient", + ], +) +@pytest.mark.asyncio +async def test_dedup_deletion_priority_works_for_deep_trees_all_merger_types(merger_type: str) -> None: + """Deletion priority must work even in deep trees, across merger types. + + For overlapping ids, higher dedup_priority leaf must supply the winning entity. + """ + + # For mixing mergers (percentage/gradient/positional), identical id ranges are enough: the + # high-priority leaf should claim the first chunk of ids and the other leaf must skip them. + # + # For append/distribute, we must ensure BOTH branches contribute to the output (otherwise + # "priority" is unobservable because earlier branches can fill the page). We do that by + # making the low branch short and duplicate-heavy. + if merger_type in {"merger_append", "merger_distribute"}: + low_items = [ + {"id": 1, "user_id": "u0", "src": "low"}, + {"id": 2, "user_id": "u1", "src": "low"}, + {"id": 3, "user_id": "u2", "src": "low"}, + {"id": 1000, "user_id": "u0", "src": "low"}, + {"id": 1001, "user_id": "u1", "src": "low"}, + ] + high_items = [{"id": i, "user_id": f"u{i%3}", "src": "high"} for i in range(1, 200)] + else: + low_items = [{"id": i, "user_id": f"u{i%3}", "src": "low"} for i in range(1, 200)] + high_items = [{"id": i, "user_id": f"u{i%3}", "src": "high"} for i in range(1, 200)] + + methods_dict = { + "low": make_offset_paged_method(low_items), + "high": make_offset_paged_method(high_items), + } + + deep_tree = _build_deep_priority_tree_for_merger_type(merger_type=merger_type) + config = { + "merger_id": f"dedup_priority_deep_{merger_type}", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "data": deep_tree, + } + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=FeedResultNextPage(data={}), + ) + + _assert_no_dupes_in_page(res.data) + + # Priority is about which source wins overlapping ids (not about output order). + winning = {item["id"]: item["src"] for item in res.data} + assert all(winning[i] == "high" for i in range(1, 6) if i in winning) + + # Placement invariant for positional: positional slots must still be owned by positional branch. + if merger_type == "merger_positional": + sources = _sources(res.data) + assert sources[0] == "high" + assert sources[2] == "high" + assert sources[4] == "high" + + @pytest.mark.asyncio async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cursors() -> None: """When overfetch_factor>1, leaf cursors must be rewound to inspected count. @@ -341,6 +508,15 @@ async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cu assert len(res_1.data) == 8 _assert_no_dupes_in_page(res_1.data) + # Dedup merger cursor must exist and advance page. + assert "dedup_overfetch" in res_1.next_page.data + assert res_1.next_page.data["dedup_overfetch"].page == 2 + assert res_1.next_page.data["dedup_overfetch"].after is not None + + # Positional merger cursor must exist and advance page. + assert "pos_overfetch" in res_1.next_page.data + assert res_1.next_page.data["pos_overfetch"].page == 2 + # Deep descendant cursors: positional leaf requests 2 items; percentage leaves request 4 each. 
# With overfetch_factor=3, internal calls may request 6/12, but cursor must not advance that far. assert res_1.next_page.data["sf_p"].after == 2 @@ -359,6 +535,9 @@ async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cu _assert_no_dupes_in_page(res_2.data) _assert_pages_no_overlap(res_1, res_2) + assert res_2.next_page.data["dedup_overfetch"].page == 3 + assert res_2.next_page.data["pos_overfetch"].page == 3 + assert res_2.next_page.data["sf_p"].after == 4 assert res_2.next_page.data["sf_d1"].after == 8 assert res_2.next_page.data["sf_d2"].after == 8 @@ -477,6 +656,276 @@ async def test_dedup_redis_backend_cross_page(redis_client) -> None: assert len(members) >= len(set(_ids(res_1.data) + _ids(res_2.data))) +@pytest.mark.asyncio +async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf_cursor_exactly() -> None: + """Append: across pages there is no overlap; refill advances cursors correctly. + + This uses a max_per_call=1 method for the duplicate-heavy leaf so the wrapper + must do multiple client calls (refill loops). + """ + + a_items = [{"id": 1, "src": "A"}, {"id": 2, "src": "A"}] + b_items = [{"id": i, "src": "B"} for i in range(1, 50)] + + methods_dict = { + "a": make_offset_paged_method(a_items), + # Force multiple internal calls. + "b": make_offset_paged_method(b_items, max_per_call=1), + } + + config = { + "merger_id": "dedup_append_pages", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "max_refill_loops": 50, + "data": { + "merger_id": "append_mix", + "type": "merger_append", + "shuffle": False, + "items": [ + {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + ], + }, + } + + merger = parse_model(MergerDeduplication, config) + + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage(data={}), + ) + + assert _ids(res_1.data) == [1, 2, 3, 4, 5] + assert res_1.next_page.data["dedup_append_pages"].page == 2 + + # In dedup-active append mode, each child is requested with the full page limit (5). + # B must therefore collect 5 unique items while skipping 2 duplicates -> scan ids 1..7. + assert res_1.next_page.data["sf_b"].after == 7 + + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=res_1.next_page, + ) + + assert len(res_2.data) == 5 + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + + +@pytest.mark.asyncio +async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_refill() -> None: + """Distribute: duplicates skipped per-leaf and page slices don't overlap.""" + + # A is short so B must contribute. + items_a = [{"id": i, "user_id": f"u{i%2}", "src": "A"} for i in range(1, 4)] + # B overlaps A by id and continues. 
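+    # (ids 1..3 exist in both sources; dedup must advance B past them, as asserted below.)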
+ items_b = [{"id": i, "user_id": f"u{i%2}", "src": "B"} for i in range(1, 200)] + + methods_dict = { + "a": make_offset_paged_method(items_a), + "b": make_offset_paged_method(items_b), + } + + config = { + "merger_id": "dedup_dist_pages", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "data": { + "merger_id": "dist", + "type": "merger_distribute", + "distribution_key": "user_id", + "items": [ + {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + ], + }, + } + + merger = parse_model(MergerDeduplication, config) + res_1 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={})) + res_2 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=res_1.next_page) + + assert len(res_1.data) == 10 + assert len(res_2.data) == 10 + _assert_no_dupes_in_page(res_1.data) + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + + # Placement/refill: B must skip duplicate ids 1..3 and still fill the page. + b_ids_1 = [x["id"] for x in res_1.data if x.get("src") == "B"] + assert b_ids_1 and min(b_ids_1) >= 4 + + +@pytest.mark.asyncio +async def test_dedup_percentage_gradient_cursor_backend_across_pages() -> None: + a_items = [{"id": i, "src": "A"} for i in range(1, 300)] + b_items = ([{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)]) + + methods_dict = { + "a": make_offset_paged_method(a_items), + "b": make_offset_paged_method(b_items), + } + + config = { + "merger_id": "dedup_grad_pages", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "max_refill_loops": 50, + "data": { + "merger_id": "grad_mix", + "type": "merger_percentage_gradient", + "item_from": {"percentage": 60, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, + "item_to": {"percentage": 40, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, + "step": 20, + "size_to_step": 5, + "shuffle": False, + }, + } + + merger = parse_model(MergerDeduplication, config) + res_1 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={})) + res_2 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=res_1.next_page) + + _assert_no_dupes_in_page(res_1.data) + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + + # Gradient merger cursor should exist and advance. 
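+    # ("grad_mix" is the inner gradient merger's own cursor entry, separate from the sf_a/sf_b leaf cursors.)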
+ assert res_1.next_page.data["grad_mix"].page == 2 + assert res_2.next_page.data["grad_mix"].page == 3 + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_dedup_redis_backend_cross_page_append(redis_client) -> None: + items_a = [{"id": i, "src": "A"} for i in range(1, 20)] + items_b = [{"id": i, "src": "B"} for i in range(1, 300)] + + methods_dict = { + "a": make_offset_paged_method(items_a), + "b": make_offset_paged_method(items_b), + } + + config = { + "merger_id": "dedup_redis_append", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "redis", + "state_ttl_seconds": 60, + "data": { + "merger_id": "append_mix", + "type": "merger_append", + "shuffle": False, + "items": [ + {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, + {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + ], + }, + } + + merger = parse_model(MergerDeduplication, config) + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=FeedResultNextPage(data={}), + redis_client=redis_client, + custom_deduplication_key="t2", + ) + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=res_1.next_page, + redis_client=redis_client, + custom_deduplication_key="t2", + ) + + _assert_no_dupes_in_page(res_1.data) + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + assert res_2.next_page.data["dedup_redis_append"].after is None + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_dedup_wrapper_with_view_session_merger(redis_client) -> None: + """Dedup wrapper must work when the child is a view_session merger.""" + + # Two leaves with overlapping ids; view_session computes a session once. + items_low = [{"id": i, "src": "low"} for i in range(1, 100)] + items_high = [{"id": i, "src": "high"} for i in range(1, 100)] + + methods_dict = { + "low": make_offset_paged_method(items_low), + "high": make_offset_paged_method(items_high), + } + + config = { + "merger_id": "dedup_vs", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "data": { + "merger_id": "vs", + "type": "merger_view_session", + "session_size": 30, + "session_live_time": 60, + "deduplicate": False, + "shuffle": False, + "data": { + "merger_id": "pct", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low", "dedup_priority": 0}}, + {"percentage": 50, "data": {"subfeed_id": "sf_high", "type": "subfeed", "method_name": "high", "dedup_priority": 100}}, + ], + }, + }, + } + + merger = parse_model(MergerDeduplication, config) + + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=FeedResultNextPage(data={}), + redis_client=redis_client, + custom_view_session_key="vs1", + ) + + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=res_1.next_page, + redis_client=redis_client, + custom_view_session_key="vs1", + ) + + _assert_no_dupes_in_page(res_1.data) + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + + # Deletion priority: for the overlapping early ids, the winning entity must be from high. 
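+    # (Both leaves expose ids 1..99; sf_high carries dedup_priority=100 versus sf_low's 0.)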
+ winning = {item["id"]: item["src"] for item in (res_1.data + res_2.data)} + assert all(winning[i] == "high" for i in range(1, 11) if i in winning) + + @pytest.mark.asyncio async def test_dedup_append_distribute_cursor_backend_no_dupes() -> None: items_a = [{"id": i, "user_id": f"u{i%3}", "src": "A"} for i in range(1, 200)] From aa2b608c60c848d9cd309eea3e4d426dd1be20fb Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 22:59:18 +0000 Subject: [PATCH 06/33] More tests and minor after fixes. --- smartfeed/schemas.py | 8 +- tests/test_merger_deduplication.py | 258 +++++++++++++++++++++++++++++ 2 files changed, 263 insertions(+), 3 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index 3320fb7..b1e8900 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1413,12 +1413,14 @@ async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPag remaining = limit - len(collected) # Safe oversampling: only when we can rewind integer-offset cursors. - can_overfetch = isinstance(next_page.after, (int, type(None))) + # IMPORTANT: `after` can be many shapes (str/dict/etc) and may start as None. + # We only enable overfetch when `after` is already an int offset. + can_overfetch = isinstance(next_page.after, int) request_limit = max(1, remaining) if can_overfetch and self.overfetch_factor > 1: request_limit = max(1, remaining * self.overfetch_factor) - start_after = 0 if next_page.after is None else int(next_page.after) + start_after: Optional[int] = int(next_page.after) if can_overfetch else None method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) if not isinstance(method_result, FeedResultClient): @@ -1475,7 +1477,7 @@ async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPag # If we oversampled with a simple integer cursor, rewind to the point we actually consumed. # This prevents skipping un-inspected items that were fetched but not needed. - if can_overfetch and request_limit > remaining: + if can_overfetch and request_limit > remaining and start_after is not None: end_after = next_page.after if isinstance(end_after, int) and end_after == start_after + len(method_result.data): next_page.after = start_after + consumed_in_batch diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 8347e3d..94d122a 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -28,6 +28,100 @@ async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unuse return _method +def make_string_after_paged_method(items, *, max_per_call=None, after_field="created_at"): + """A subfeed method whose cursor is a string (e.g. timestamp). + + Cursor semantics: `after` is the last returned `created_at` value (monotonic). 
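+    Assumes `items` are pre-sorted in ascending order of `after_field`.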
+ """ + + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + effective_limit = limit + if isinstance(max_per_call, int) and max_per_call > 0: + effective_limit = min(effective_limit, max_per_call) + + after = next_page.after + start_idx = 0 + if isinstance(after, str) and after: + # Find first item with created_at > after + for i, item in enumerate(items): + if str(item[after_field]) > after: + start_idx = i + break + else: + start_idx = len(items) + + result_data = items[start_idx : start_idx + effective_limit] + has_next_page = (start_idx + len(result_data)) < len(items) + + if result_data: + next_page.after = str(result_data[-1][after_field]) + next_page.page += 1 + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) + + return _method + + +def make_profile_dict_after_method( + profiles_to_items, + *, + max_per_call=None, + after_key="after", +): + """A subfeed method whose cursor is a dict of per-profile offsets. + + Example shape: after = {"p1": 0, "p2": 0} + Cursor semantics: each profile offset increments as items are *read*. + """ + + profile_ids = list(profiles_to_items.keys()) + + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + effective_limit = limit + if isinstance(max_per_call, int) and max_per_call > 0: + effective_limit = min(effective_limit, max_per_call) + + after = next_page.after + if not isinstance(after, dict): + after = {pid: 0 for pid in profile_ids} + else: + after = dict(after) + for pid in profile_ids: + after.setdefault(pid, 0) + + result = [] + has_next_page = False + + # Build a cyclic iteration over profiles. + active_profiles = [pid for pid in profile_ids] + + i = 0 + while active_profiles and len(result) < effective_limit: + pid = active_profiles[i % len(active_profiles)] + idx = after.get(pid, 0) + items = profiles_to_items.get(pid, []) + + if idx >= len(items): + # This profile is exhausted. + active_profiles.remove(pid) + continue + + result.append(items[idx]) + after[pid] = idx + 1 + i += 1 + + # Determine if any profile still has unread items. + for pid in profile_ids: + if after.get(pid, 0) < len(profiles_to_items.get(pid, [])): + has_next_page = True + break + + next_page.after = after + next_page.page += 1 + return FeedResultClient(data=result, next_page=next_page, has_next_page=has_next_page) + + return _method + + def _assert_cursor_monotonic_if_present(res_1, res_2, keys): for key in keys: if key not in res_1.next_page.data: @@ -719,6 +813,170 @@ async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf _assert_pages_no_overlap(res_1, res_2) +@pytest.mark.asyncio +async def test_dedup_refill_loops_advance_dict_after_cursor_not_just_page() -> None: + """Dedup refill loops must correctly advance dict-shaped `after` cursors.""" + + # A produces ids 1,2. + a_items = [{"id": 1, "src": "A"}, {"id": 2, "src": "A"}] + + # B produces ids 1.. in round-robin across profiles; cursor is per-profile offsets. + b_profiles = { + "p0": [{"id": 1, "src": "B"}, {"id": 3, "src": "B"}, {"id": 5, "src": "B"}, {"id": 7, "src": "B"}], + "p1": [{"id": 2, "src": "B"}, {"id": 4, "src": "B"}, {"id": 6, "src": "B"}, {"id": 8, "src": "B"}], + } + + methods_dict = { + "a": make_offset_paged_method(a_items), + "b": make_profile_dict_after_method(b_profiles), + } + + # Use a percentage merger so B is asked for a small limit (2 items for limit=4). + # This forces refill loops when B's first batch is all duplicates. 
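+    # A has dedup_priority=100, so its ids 1 and 2 win; B (priority 0) must be re-queried
+    # until it yields 3 and 4.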
+ config = { + "merger_id": "dedup_dict_after", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "max_refill_loops": 50, + "data": { + "merger_id": "pct_mix", + "type": "merger_percentage", + "shuffle": False, + "items": [ + { + "percentage": 50, + "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a", "dedup_priority": 100}, + }, + { + "percentage": 50, + "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b", "dedup_priority": 0}, + }, + ], + }, + } + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=4, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res.data) == 4 + _assert_no_dupes_in_page(res.data) + assert set(_ids(res.data)) == {1, 2, 3, 4} + assert "sf_b" in res.next_page.data + assert isinstance(res.next_page.data["sf_b"].after, dict) + + # B contributed 2 items (3,4) but must have *read* 4 items (1..4) to skip duplicates. + b_after = res.next_page.data["sf_b"].after + read_count = sum(int(v) for v in b_after.values()) + assert read_count == 4 + + +@pytest.mark.asyncio +async def test_dedup_overfetch_does_not_overadvance_non_int_after_cursor() -> None: + """overfetch_factor must not cause over-advancement for non-rewindable cursors.""" + + # Single subfeed with dict after cursor; no dedup skips should happen. + profiles = { + "p0": [{"id": 1, "src": "B"}, {"id": 3, "src": "B"}, {"id": 5, "src": "B"}, {"id": 7, "src": "B"}], + "p1": [{"id": 2, "src": "B"}, {"id": 4, "src": "B"}, {"id": 6, "src": "B"}, {"id": 8, "src": "B"}], + } + + methods_dict = { + "b": make_profile_dict_after_method(profiles), + } + + config = { + "merger_id": "dedup_nonint_overfetch", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "overfetch_factor": 5, + "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, + } + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=4, next_page=FeedResultNextPage(data={})) + + assert len(res.data) == 4 + after = res.next_page.data["sf_b"].after + assert isinstance(after, dict) + # If overfetch were incorrectly applied, we'd see more than 4 reads. + assert sum(int(v) for v in after.values()) == 4 + + +@pytest.mark.asyncio +async def test_dedup_overfetch_rewinds_offset_cursor_when_first_batch_all_duplicates() -> None: + """Overfetch should be safe: when we oversample, we must rewind offset cursors. + + Scenario: + - A (high priority) returns ids 1..5 + - B (low priority) initially returns only duplicates (1..5) + - On the next refill loop, B overfetches but must rewind `after` to inspected count + so it doesn't skip items. 
+ """ + + items_a = [{"id": i, "src": "A"} for i in range(1, 300)] + items_b = [{"id": i, "src": "B"} for i in range(1, 300)] + + methods_dict = { + "a": make_offset_paged_method(items_a), + "b": make_offset_paged_method(items_b), + } + + config = { + "merger_id": "dedup_overfetch_rewind", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "overfetch_factor": 3, + "max_refill_loops": 20, + "data": { + "merger_id": "pct_mix", + "type": "merger_percentage", + "shuffle": False, + "items": [ + { + "percentage": 50, + "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a", "dedup_priority": 100}, + }, + { + "percentage": 50, + "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b", "dedup_priority": 0}, + }, + ], + }, + } + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=10, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res.data) == 10 + _assert_no_dupes_in_page(res.data) + + # A provides 1..5, B must provide 6..10. + winning = {item["id"]: item["src"] for item in res.data} + assert all(winning[i] == "A" for i in range(1, 6)) + assert all(winning[i] == "B" for i in range(6, 11)) + + # Cursor rewind check: + # - First loop for B reads 5 duplicates -> after becomes 5 + # - Second loop overfetches, but must rewind to inspected 5 more -> after should end at 10 + assert res.next_page.data["sf_b"].after == 10 + + @pytest.mark.asyncio async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_refill() -> None: """Distribute: duplicates skipped per-leaf and page slices don't overlap.""" From c6184dbc3e74ba3dfa98fa79ea8763ecbaf7a33c Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 23:46:58 +0000 Subject: [PATCH 07/33] Minor refactor. 
--- smartfeed/schemas.py | 465 +++++++++++++++++++++++++++++-------------- 1 file changed, 316 insertions(+), 149 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index b1e8900..16fe0cc 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1,4 +1,5 @@ import base64 +from dataclasses import dataclass import inspect import json import logging @@ -9,10 +10,99 @@ from typing import Annotated, Any, Callable, Dict, List, Literal, Optional, Union, no_type_check import redis -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, PrivateAttr, model_validator from redis.asyncio import Redis as AsyncRedis from redis.asyncio import RedisCluster as AsyncRedisCluster + +def _pydantic_deep_copy(model: Any) -> Any: + """Deep copy helper compatible with Pydantic v1 and v2.""" + + if hasattr(model, "model_copy"): + return model.model_copy(deep=True) # type: ignore[attr-defined] + return model.copy(deep=True) + + +class _DedupState(ABC): + @abstractmethod + def should_accept(self, key: str, priority: int) -> bool: + raise NotImplementedError + + @abstractmethod + def record(self, key: str, priority: int) -> None: + raise NotImplementedError + + async def prefetch(self, keys: List[str]) -> None: + return + + +@dataclass +class _CursorDedupState(_DedupState): + seen_priority_map: Dict[str, int] + seen_updates_in_order: List[tuple[str, int]] + seen_request_set: set[str] + + def should_accept(self, key: str, priority: int) -> bool: + if key in self.seen_request_set: + return False + existing_priority = self.seen_priority_map.get(key) + if existing_priority is not None and priority <= existing_priority: + return False + return True + + def record(self, key: str, priority: int) -> None: + self.seen_priority_map[key] = priority + self.seen_updates_in_order.append((key, priority)) + self.seen_request_set.add(key) + + +@dataclass +class _RedisDedupState(_DedupState): + redis_client: Union[redis.Redis, AsyncRedis] + redis_state_key: str + redis_seen_cache: Dict[str, Optional[int]] + redis_new_scores: Dict[str, int] + seen_request_set: set[str] + zmscore: Callable[[Union[redis.Redis, AsyncRedis], str, List[str]], Any] + + async def prefetch(self, keys: List[str]) -> None: + if not keys: + return + unique: List[str] = [] + seen: set[str] = set() + for k in keys: + if k in self.seen_request_set: + continue + if k in self.redis_seen_cache: + continue + if k in seen: + continue + seen.add(k) + unique.append(k) + + if not unique: + return + + scores = self.zmscore(self.redis_client, self.redis_state_key, unique) + if inspect.iscoroutine(scores): + scores = await scores + + for k, s in zip(unique, scores): + self.redis_seen_cache[k] = None if s is None else int(s) + + def should_accept(self, key: str, priority: int) -> bool: + if key in self.seen_request_set: + return False + existing_priority = self.redis_seen_cache.get(key) + if existing_priority is not None and priority <= existing_priority: + return False + return True + + def record(self, key: str, priority: int) -> None: + self.seen_request_set.add(key) + self.redis_seen_cache[key] = priority + self.redis_new_scores[key] = max(self.redis_new_scores.get(key, 0), priority) + FeedTypes = Annotated[ Union[ "MergerDeduplication", @@ -1154,6 +1244,8 @@ class MergerDeduplication(BaseFeedConfigModel): max_refill_loops: int = 20 + _descendant_cursor_keys_cache: Optional[set[str]] = PrivateAttr(default=None) + @model_validator(mode="after") def validate_merger_deduplication(self) -> "MergerDeduplication": 
if self.overfetch_factor < 1: @@ -1199,6 +1291,18 @@ def _collect_descendant_cursor_keys(self, feed: BaseFeedConfigModel) -> set[str] return keys + def _get_descendant_cursor_keys_cached(self) -> set[str]: + cached = self._descendant_cursor_keys_cache + if cached is None: + cached = self._collect_descendant_cursor_keys(self.data) + self._descendant_cursor_keys_cache = cached + return cached + + def _reset_descendant_cursors(self, next_page: FeedResultNextPage) -> None: + descendant_keys = self._get_descendant_cursor_keys_cached() + for key in descendant_keys: + next_page.data.pop(key, None) + def _normalize_key(self, value: Any) -> str: if isinstance(value, (str, int)): return str(value) @@ -1221,6 +1325,161 @@ def _extract_dedup_value(self, item: Any) -> Any: ) return value + def _get_entity_key(self, entity: Any) -> Optional[str]: + """Return normalized dedup key for entity, or None if entity should be skipped.""" + + raw_value = self._extract_dedup_value(entity) + if raw_value is None: + if self.missing_key_policy == "drop": + return None + if self.missing_key_policy == "keep": + raw_value = ("__missing__", id(entity)) + return self._normalize_key(raw_value) + + def _compute_overfetch_params(self, *, remaining: int, next_after: Any) -> tuple[bool, int, Optional[int]]: + """Compute safe overfetch params. + + Overfetch is only safe when `after` is an integer offset (so we can rewind). + + Returns: (can_overfetch, request_limit, start_after) + """ + + can_overfetch = isinstance(next_after, int) + request_limit = max(1, remaining) + if can_overfetch and self.overfetch_factor > 1: + request_limit = max(1, remaining * self.overfetch_factor) + start_after: Optional[int] = int(next_after) if can_overfetch else None + return can_overfetch, request_limit, start_after + + def _iter_subfeeds(self, feed: BaseFeedConfigModel): + if isinstance(feed, SubFeed): + yield feed + return + + for attr_name in ("data", "positional", "default"): + inner = getattr(feed, attr_name, None) + if isinstance(inner, BaseFeedConfigModel): + yield from self._iter_subfeeds(inner) + + for attr_name in ("item_from", "item_to"): + wrapper = getattr(feed, attr_name, None) + inner = getattr(wrapper, "data", None) + if isinstance(inner, BaseFeedConfigModel): + yield from self._iter_subfeeds(inner) + + items = getattr(feed, "items", None) + if isinstance(items, list): + for item in items: + if isinstance(item, BaseFeedConfigModel): + yield from self._iter_subfeeds(item) + continue + inner = getattr(item, "data", None) + if isinstance(inner, BaseFeedConfigModel): + yield from self._iter_subfeeds(inner) + + def _register_wrapped_subfeed_method( + self, + *, + subfeed: "SubFeed", + original_methods_dict: Dict[str, Callable], + rewritten_methods_dict: Dict[str, Callable], + dedup_state: "_DedupState", + ) -> None: + original_name = subfeed.method_name + original_method = original_methods_dict[original_name] + unique_name = f"__dedup__{self.merger_id}__{subfeed.subfeed_id}" + + # Idempotency: if the same subfeed id appears multiple times, don't re-wrap. 
+ if unique_name in rewritten_methods_dict: + subfeed.method_name = unique_name + return + + subfeed.method_name = unique_name + leaf_priority = int(getattr(subfeed, "dedup_priority", 0)) + + wrapped = self._make_wrapped_leaf_method( + original_method=original_method, + dedup_state=dedup_state, + leaf_priority=leaf_priority, + ) + setattr(wrapped, "_smartfeed_original", original_method) + rewritten_methods_dict[unique_name] = wrapped + + def _make_wrapped_leaf_method( + self, + *, + original_method: Callable, + dedup_state: "_DedupState", + leaf_priority: int, + ) -> Callable: + async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPageInside, **kw: Any): + collected: List[Any] = [] + upstream_has_next_page = False + + loops = 0 + while len(collected) < limit and loops < self.max_refill_loops: + loops += 1 + before_len = len(collected) + + remaining = limit - len(collected) + can_overfetch, request_limit, start_after = self._compute_overfetch_params( + remaining=remaining, + next_after=next_page.after, + ) + + method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) + if not isinstance(method_result, FeedResultClient): + raise TypeError('SubFeed function must return "FeedResultClient" instance.') + + upstream_has_next_page = upstream_has_next_page or method_result.has_next_page + + inspected_count = 0 + + # Backend-specific optimization: Redis batches zmscore. + # For cursor backend, prefetch is a no-op and we avoid the extra pass entirely. + keys_by_index: Optional[List[Optional[str]]] = None + if isinstance(dedup_state, _RedisDedupState): + keys_by_index = [] + batch_keys: List[str] = [] + for entity in method_result.data: + key = self._get_entity_key(entity) + keys_by_index.append(key) + if key is not None: + batch_keys.append(key) + await dedup_state.prefetch(batch_keys) + + for idx, entity in enumerate(method_result.data, start=1): + inspected_count = idx + + key = keys_by_index[idx - 1] if keys_by_index is not None else self._get_entity_key(entity) + if key is None: + continue + + if not dedup_state.should_accept(key, leaf_priority): + continue + + collected.append(entity) + dedup_state.record(key, leaf_priority) + + if len(collected) >= limit: + break + + if len(collected) == before_len: + # No progress this loop. Stop if upstream is exhausted. + if not method_result.has_next_page: + break + + # If we oversampled with a simple integer cursor, rewind to the point we actually consumed. + # This prevents skipping un-inspected items that were fetched but not needed. 
+ if can_overfetch and request_limit > remaining and start_after is not None: + end_after = next_page.after + if isinstance(end_after, int) and end_after == start_after + len(method_result.data): + next_page.after = start_after + inspected_count + + return FeedResultClient(data=collected, next_page=next_page, has_next_page=upstream_has_next_page) + + return _wrapped_method + def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> Dict[str, int]: entry = next_page.data.get(self.merger_id) if not entry or entry.after is None: @@ -1268,11 +1527,34 @@ def _encode_seen_for_cursor(self, seen_updates_in_order: List[tuple[str, int]]) "z": base64.urlsafe_b64encode(compressed).decode(), } - async def _redis_zscore(self, redis_client: Union[redis.Redis, AsyncRedis], key: str, member: str) -> Optional[float]: - res = redis_client.zscore(key, member) + async def _redis_zmscore( + self, + redis_client: Union[redis.Redis, AsyncRedis], + key: str, + members: List[str], + ) -> List[Optional[float]]: + """Batch zscore for multiple members. + + Falls back to pipelined zscore when zmscore isn't available. + """ + + if not members: + return [] + + if hasattr(redis_client, "zmscore"): + res = redis_client.zmscore(key, members) # type: ignore[attr-defined] + if inspect.iscoroutine(res): + res = await res + # redis-py returns list[Optional[float]] + return [None if v is None else float(v) for v in list(res)] + + pipe = redis_client.pipeline() + for m in members: + pipe.zscore(key, m) + res = pipe.execute() if inspect.iscoroutine(res): res = await res - return None if res is None else float(res) + return [None if v is None else float(v) for v in list(res)] async def _redis_zadd_and_expire( self, @@ -1315,16 +1597,11 @@ async def get_data( if self.state_backend == "redis" and not redis_client: raise ValueError("Redis client must be provided if using MergerDeduplication with state_backend=redis") - if hasattr(next_page, "model_copy"): - working_next_page = next_page.model_copy(deep=True) # type: ignore[attr-defined] - else: - working_next_page = next_page.copy(deep=True) + working_next_page = _pydantic_deep_copy(next_page) if is_fresh_session: # Reset cursors for all descendants under this merger so upstream nodes also restart. - descendant_keys = self._collect_descendant_cursor_keys(self.data) - for key in descendant_keys: - working_next_page.data.pop(key, None) + self._reset_descendant_cursors(working_next_page) # Shared dedup state (cross-page) seen_priority_map: Dict[str, int] = {} @@ -1336,6 +1613,8 @@ async def get_data( seen_request_set: set[str] = set(seen_priority_map.keys()) redis_state_key = "" + redis_new_scores: Dict[str, int] = {} + redis_seen_cache: Dict[str, Optional[int]] = {} if self.state_backend == "redis" and redis_client: redis_state_key = self._build_redis_state_key(user_id=user_id, params=params) if is_fresh_session: @@ -1344,7 +1623,23 @@ async def get_data( if inspect.iscoroutine(deleted): await deleted - redis_new_scores: Dict[str, int] = {} + # Create a single state helper shared across all leaf wrappers. 
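+        # Cursor backend: seen keys and priorities travel inside this merger's encoded cursor entry.
+        # Redis backend: they live in a ZSET keyed by merger and user and are read in batches via ZMSCORE.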
+ if self.state_backend == "cursor": + dedup_state: _DedupState = _CursorDedupState( + seen_priority_map=seen_priority_map, + seen_updates_in_order=seen_updates_in_order, + seen_request_set=seen_request_set, + ) + else: + assert redis_client is not None + dedup_state = _RedisDedupState( + redis_client=redis_client, + redis_state_key=redis_state_key, + redis_seen_cache=redis_seen_cache, + redis_new_scores=redis_new_scores, + seen_request_set=seen_request_set, + zmscore=self._redis_zmscore, + ) # Preserve inner merger ordering/mixing semantics by deduplicating at the leaf method level # with a shared seen-set. @@ -1353,142 +1648,17 @@ async def get_data( # Create a deep copy of the child tree and rewrite each SubFeed to call a unique wrapper # so we can associate a dedup_priority with each leaf. child = self.data - if hasattr(child, "model_copy"): - child = child.model_copy(deep=True) # type: ignore[attr-defined] - else: - child = child.copy(deep=True) - - def iter_subfeeds(feed: BaseFeedConfigModel) -> List["SubFeed"]: - found: List[SubFeed] = [] - - if isinstance(feed, SubFeed): - found.append(feed) - return found - - for attr_name in ("data", "positional", "default"): - inner = getattr(feed, attr_name, None) - if isinstance(inner, BaseFeedConfigModel): - found.extend(iter_subfeeds(inner)) - - for attr_name in ("item_from", "item_to"): - wrapper = getattr(feed, attr_name, None) - inner = getattr(wrapper, "data", None) - if isinstance(inner, BaseFeedConfigModel): - found.extend(iter_subfeeds(inner)) - - items = getattr(feed, "items", None) - if isinstance(items, list): - for item in items: - if isinstance(item, BaseFeedConfigModel): - found.extend(iter_subfeeds(item)) - continue - inner = getattr(item, "data", None) - if isinstance(inner, BaseFeedConfigModel): - found.extend(iter_subfeeds(inner)) - - return found + child = _pydantic_deep_copy(child) rewritten_methods_dict = dict(original_methods_dict) - def wrap_leaf_method(*, subfeed: "SubFeed") -> None: - original_name = subfeed.method_name - original_method = original_methods_dict[original_name] - unique_name = f"__dedup__{self.merger_id}__{subfeed.subfeed_id}" - # Idempotency: if the same subfeed id appears multiple times, don't re-wrap. - if unique_name in rewritten_methods_dict: - subfeed.method_name = unique_name - return - subfeed.method_name = unique_name - leaf_priority = int(getattr(subfeed, "dedup_priority", 0)) - - async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPageInside, **kw: Any): - collected: List[Any] = [] - local_seen: set[str] = set() - any_has_next_page = False - - loops = 0 - while len(collected) < limit and loops < self.max_refill_loops: - loops += 1 - before_len = len(collected) - - remaining = limit - len(collected) - # Safe oversampling: only when we can rewind integer-offset cursors. - # IMPORTANT: `after` can be many shapes (str/dict/etc) and may start as None. - # We only enable overfetch when `after` is already an int offset. 
- can_overfetch = isinstance(next_page.after, int) - request_limit = max(1, remaining) - if can_overfetch and self.overfetch_factor > 1: - request_limit = max(1, remaining * self.overfetch_factor) - - start_after: Optional[int] = int(next_page.after) if can_overfetch else None - - method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) - if not isinstance(method_result, FeedResultClient): - raise TypeError('SubFeed function must return "FeedResultClient" instance.') - - any_has_next_page = any_has_next_page or method_result.has_next_page - - consumed_in_batch = 0 - - for entity in method_result.data: - consumed_in_batch += 1 - raw_value = self._extract_dedup_value(entity) - if raw_value is None: - if self.missing_key_policy == "drop": - continue - if self.missing_key_policy == "keep": - raw_value = ("__missing__", id(entity)) - - key = self._normalize_key(raw_value) - if key in local_seen: - continue - - if key in seen_request_set: - continue - - if self.state_backend == "cursor": - existing_priority = seen_priority_map.get(key) - if existing_priority is not None and leaf_priority <= existing_priority: - continue - else: - assert redis_client is not None - existing_score = await self._redis_zscore(redis_client, redis_state_key, key) - if existing_score is not None and leaf_priority <= int(existing_score): - continue - - local_seen.add(key) - collected.append(entity) - - seen_request_set.add(key) - - if self.state_backend == "cursor": - seen_priority_map[key] = leaf_priority - seen_updates_in_order.append((key, leaf_priority)) - else: - redis_new_scores[key] = max(redis_new_scores.get(key, 0), leaf_priority) - - if len(collected) >= limit: - break - - if len(collected) == before_len: - # No progress this loop. Stop if upstream is exhausted. - if not method_result.has_next_page: - break - - # If we oversampled with a simple integer cursor, rewind to the point we actually consumed. - # This prevents skipping un-inspected items that were fetched but not needed. 
- if can_overfetch and request_limit > remaining and start_after is not None: - end_after = next_page.after - if isinstance(end_after, int) and end_after == start_after + len(method_result.data): - next_page.after = start_after + consumed_in_batch - - return FeedResultClient(data=collected, next_page=next_page, has_next_page=any_has_next_page) - - setattr(_wrapped_method, "_smartfeed_original", original_method) - rewritten_methods_dict[unique_name] = _wrapped_method - - for sf in iter_subfeeds(child): - wrap_leaf_method(subfeed=sf) + for sf in self._iter_subfeeds(child): + self._register_wrapped_subfeed_method( + subfeed=sf, + original_methods_dict=original_methods_dict, + rewritten_methods_dict=rewritten_methods_dict, + dedup_state=dedup_state, + ) child_result = await child.get_data( methods_dict=rewritten_methods_dict, @@ -1508,10 +1678,7 @@ async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPag if self.state_backend == "cursor": merger_after = self._encode_seen_for_cursor(seen_updates_in_order) - if hasattr(child_result.next_page, "model_copy"): - result_next_page = child_result.next_page.model_copy(deep=True) # type: ignore[attr-defined] - else: - result_next_page = child_result.next_page.copy(deep=True) + result_next_page = _pydantic_deep_copy(child_result.next_page) result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) return FeedResult(data=child_result.data, next_page=result_next_page, has_next_page=child_result.has_next_page) From b0fa3ea4d198ce807fd4df98a4838cf340df1c18 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Tue, 16 Dec 2025 23:54:27 +0000 Subject: [PATCH 08/33] Readme updated. --- README.md | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/README.md b/README.md index fe96da4..a86e8a4 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,9 @@ Python-package для формирования ленты (Feed) из клиен - [Использование](#использование) - [Установка](#установка) - [Формирование конфигурации](#формирование-конфигурации) + - [MergerDeduplication (дедупликация)](#mergerdeduplication-дедупликация) + - [Параметры MergerDeduplication](#параметры-mergerdeduplication) + - [Важные нюансы (сброс, cursor/redis, overfetch)](#важные-нюансы-сброс-cursorredis-overfetch) - [Требования к клиентскому методу](#требования-к-клиентскому-методу) - [Запуск](#запуск) @@ -68,6 +71,115 @@ poetry add git+ssh://git@github.com:epoch8/looky-timeline.git }, ``` +### MergerDeduplication (дедупликация) + +MergerDeduplication — обёртка над одним дочерним узлом (merger или subfeed), которая удаляет дубли по ключу. + +Ключевые свойства реализации: + +- Дедупликация выполняется на уровне листьев (SubFeed), а не пост-обработкой результата мерджера. + Это важно: вложенные мерджеры (positional/percentage/gradient/append/distribute) сохраняют свои правила смешивания. + Если элемент удалён как дубль, MergerDeduplication «дозапросит» следующий элемент из того же источника. +- Состояние «уже видели» может храниться: + - в курсоре (state_backend="cursor") — удобно без Redis, но курсор может расти; + - в Redis (state_backend="redis") — удобно для большого состояния. 
+ +Пример: обернуть существующую конфигурацию фида дедупликацией: + +```json +{ + "version": "1", + "feed": { + "merger_id": "dedup_main", + "type": "merger_deduplication", + "dedup_key": "id", + "missing_key_policy": "error", + "state_backend": "cursor", + "cursor_compress": true, + "cursor_max_keys": 2000, + "overfetch_factor": 2, + "max_refill_loops": 20, + "data": { + "merger_id": "merger_percent", + "type": "merger_percentage", + "items": [ + { + "percentage": 60, + "data": { + "subfeed_id": "sf_posts", + "type": "subfeed", + "method_name": "posts", + "dedup_priority": 10 + } + }, + { + "percentage": 40, + "data": { + "subfeed_id": "sf_ads", + "type": "subfeed", + "method_name": "ads", + "dedup_priority": 0 + } + } + ] + } + } +} +``` + +В примере выше, если `posts` и `ads` отдают объекты с одинаковым `id`, то «побеждает» источник с большим `dedup_priority`. + +### Параметры MergerDeduplication + +Обязательные поля: + +- `merger_id: str` — уникальный ID мерджера. +- `type: "merger_deduplication"` +- `data` — ровно один дочерний узел (subfeed или merger). + +Поля дедупликации: + +- `dedup_key: str | null` — имя ключа/атрибута для поиска дублей. + - если `null`, ключом считается сам объект (подходит, когда объекты уже hashable/строковые). +- `missing_key_policy: "error" | "keep" | "drop"` (default: `"error"`) + - `error`: выбросить ошибку, если у элемента нет `dedup_key`; + - `keep`: сохранить элемент, даже если ключа нет; + - `drop`: выкинуть элемент без ключа. + +Состояние seen (межстраничная дедупликация): + +- `state_backend: "cursor" | "redis"` (default: `"cursor"`) +- `state_ttl_seconds: int` (default: `3600`) — TTL для Redis состояния (только для backend=`redis`). +- `cursor_compress: bool` (default: `true`) — сжимать seen-состояние в cursor backend. +- `cursor_max_keys: int | null` — ограничить размер seen-состояния в cursor backend (полезно для контроля размера курсора). + +Производительность/поведение: + +- `overfetch_factor: int` (default: `1`) — «перезапрос» внутри листьев, чтобы быстрее добрать `limit` без множества рефиллов. +- `max_refill_loops: int` (default: `20`) — верхняя граница количества дозапросов на один лист. + +### Важные нюансы (сброс, cursor/redis, overfetch) + +- Сброс состояния при `page <= 0` или отсутствии курсора для `merger_id`. + - MergerDeduplication воспринимает это как «fresh session» и очищает курсоры всех дочерних узлов. + - Для backend=`redis` дополнительно удаляет ключ состояния в Redis. + +- Если `state_backend="redis"`, нужно передать `redis_client` в `FeedManager`. + - Ключ состояния в Redis строится как `dedup:{merger_id}:{user_id}`. + - Можно добавить суффикс через параметр запроса `custom_deduplication_key` (или `custom_view_session_key`), + чтобы разделять состояния для разных режимов выдачи. + +- Приоритет (`dedup_priority`) — это приоритет победы при конфликте дублей, а не порядок вывода. + - Больше `dedup_priority` → элемент «побеждает» и будет считаться seen с этим приоритетом. + - Это поле доступно у всех узлов (merger/subfeed) и используется MergerDeduplication при дедупликации. + +- overfetch работает безопасно только для «перематываемых» курсоров. + - Сейчас overfetch включается только если `next_page.after` у листа — целочисленный offset. + - Если `after` — строка/словарь/любой другой объект, он считается непрозрачным и overfetch не применяется. + +- Главный реальный bottleneck в дедупликации — не обёртки/копии, а рефиллы. 
+ - Если дублей много и upstream-методы дорогие, стоит аккуратно подобрать `overfetch_factor` и `max_refill_loops`. + ### Требования к клиентскому методу Клиентский метод для получения данных должен обязательно включать в себя следующие параметры: From 8c0091536109e8b22db649ef743f441acabe089a Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Wed, 17 Dec 2025 00:14:40 +0000 Subject: [PATCH 09/33] Linter fixes. --- smartfeed/examples/example_client.py | 5 ++- smartfeed/manager.py | 5 ++- smartfeed/schemas.py | 64 +++++++++++++++++----------- 3 files changed, 46 insertions(+), 28 deletions(-) diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index 8b9d6a7..b1a55e3 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -25,8 +25,9 @@ class TestClientRequest(BaseModel): def validate_next_page(cls, value: Union[str, FeedResultNextPage]) -> Union[str, FeedResultNextPage]: if isinstance(value, str): payload = json.loads(base64.urlsafe_b64decode(value)) - if hasattr(FeedResultNextPage, "model_validate"): - return FeedResultNextPage.model_validate(payload) # type: ignore[attr-defined] + validate = getattr(FeedResultNextPage, "model_validate", None) + if validate is not None: + return validate(payload) return FeedResultNextPage.parse_obj(payload) return value diff --git a/smartfeed/manager.py b/smartfeed/manager.py index 7ac06f9..5ef6eb1 100644 --- a/smartfeed/manager.py +++ b/smartfeed/manager.py @@ -20,8 +20,9 @@ def __init__(self, config: Dict, methods_dict: Dict, redis_client: Optional[Unio :param redis_client: объект клиента Redis (для конфигурации с view_session = True). """ - if hasattr(FeedConfig, "model_validate"): - self.feed_config = FeedConfig.model_validate(config) # type: ignore[attr-defined] + validate = getattr(FeedConfig, "model_validate", None) + if validate is not None: + self.feed_config = validate(config) else: self.feed_config = FeedConfig.parse_obj(config) self.methods_dict = methods_dict diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index 16fe0cc..f818b1b 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -7,7 +7,7 @@ from abc import ABC, abstractmethod from collections import defaultdict, deque from random import shuffle -from typing import Annotated, Any, Callable, Dict, List, Literal, Optional, Union, no_type_check +from typing import Annotated, Any, Awaitable, Callable, Dict, Iterator, List, Literal, Optional, Union, cast, no_type_check import redis from pydantic import BaseModel, Field, PrivateAttr, model_validator @@ -19,7 +19,7 @@ def _pydantic_deep_copy(model: Any) -> Any: """Deep copy helper compatible with Pydantic v1 and v2.""" if hasattr(model, "model_copy"): - return model.model_copy(deep=True) # type: ignore[attr-defined] + return model.model_copy(deep=True) return model.copy(deep=True) @@ -63,7 +63,10 @@ class _RedisDedupState(_DedupState): redis_seen_cache: Dict[str, Optional[int]] redis_new_scores: Dict[str, int] seen_request_set: set[str] - zmscore: Callable[[Union[redis.Redis, AsyncRedis], str, List[str]], Any] + zmscore: Callable[ + [Union[redis.Redis, AsyncRedis], str, List[str]], + Union[Awaitable[List[Optional[float]]], List[Optional[float]]], + ] async def prefetch(self, keys: List[str]) -> None: if not keys: @@ -83,9 +86,11 @@ async def prefetch(self, keys: List[str]) -> None: if not unique: return - scores = self.zmscore(self.redis_client, self.redis_state_key, unique) - if inspect.iscoroutine(scores): - scores = await scores + scores_result = 
self.zmscore(self.redis_client, self.redis_state_key, unique) + if inspect.iscoroutine(scores_result): + scores = await cast(Awaitable[List[Optional[float]]], scores_result) + else: + scores = cast(List[Optional[float]], scores_result) for k, s in zip(unique, scores): self.redis_seen_cache[k] = None if s is None else int(s) @@ -847,8 +852,8 @@ async def get_data( dedup_active = bool(params.pop("_sf_dedup_active", False)) - items_data: List = [None] * len(self.items) - results: List[Optional[FeedResult]] = [None] * len(self.items) + items_data: List[List[Any]] = [[] for _ in self.items] + results: List[Optional[FeedResult]] = [None for _ in self.items] indexed_items = list(enumerate(self.items)) fetch_order = indexed_items @@ -860,7 +865,9 @@ async def get_data( ) for idx, item in fetch_order: - item_result = await item.data.get_data( + item_result = cast( + FeedResult, + await item.data.get_data( methods_dict=methods_dict, user_id=user_id, limit=limit * item.percentage // 100, @@ -868,17 +875,18 @@ async def get_data( redis_client=redis_client, _sf_dedup_active=dedup_active, **params, + ), ) results[idx] = item_result - for idx, item_result in enumerate(results): - assert item_result is not None - items_data[idx] = item_result.data + for idx, result_item in enumerate(results): + assert result_item is not None + items_data[idx] = result_item.data - if not result.has_next_page and item_result.has_next_page: + if not result.has_next_page and result_item.has_next_page: result.has_next_page = True - result.next_page.data.update(item_result.next_page.data) + result.next_page.data.update(result_item.next_page.data) # Добавляем данные позиции к общему результату процентного мерджера. result.data = await self._merge_items_data(items_data=items_data) @@ -1351,7 +1359,7 @@ def _compute_overfetch_params(self, *, remaining: int, next_after: Any) -> tuple start_after: Optional[int] = int(next_after) if can_overfetch else None return can_overfetch, request_limit, start_after - def _iter_subfeeds(self, feed: BaseFeedConfigModel): + def _iter_subfeeds(self, feed: BaseFeedConfigModel) -> Iterator["SubFeed"]: if isinstance(feed, SubFeed): yield feed return @@ -1412,7 +1420,12 @@ def _make_wrapped_leaf_method( dedup_state: "_DedupState", leaf_priority: int, ) -> Callable: - async def _wrapped_method(user_id: Any, limit: int, next_page: FeedResultNextPageInside, **kw: Any): + async def _wrapped_method( + user_id: Any, + limit: int, + next_page: FeedResultNextPageInside, + **kw: Any, + ) -> FeedResultClient: collected: List[Any] = [] upstream_has_next_page = False @@ -1541,8 +1554,9 @@ async def _redis_zmscore( if not members: return [] - if hasattr(redis_client, "zmscore"): - res = redis_client.zmscore(key, members) # type: ignore[attr-defined] + zmscore_fn = getattr(redis_client, "zmscore", None) + if zmscore_fn is not None: + res = zmscore_fn(key, members) if inspect.iscoroutine(res): res = await res # redis-py returns list[Optional[float]] @@ -1567,9 +1581,10 @@ async def _redis_zadd_and_expire( res = redis_client.zadd(key, mapping={m: float(s) for m, s in member_scores.items()}) if inspect.iscoroutine(res): await res - await redis_client.expire(key, self.state_ttl_seconds) - else: - redis_client.expire(key, self.state_ttl_seconds) + + expire_res = redis_client.expire(key, self.state_ttl_seconds) + if inspect.iscoroutine(expire_res): + await expire_res def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: suffix = params.get("custom_deduplication_key") or 
params.get("custom_view_session_key") @@ -1591,7 +1606,8 @@ async def get_data( # Treat an explicit "page 0" (or missing cursor for this merger) as a fresh session. # This allows clients to restart the feed (e.g., full reload) without carrying over seen state. - requested_page = next_page.data.get(self.merger_id).page if self.merger_id in next_page.data else None + entry = next_page.data.get(self.merger_id) + requested_page = entry.page if entry is not None else None is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) if self.state_backend == "redis" and not redis_client: @@ -1792,9 +1808,9 @@ class FeedConfig(BaseModel): # Update Forward Refs def _rebuild_model(model: Any) -> None: if hasattr(model, "model_rebuild"): - model.model_rebuild() # type: ignore[attr-defined] + model.model_rebuild() else: - model.update_forward_refs() # type: ignore[attr-defined] + model.update_forward_refs() _rebuild_model(MergerPositional) From c70c6c4bb24b145e276a1ae114c09793d6475ae3 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Wed, 17 Dec 2025 00:18:21 +0000 Subject: [PATCH 10/33] black fixes. --- smartfeed/schemas.py | 38 +++++++++++------ tests/test_merger_deduplication.py | 35 ++++++++++------ tests/test_parsing_config.py | 2 +- tests/test_redis_live.py | 67 ++++++++++++++++-------------- tests/utils.py | 1 - 5 files changed, 83 insertions(+), 60 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index f818b1b..dc45d0f 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1,13 +1,26 @@ import base64 -from dataclasses import dataclass import inspect import json import logging import zlib from abc import ABC, abstractmethod from collections import defaultdict, deque +from dataclasses import dataclass from random import shuffle -from typing import Annotated, Any, Awaitable, Callable, Dict, Iterator, List, Literal, Optional, Union, cast, no_type_check +from typing import ( + Annotated, + Any, + Awaitable, + Callable, + Dict, + Iterator, + List, + Literal, + Optional, + Union, + cast, + no_type_check, +) import redis from pydantic import BaseModel, Field, PrivateAttr, model_validator @@ -108,6 +121,7 @@ def record(self, key: str, priority: int) -> None: self.redis_seen_cache[key] = priority self.redis_new_scores[key] = max(self.redis_new_scores.get(key, 0), priority) + FeedTypes = Annotated[ Union[ "MergerDeduplication", @@ -868,13 +882,13 @@ async def get_data( item_result = cast( FeedResult, await item.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit * item.percentage // 100, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, + methods_dict=methods_dict, + user_id=user_id, + limit=limit * item.percentage // 100, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, ), ) @@ -1328,9 +1342,7 @@ def _extract_dedup_value(self, item: Any) -> Any: value = getattr(item, self.dedup_key, None) if value is None and self.missing_key_policy == "error": - raise AssertionError( - f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}" - ) + raise AssertionError(f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}") return value def _get_entity_key(self, entity: Any) -> Optional[str]: @@ -1821,4 +1833,4 @@ def _rebuild_model(model: Any) -> None: _rebuild_model(MergerAppendDistribute) _rebuild_model(MergerPercentageGradient) _rebuild_model(MergerViewSession) 
-_rebuild_model(MergerDeduplication) \ No newline at end of file +_rebuild_model(MergerDeduplication) diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 94d122a..9bda09f 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -2,13 +2,7 @@ import pytest -from smartfeed.schemas import ( - FeedResultClient, - FeedResultNextPage, - FeedResultNextPageInside, - MergerDeduplication, -) - +from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside, MergerDeduplication from tests.fixtures.redis import redis_client # noqa: F401 from tests.utils import parse_model @@ -1009,7 +1003,9 @@ async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_ref } merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={})) + res_1 = await merger.get_data( + methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={}) + ) res_2 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=res_1.next_page) assert len(res_1.data) == 10 @@ -1026,7 +1022,7 @@ async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_ref @pytest.mark.asyncio async def test_dedup_percentage_gradient_cursor_backend_across_pages() -> None: a_items = [{"id": i, "src": "A"} for i in range(1, 300)] - b_items = ([{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)]) + b_items = [{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)] methods_dict = { "a": make_offset_paged_method(a_items), @@ -1052,7 +1048,9 @@ async def test_dedup_percentage_gradient_cursor_backend_across_pages() -> None: } merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={})) + res_1 = await merger.get_data( + methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={}) + ) res_2 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=res_1.next_page) _assert_no_dupes_in_page(res_1.data) @@ -1148,8 +1146,19 @@ async def test_dedup_wrapper_with_view_session_merger(redis_client) -> None: "type": "merger_percentage", "shuffle": False, "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low", "dedup_priority": 0}}, - {"percentage": 50, "data": {"subfeed_id": "sf_high", "type": "subfeed", "method_name": "high", "dedup_priority": 100}}, + { + "percentage": 50, + "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low", "dedup_priority": 0}, + }, + { + "percentage": 50, + "data": { + "subfeed_id": "sf_high", + "type": "subfeed", + "method_name": "high", + "dedup_priority": 100, + }, + }, ], }, }, @@ -1290,7 +1299,7 @@ async def test_dedup_percentage_gradient_slot_ownership_cursor_backend() -> None a_items = [{"id": i, "src": "A"} for i in range(1, 300)] # Start with duplicates, then provide unique tail. 
- b_items = ([{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)]) + b_items = [{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)] methods_dict = { "a": make_offset_paged_method(a_items), diff --git a/tests/test_parsing_config.py b/tests/test_parsing_config.py index 66c2387..bf61007 100644 --- a/tests/test_parsing_config.py +++ b/tests/test_parsing_config.py @@ -2,9 +2,9 @@ from smartfeed.manager import FeedManager from smartfeed.schemas import ( - MergerDeduplication, FeedConfig, MergerAppend, + MergerDeduplication, MergerPercentage, MergerPercentageGradient, MergerPercentageItem, diff --git a/tests/test_redis_live.py b/tests/test_redis_live.py index 85effa2..e1dc13d 100644 --- a/tests/test_redis_live.py +++ b/tests/test_redis_live.py @@ -1,9 +1,10 @@ import asyncio import json +import time + import pytest import redis from redis.asyncio import Redis as AsyncRedis -import time from smartfeed.schemas import FeedResultNextPage, MergerViewSession from tests.fixtures.configs import METHODS_DICT @@ -15,21 +16,22 @@ class RedisReplicationSimulator: """ Симулятор задержки репликации Redis для тестирования проблемы кластера. """ + def __init__(self, real_client): self.real_client = real_client self.write_delay = 0.1 # Задержка для имитации репликации self.pending_writes = {} # Ключи которые только что записали - + def exists(self, cache_key): return self.real_client.exists(cache_key) - + def set(self, name, value, ex=None): # Записываем в реальный Redis result = self.real_client.set(name, value, ex=ex) # Помечаем что этот ключ только что записан (имитация репликации) self.pending_writes[name] = time.time() return result - + def get(self, name): # Если ключ только что записан (в течение write_delay секунд), возвращаем None if name in self.pending_writes: @@ -39,7 +41,7 @@ def get(self, name): else: # Задержка прошла, можно удалить из pending del self.pending_writes[name] - + # Обычное чтение из Redis return self.real_client.get(name) @@ -50,24 +52,24 @@ async def test_redis_replication_delay_problem(): Тест для воспроизведения проблемы репликации Redis с использованием RedisReplicationSimulator для имитации задержки. """ - + # Подключаемся к Redis (должен быть запущен локально) try: - real_client = redis.Redis(host='localhost', port=6379, db=0) + real_client = redis.Redis(host="localhost", port=6379, db=0) real_client.ping() # Проверяем соединение except (redis.ConnectionError, redis.ResponseError): pytest.skip("Redis not available for live testing") - + # Очищаем тестовый ключ test_key = "test_merger_view_session_test_user" real_client.delete(test_key) - + # Используем симулятор задержки репликации redis_client = RedisReplicationSimulator(real_client) merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) - + print("\n=== Демонстрация проблемы с задержкой репликации ===") - + try: # Этот вызов должен воспроизвести проблему с оригинальным кодом result = await merger_vs.get_data( @@ -77,17 +79,17 @@ async def test_redis_replication_delay_problem(): user_id="test_user", redis_client=redis_client, ) - + print("✅ Исправление работает! Получили результат без ошибки:") print(f" Данные: {result.data[:5]}... 
(показаны первые 5)") print(f" Размер: {len(result.data)}") print(f" Есть следующая страница: {result.has_next_page}") - + # Проверяем что получили валидные данные assert len(result.data) == 10 assert result.data[0] == "test_user_1" assert result.has_next_page is True - + except TypeError as e: if "the JSON object must be str, bytes or bytearray, not NoneType" in str(e): print("❌ Проблема НЕ исправлена! Все еще получаем TypeError") @@ -95,33 +97,33 @@ async def test_redis_replication_delay_problem(): else: print(f"❓ Неожиданная ошибка: {e}") raise - + finally: # Очистка real_client.delete(test_key) real_client.close() -@pytest.mark.asyncio +@pytest.mark.asyncio async def test_redis_multiple_requests(): """ Тест множественных запросов для проверки стабильности исправления. """ - + try: - real_client = redis.Redis(host='localhost', port=6379, db=0) + real_client = redis.Redis(host="localhost", port=6379, db=0) real_client.ping() except (redis.ConnectionError, redis.ResponseError): pytest.skip("Redis not available for live testing") - + test_key = "test_merger_multiple_test_user" real_client.delete(test_key) - + redis_client = RedisReplicationSimulator(real_client) merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_CONFIG) - + print("\n=== Тест множественных запросов ===") - + try: # Первый запрос - создает кэш result1 = await merger_vs.get_data( @@ -131,33 +133,34 @@ async def test_redis_multiple_requests(): user_id="test_user", redis_client=redis_client, ) - + print(f"Первый запрос: получили {len(result1.data)} элементов") - + # Ждем чтобы задержка репликации прошла await asyncio.sleep(0.2) - - # Второй запрос - должен использовать кэш + + # Второй запрос - должен использовать кэш from smartfeed.schemas import FeedResultNextPageInside + result2 = await merger_vs.get_data( methods_dict=METHODS_DICT, - limit=5, + limit=5, next_page=FeedResultNextPage( data={"merger_view_session_example": FeedResultNextPageInside(page=2, after=None)} ), user_id="test_user", redis_client=redis_client, ) - + print(f"Второй запрос: получили {len(result2.data)} элементов") print(f"Данные второй страницы: {result2.data}") - + # Проверяем что получили разные данные (пагинация работает) assert result1.data != result2.data assert len(result2.data) == 5 - + print("✅ Множественные запросы работают корректно!") - + finally: real_client.delete(test_key) real_client.close() @@ -165,4 +168,4 @@ async def test_redis_multiple_requests(): if __name__ == "__main__": # Для запуска напрямую без pytest - asyncio.run(test_redis_replication_delay_problem()) \ No newline at end of file + asyncio.run(test_redis_replication_delay_problem()) diff --git a/tests/utils.py b/tests/utils.py index f38689f..331f1e7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,7 +2,6 @@ from typing import Any, Dict, Type, TypeVar - T = TypeVar("T") From 43b06c7705b56e01314f4cd743af07666e19c3e1 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Thu, 5 Feb 2026 16:29:59 +0000 Subject: [PATCH 11/33] Minor redis fix. 
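Stop blocking the event loop on sync Redis clients: calls now go through a small
`_redis_call()` helper that awaits `redis.asyncio` clients directly and offloads sync
`redis.Redis` calls via `asyncio.to_thread()` (the pipelined zscore fallback is offloaded
the same way). Minimal usage sketch of the helper added below; the `touch()` function is
illustrative only:

```python
import redis

from smartfeed.schemas import _redis_call  # added by this patch


async def touch(client: redis.Redis, key: str) -> bool:
    # Works the same for sync and asyncio clients.
    await _redis_call(client, "set", key, "1", ex=60)
    return bool(await _redis_call(client, "exists", key))
```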
--- smartfeed/schemas.py | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index dc45d0f..ca8fff6 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1,3 +1,4 @@ +import asyncio import base64 import inspect import json @@ -28,6 +29,23 @@ from redis.asyncio import RedisCluster as AsyncRedisCluster +def _is_async_redis_client(client: Any) -> bool: + return isinstance(client, (AsyncRedis, AsyncRedisCluster)) + + +async def _redis_call(client: Any, method_name: str, *args: Any, **kwargs: Any) -> Any: + """Call a Redis method without blocking the event loop. + + - For `redis.asyncio` clients, calls are awaited directly. + - For sync `redis.Redis`, calls are offloaded via `asyncio.to_thread()`. + """ + + method = getattr(client, method_name) + if _is_async_redis_client(client): + return await method(*args, **kwargs) + return await asyncio.to_thread(method, *args, **kwargs) + + def _pydantic_deep_copy(model: Any) -> Any: """Deep copy helper compatible with Pydantic v1 and v2.""" @@ -287,7 +305,7 @@ async def _set_cache( self, methods_dict: Dict[str, Callable], user_id: Any, - redis_client: redis.Redis, + redis_client: Union[redis.Redis, AsyncRedis], cache_key: str, **params: Any, ) -> List[Any]: @@ -313,7 +331,7 @@ async def _set_cache( data = result.data if self.deduplicate: data = self._dedup_data(data) - redis_client.set(name=cache_key, value=json.dumps(data), ex=self.session_live_time) + await _redis_call(redis_client, "set", cache_key, json.dumps(data), ex=self.session_live_time) return data async def _set_cache_async( @@ -356,7 +374,7 @@ async def _get_cache( user_id: Any, limit: int, next_page: FeedResultNextPage, - redis_client: redis.Redis, + redis_client: Union[redis.Redis, AsyncRedis], **params: Any, ) -> FeedResult: """ @@ -380,7 +398,8 @@ async def _get_cache( logging.info("MergerViewSession cache request for %s", cache_key) # Если кэш не найден или передан пустой курсор пагинации на мерджер, обновляем данные и записываем в кэш. - if not redis_client.exists(cache_key) or self.merger_id not in next_page.data: + cache_exists = bool(await _redis_call(redis_client, "exists", cache_key)) + if not cache_exists or self.merger_id not in next_page.data: logging.info("Cache miss or new session - generating fresh data for %s", cache_key) # Получаем свежие данные и используем их напрямую (избегаем чтение из кэша) session_data = await self._set_cache( @@ -389,7 +408,7 @@ async def _get_cache( else: logging.info("Cache exists - attempting read from Redis for %s", cache_key) # Читаем из кэша только если он уже существовал - cached_data = redis_client.get(name=cache_key) + cached_data = await _redis_call(redis_client, "get", cache_key) if cached_data is None: # Fallback: если кэш пропал, получаем свежие данные logging.info( @@ -1574,6 +1593,17 @@ async def _redis_zmscore( # redis-py returns list[Optional[float]] return [None if v is None else float(v) for v in list(res)] + # Fallback: pipelined zscore. For sync clients, run the whole pipeline in a thread. 
+ if not _is_async_redis_client(redis_client): + def _sync_pipeline_execute() -> Any: + pipe = redis_client.pipeline() + for m in members: + pipe.zscore(key, m) + return pipe.execute() + + res = await asyncio.to_thread(_sync_pipeline_execute) + return [None if v is None else float(v) for v in list(res)] + pipe = redis_client.pipeline() for m in members: pipe.zscore(key, m) @@ -1590,13 +1620,8 @@ async def _redis_zadd_and_expire( ) -> None: if not member_scores: return - res = redis_client.zadd(key, mapping={m: float(s) for m, s in member_scores.items()}) - if inspect.iscoroutine(res): - await res - - expire_res = redis_client.expire(key, self.state_ttl_seconds) - if inspect.iscoroutine(expire_res): - await expire_res + await _redis_call(redis_client, "zadd", key, mapping={m: float(s) for m, s in member_scores.items()}) + await _redis_call(redis_client, "expire", key, self.state_ttl_seconds) def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: suffix = params.get("custom_deduplication_key") or params.get("custom_view_session_key") @@ -1647,9 +1672,7 @@ async def get_data( redis_state_key = self._build_redis_state_key(user_id=user_id, params=params) if is_fresh_session: # Drop state for a full restart. - deleted = redis_client.delete(redis_state_key) - if inspect.iscoroutine(deleted): - await deleted + await _redis_call(redis_client, "delete", redis_state_key) # Create a single state helper shared across all leaf wrappers. if self.state_backend == "cursor": From 0f11803d121b4d54979b94782d0e7f7743340e0a Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Thu, 5 Feb 2026 21:48:53 +0000 Subject: [PATCH 12/33] Minor parrallelism fix. --- smartfeed/schemas.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index ca8fff6..939a504 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -582,19 +582,35 @@ async def get_data( if dedup_active: indexed_items = list(enumerate(self.items)) - fetch_order = sorted(indexed_items, key=lambda p: (getattr(p[1], "dedup_priority", 0), -p[0]), reverse=True) fetched: Dict[int, FeedResult] = {} - for idx, item in fetch_order: - fetched[idx] = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) + # Always-on parallelism in dedup mode: preserve dedup semantics by ensuring + # higher priority is recorded first; fetch same-priority children concurrently. + groups: Dict[int, List[tuple[int, FeedTypes]]] = defaultdict(list) + for idx, item in indexed_items: + prio = int(getattr(item, "dedup_priority", 0)) + groups[prio].append((idx, item)) + + for prio in sorted(groups.keys(), reverse=True): + group = groups[prio] + coros: List[Awaitable[FeedResult]] = [] + order: List[int] = [] + for idx, item in group: + order.append(idx) + coros.append( + item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=_pydantic_deep_copy(next_page), + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + ) + group_results = await asyncio.gather(*coros) + for idx, r in zip(order, group_results): + fetched[idx] = cast(FeedResult, r) for idx, _item in indexed_items: item_result = fetched[idx] From 2086519a1461e55d23d03d2f37f8759b5650d83e Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Thu, 5 Feb 2026 21:54:30 +0000 Subject: [PATCH 13/33] Switched to orjson. 
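Replace stdlib `json` usage with `smartfeed/jsonlib.py`, a thin orjson wrapper that keeps
the stdlib-style call shape used in this package: `dumps()` returns `str` and supports
`default=` / `sort_keys=`, `loads()` accepts str or bytes. Callers only switch the import;
a quick illustration of the behaviour:

```python
from smartfeed import jsonlib as json  # orjson-backed wrapper added by this patch

# orjson emits compact JSON, so there are no spaces after ':' and ','.
payload = json.dumps({"b": 2, "a": 1}, sort_keys=True)
assert payload == '{"a":1,"b":2}'
assert json.loads(payload) == {"a": 1, "b": 2}
```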
--- pyproject.toml | 1 + smartfeed/examples/example_client.py | 2 +- smartfeed/jsonlib.py | 41 ++++++++++++++++++++++++++++ smartfeed/schemas.py | 2 +- 4 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 smartfeed/jsonlib.py diff --git a/pyproject.toml b/pyproject.toml index 108a4c7..2c4d669 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ packages = [ python = ">=3.9" pydantic = ">=1.10.7" redis = ">=4.5.5" +orjson = ">=3.9.0" [tool.poetry.group.dev.dependencies] isort = "^5.12.0" diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index b1a55e3..27ca2fa 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -1,5 +1,5 @@ import base64 -import json +from smartfeed import jsonlib as json from typing import Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_validator diff --git a/smartfeed/jsonlib.py b/smartfeed/jsonlib.py new file mode 100644 index 0000000..401e6b4 --- /dev/null +++ b/smartfeed/jsonlib.py @@ -0,0 +1,41 @@ +from __future__ import annotations + +from typing import Any, Callable, Optional + +import orjson + + +DefaultFn = Callable[[Any], Any] + + +def dumps( + obj: Any, + *, + default: Optional[DefaultFn] = None, + sort_keys: bool = False, +) -> str: + """Serialize *obj* to JSON text using orjson. + + This is a small compatibility layer meant to cover the subset of the stdlib + `json.dumps` API used inside this package. + + Key differences vs `orjson.dumps`: + - Returns `str` (UTF-8) instead of `bytes`. + - Supports `default=` and `sort_keys=`. + """ + + option = 0 + if sort_keys: + option |= orjson.OPT_SORT_KEYS + + return orjson.dumps(obj, default=default, option=option).decode("utf-8") + + +def loads(data: Any) -> Any: + """Deserialize JSON from *data* using orjson. + + Accepts `str`, `bytes`, `bytearray`, or `memoryview` (same spirit as + stdlib `json.loads`). + """ + + return orjson.loads(data) diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index 939a504..ed04203 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1,7 +1,7 @@ import asyncio import base64 import inspect -import json +from . import jsonlib as json import logging import zlib from abc import ABC, abstractmethod From 795c5748f9bb6ec1fac3934b906a3d9e542e7977 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Thu, 5 Feb 2026 22:34:39 +0000 Subject: [PATCH 14/33] Refactor into separate merger modules. 
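Split the monolithic `schemas.py`: shared base models move to `smartfeed/feed_models.py`
and each merger gets its own module under `smartfeed/mergers/`, with `smartfeed.schemas`
re-exporting the classes for backwards compatibility. Assuming those re-exports (as stated
in the new package docstring), both import paths resolve to the same classes:

```python
# Old import path keeps working via the re-exports in smartfeed.schemas...
from smartfeed.schemas import MergerDeduplication

# ...and the same classes are importable from their new modules.
from smartfeed.feed_models import FeedResult, SubFeed
from smartfeed.mergers import MergerDeduplication as MergerDeduplicationDirect

assert MergerDeduplicationDirect is MergerDeduplication
```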
--- smartfeed/examples/example_client.py | 2 +- smartfeed/feed_models.py | 161 ++ smartfeed/jsonlib.py | 1 - smartfeed/mergers/__init__.py | 24 + smartfeed/mergers/append.py | 103 ++ smartfeed/mergers/append_distribute.py | 111 ++ smartfeed/mergers/deduplication.py | 548 ++++++ smartfeed/mergers/percentage.py | 111 ++ smartfeed/mergers/percentage_gradient.py | 164 ++ smartfeed/mergers/positional.py | 135 ++ smartfeed/mergers/view_session.py | 205 +++ smartfeed/schemas.py | 1926 +--------------------- 12 files changed, 1636 insertions(+), 1855 deletions(-) create mode 100644 smartfeed/feed_models.py create mode 100644 smartfeed/mergers/__init__.py create mode 100644 smartfeed/mergers/append.py create mode 100644 smartfeed/mergers/append_distribute.py create mode 100644 smartfeed/mergers/deduplication.py create mode 100644 smartfeed/mergers/percentage.py create mode 100644 smartfeed/mergers/percentage_gradient.py create mode 100644 smartfeed/mergers/positional.py create mode 100644 smartfeed/mergers/view_session.py diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index 27ca2fa..9b48842 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -1,9 +1,9 @@ import base64 -from smartfeed import jsonlib as json from typing import Optional, Union from pydantic import BaseModel, ConfigDict, Field, field_validator +from smartfeed import jsonlib as json from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside diff --git a/smartfeed/feed_models.py b/smartfeed/feed_models.py new file mode 100644 index 0000000..b98f11f --- /dev/null +++ b/smartfeed/feed_models.py @@ -0,0 +1,161 @@ +import asyncio +import inspect +from abc import ABC, abstractmethod +from dataclasses import dataclass +from random import shuffle +from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast + +import redis +from pydantic import BaseModel +from redis.asyncio import Redis as AsyncRedis +from redis.asyncio import RedisCluster as AsyncRedisCluster + + +def _is_async_redis_client(client: Any) -> bool: + return isinstance(client, (AsyncRedis, AsyncRedisCluster)) + + +async def _redis_call(client: Any, method_name: str, *args: Any, **kwargs: Any) -> Any: + """Call a Redis method without blocking the event loop. + + - For `redis.asyncio` clients, calls are awaited directly. + - For sync `redis.Redis`, calls are offloaded via `asyncio.to_thread()`. 
+ """ + + method = getattr(client, method_name) + if _is_async_redis_client(client): + return await method(*args, **kwargs) + return await asyncio.to_thread(method, *args, **kwargs) + + +def _pydantic_deep_copy(model: Any) -> Any: + """Deep copy helper compatible with Pydantic v1 and v2.""" + + if hasattr(model, "model_copy"): + return model.model_copy(deep=True) + return model.copy(deep=True) + + +class FeedResultNextPageInside(BaseModel): + """Cursor model for one feed node.""" + + page: int = 1 + after: Any = None + + +class FeedResultNextPage(BaseModel): + """Cursor model for a whole feed traversal.""" + + data: Dict[str, FeedResultNextPageInside] + + +class FeedResult(BaseModel): + """Normalized output of any feed node `get_data()`.""" + + data: List + next_page: FeedResultNextPage + has_next_page: bool + + +class FeedResultClient(BaseModel): + """Result returned by client subfeed methods.""" + + data: List + next_page: FeedResultNextPageInside + has_next_page: bool + + +class BaseFeedConfigModel(ABC, BaseModel): + """Base class for merger/subfeed config models.""" + + # Higher value means the item should "win" deduplication when duplicates exist. + # This is primarily used by MergerDeduplication and by mergers when a dedup wrapper is active. + dedup_priority: int = 0 + + @abstractmethod + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + """Fetch data according to this node config.""" + + +@dataclass +class _SubFeedMethodSpec: + method: Callable + args: List[str] + + +class SubFeed(BaseFeedConfigModel): + """Leaf node pointing at a client method.""" + + subfeed_id: str + type: Literal["subfeed"] + method_name: str + subfeed_params: Dict[str, Any] = {} + raise_error: Optional[bool] = True + shuffle: bool = False + + def _get_method_spec(self, methods_dict: Dict[str, Callable]) -> _SubFeedMethodSpec: + method = methods_dict[self.method_name] + method_spec = getattr(method, "_smartfeed_original", method) + method_args = inspect.getfullargspec(method_spec).args + return _SubFeedMethodSpec(method=method, args=list(method_args)) + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + subfeed_next_page = FeedResultNextPageInside( + page=next_page.data[self.subfeed_id].page if self.subfeed_id in next_page.data else 1, + after=next_page.data[self.subfeed_id].after if self.subfeed_id in next_page.data else None, + ) + + method_spec = self._get_method_spec(methods_dict) + + method_params: Dict[str, Any] = {} + for arg in method_spec.args: + if arg in params: + method_params[arg] = params[arg] + + try: + method_result = await method_spec.method( + user_id=user_id, + limit=limit, + next_page=subfeed_next_page, + **method_params, + **self.subfeed_params, + ) + except Exception: + if self.raise_error: + raise + + method_result = FeedResultClient( + data=[], + next_page=subfeed_next_page, + has_next_page=False, + ) + + if not isinstance(method_result, FeedResultClient): + raise TypeError('SubFeed function must return "FeedResultClient" instance.') + + if self.shuffle: + shuffle(method_result.data) + + return FeedResult( + data=method_result.data, + next_page=FeedResultNextPage( + data={self.subfeed_id: cast(FeedResultNextPageInside, 
method_result.next_page)} + ), + has_next_page=bool(method_result.has_next_page), + ) diff --git a/smartfeed/jsonlib.py b/smartfeed/jsonlib.py index 401e6b4..9ab650f 100644 --- a/smartfeed/jsonlib.py +++ b/smartfeed/jsonlib.py @@ -4,7 +4,6 @@ import orjson - DefaultFn = Callable[[Any], Any] diff --git a/smartfeed/mergers/__init__.py b/smartfeed/mergers/__init__.py new file mode 100644 index 0000000..24b1c8e --- /dev/null +++ b/smartfeed/mergers/__init__.py @@ -0,0 +1,24 @@ +"""Merger implementations. + +Each merger schema lives in its own module. +`smartfeed.schemas` re-exports these classes for backwards compatibility. +""" + +from .append import MergerAppend +from .append_distribute import MergerAppendDistribute +from .deduplication import MergerDeduplication +from .percentage import MergerPercentage, MergerPercentageItem +from .percentage_gradient import MergerPercentageGradient +from .positional import MergerPositional +from .view_session import MergerViewSession + +__all__ = [ + "MergerAppend", + "MergerAppendDistribute", + "MergerDeduplication", + "MergerPercentage", + "MergerPercentageItem", + "MergerPercentageGradient", + "MergerPositional", + "MergerViewSession", +] diff --git a/smartfeed/mergers/append.py b/smartfeed/mergers/append.py new file mode 100644 index 0000000..bfe0718 --- /dev/null +++ b/smartfeed/mergers/append.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import asyncio +from collections import defaultdict +from random import shuffle +from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast + +import redis +from redis.asyncio import Redis as AsyncRedis + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, _pydantic_deep_copy + +if TYPE_CHECKING: + from ..schemas import FeedTypes + + +class MergerAppend(BaseFeedConfigModel): + """Append merger.""" + + merger_id: str + type: Literal["merger_append"] + items: List[FeedTypes] + shuffle: bool = False + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + dedup_active = bool(params.pop("_sf_dedup_active", False)) + + result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) + + if dedup_active: + indexed_items = list(enumerate(self.items)) + fetched: Dict[int, FeedResult] = {} + + groups: Dict[int, List[tuple[int, "FeedTypes"]]] = defaultdict(list) + for idx, item in indexed_items: + prio = int(getattr(item, "dedup_priority", 0)) + groups[prio].append((idx, item)) + + for prio in sorted(groups.keys(), reverse=True): + group = groups[prio] + coros: List[Awaitable[FeedResult]] = [] + order: List[int] = [] + for idx, item in group: + order.append(idx) + coros.append( + item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=_pydantic_deep_copy(next_page), + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + ) + group_results = await asyncio.gather(*coros) + for idx, r in zip(order, group_results): + fetched[idx] = cast(FeedResult, r) + + for idx, _item in indexed_items: + item_result = fetched[idx] + result.data.extend(item_result.data) + result.next_page.data.update(item_result.next_page.data) + if item_result.has_next_page: + result.has_next_page = True + + if len(result.data) > limit: + result.data = result.data[:limit] + else: + result_limit = limit + for item in self.items: + 
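+                # Plain append path (no dedup wrapper): children are fetched one after
+                # another with the remaining budget, so earlier items fill the page first.
+                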
item_result = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=result_limit, + next_page=next_page, + redis_client=redis_client, + **params, + ) + + result.data.extend(item_result.data) + result_limit -= len(item_result.data) + + if not result.has_next_page and item_result.has_next_page: + result.has_next_page = True + + result.next_page.data.update(item_result.next_page.data) + + if result_limit <= 0: + break + + if self.shuffle: + shuffle(result.data) + + return result diff --git a/smartfeed/mergers/append_distribute.py b/smartfeed/mergers/append_distribute.py new file mode 100644 index 0000000..5f07d66 --- /dev/null +++ b/smartfeed/mergers/append_distribute.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from collections import defaultdict, deque +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union + +import redis +from redis.asyncio import Redis as AsyncRedis +from typing_extensions import no_type_check + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage + +if TYPE_CHECKING: + from ..schemas import FeedTypes + + +class MergerAppendDistribute(BaseFeedConfigModel): + """Merger that uniformly distributes items by a key.""" + + merger_id: str + type: Literal["merger_distribute"] + items: List["FeedTypes"] + distribution_key: str + sorting_key: Optional[str] = None + sorting_desc: bool = False + + @no_type_check + async def _uniform_distribute(self, data: list) -> list: + if self.sorting_key: + data = sorted(data, key=lambda x: x[self.sorting_key], reverse=self.sorting_desc) + + grouped_entries = defaultdict(deque) + for entry in data: + grouped_entries[entry[self.distribution_key]].append(entry) + result = [] + prev_profile_id = None + while any(grouped_entries.values()): + for profile_id in list(grouped_entries.keys()): + if grouped_entries[profile_id]: + if profile_id != prev_profile_id or len(grouped_entries) == 1: + result.append(grouped_entries[profile_id].popleft()) + prev_profile_id = profile_id + if not grouped_entries[profile_id]: + del grouped_entries[profile_id] + else: + del grouped_entries[profile_id] + + return result + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + dedup_active = bool(params.pop("_sf_dedup_active", False)) + + result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) + + if dedup_active: + indexed_items = list(enumerate(self.items)) + fetch_order = sorted(indexed_items, key=lambda p: (getattr(p[1], "dedup_priority", 0), -p[0]), reverse=True) + fetched: Dict[int, FeedResult] = {} + + for idx, item in fetch_order: + fetched[idx] = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + + for idx, _item in indexed_items: + item_result = fetched[idx] + result.data.extend(item_result.data) + result.next_page.data.update(item_result.next_page.data) + if item_result.has_next_page: + result.has_next_page = True + + if len(result.data) > limit: + result.data = result.data[:limit] + else: + result_limit = limit + for item in self.items: + item_result = await item.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=result_limit, + next_page=next_page, + redis_client=redis_client, + **params, + ) + + 
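+                # Collect each child's raw items here; _uniform_distribute() interleaves
+                # them by distribution_key only after the loop completes.
+                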
result.data.extend(item_result.data) + result_limit -= len(item_result.data) + + if not result.has_next_page and item_result.has_next_page: + result.has_next_page = True + + result.next_page.data.update(item_result.next_page.data) + + if result_limit <= 0: + break + + result.data = await self._uniform_distribute(result.data) + return result diff --git a/smartfeed/mergers/deduplication.py b/smartfeed/mergers/deduplication.py new file mode 100644 index 0000000..a9902a5 --- /dev/null +++ b/smartfeed/mergers/deduplication.py @@ -0,0 +1,548 @@ +from __future__ import annotations + +import asyncio +import base64 +import inspect +import zlib +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, Iterator, List, Literal, Optional, Union, cast + +import redis +from pydantic import PrivateAttr, model_validator +from redis.asyncio import Redis as AsyncRedis + +from .. import jsonlib as json +from ..feed_models import ( + BaseFeedConfigModel, + FeedResult, + FeedResultClient, + FeedResultNextPage, + FeedResultNextPageInside, + SubFeed, + _is_async_redis_client, + _pydantic_deep_copy, + _redis_call, +) + +if TYPE_CHECKING: + from ..schemas import FeedTypes + + +class _DedupState(ABC): + @abstractmethod + def should_accept(self, key: str, priority: int) -> bool: + raise NotImplementedError + + @abstractmethod + def record(self, key: str, priority: int) -> None: + raise NotImplementedError + + async def prefetch(self, keys: List[str]) -> None: + return + + +@dataclass +class _CursorDedupState(_DedupState): + seen_priority_map: Dict[str, int] + seen_updates_in_order: List[tuple[str, int]] + seen_request_set: set[str] + + def should_accept(self, key: str, priority: int) -> bool: + if key in self.seen_request_set: + return False + existing_priority = self.seen_priority_map.get(key) + if existing_priority is not None and priority <= existing_priority: + return False + return True + + def record(self, key: str, priority: int) -> None: + self.seen_priority_map[key] = priority + self.seen_updates_in_order.append((key, priority)) + self.seen_request_set.add(key) + + +@dataclass +class _RedisDedupState(_DedupState): + redis_client: Union[redis.Redis, AsyncRedis] + redis_state_key: str + redis_seen_cache: Dict[str, Optional[int]] + redis_new_scores: Dict[str, int] + seen_request_set: set[str] + zmscore: Callable[ + [Union[redis.Redis, AsyncRedis], str, List[str]], + Union[Awaitable[List[Optional[float]]], List[Optional[float]]], + ] + + async def prefetch(self, keys: List[str]) -> None: + if not keys: + return + unique: List[str] = [] + seen: set[str] = set() + for k in keys: + if k in self.seen_request_set: + continue + if k in self.redis_seen_cache: + continue + if k in seen: + continue + seen.add(k) + unique.append(k) + + if not unique: + return + + scores_result = self.zmscore(self.redis_client, self.redis_state_key, unique) + if inspect.iscoroutine(scores_result): + scores = await cast(Awaitable[List[Optional[float]]], scores_result) + else: + scores = cast(List[Optional[float]], scores_result) + + for k, s in zip(unique, scores): + self.redis_seen_cache[k] = None if s is None else int(s) + + def should_accept(self, key: str, priority: int) -> bool: + if key in self.seen_request_set: + return False + existing_priority = self.redis_seen_cache.get(key) + if existing_priority is not None and priority <= existing_priority: + return False + return True + + def record(self, key: str, priority: int) -> None: + 
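+        # Mark the key as used in this request, cache its priority locally, and stage
+        # its score for the batched ZADD flush at the end of get_data().
+        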
self.seen_request_set.add(key) + self.redis_seen_cache[key] = priority + self.redis_new_scores[key] = max(self.redis_new_scores.get(key, 0), priority) + + +class MergerDeduplication(BaseFeedConfigModel): + """Merger that deduplicates while preserving child mixing/position semantics.""" + + merger_id: str + type: Literal["merger_deduplication"] + data: "FeedTypes" + + dedup_key: Optional[str] = None + missing_key_policy: Literal["error", "keep", "drop"] = "error" + + state_backend: Literal["cursor", "redis"] = "cursor" + state_ttl_seconds: int = 3600 + cursor_compress: bool = True + cursor_max_keys: Optional[int] = None + + overfetch_factor: int = 1 + + max_refill_loops: int = 20 + + _descendant_cursor_keys_cache: Optional[set[str]] = PrivateAttr(default=None) + + @model_validator(mode="after") + def validate_merger_deduplication(self) -> "MergerDeduplication": + if self.overfetch_factor < 1: + raise ValueError('"overfetch_factor" must be >= 1') + if self.max_refill_loops < 1: + raise ValueError('"max_refill_loops" must be >= 1') + return self + + def _collect_descendant_cursor_keys(self, feed: BaseFeedConfigModel) -> set[str]: + keys: set[str] = set() + + subfeed_id = getattr(feed, "subfeed_id", None) + if isinstance(subfeed_id, str) and subfeed_id: + keys.add(subfeed_id) + + merger_id = getattr(feed, "merger_id", None) + if isinstance(merger_id, str) and merger_id: + keys.add(merger_id) + + child: Any + for attr_name in ("data", "positional", "default"): + child = getattr(feed, attr_name, None) + if isinstance(child, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(child)) + + for attr_name in ("item_from", "item_to"): + child = getattr(feed, attr_name, None) + inner = getattr(child, "data", None) + if isinstance(inner, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(inner)) + + items = getattr(feed, "items", None) + if isinstance(items, list): + for item in items: + if isinstance(item, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(item)) + continue + + inner = getattr(item, "data", None) + if isinstance(inner, BaseFeedConfigModel): + keys.update(self._collect_descendant_cursor_keys(inner)) + + return keys + + def _get_descendant_cursor_keys_cached(self) -> set[str]: + cached = self._descendant_cursor_keys_cache + if cached is None: + cached = self._collect_descendant_cursor_keys(self.data) + self._descendant_cursor_keys_cache = cached + return cached + + def _reset_descendant_cursors(self, next_page: FeedResultNextPage) -> None: + descendant_keys = self._get_descendant_cursor_keys_cached() + for key in descendant_keys: + next_page.data.pop(key, None) + + def _normalize_key(self, value: Any) -> str: + if isinstance(value, (str, int)): + return str(value) + if isinstance(value, (dict, list)): + return json.dumps(value, sort_keys=True, default=str) + return str(value) + + def _extract_dedup_value(self, item: Any) -> Any: + if not self.dedup_key: + return item + + try: + value = item.get(self.dedup_key) + except AttributeError: + value = getattr(item, self.dedup_key, None) + + if value is None and self.missing_key_policy == "error": + raise AssertionError(f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}") + return value + + def _get_entity_key(self, entity: Any) -> Optional[str]: + raw_value = self._extract_dedup_value(entity) + if raw_value is None: + if self.missing_key_policy == "drop": + return None + if self.missing_key_policy == "keep": + raw_value = ("__missing__", id(entity)) + return 
self._normalize_key(raw_value) + + def _compute_overfetch_params(self, *, remaining: int, next_after: Any) -> tuple[bool, int, Optional[int]]: + can_overfetch = isinstance(next_after, int) + request_limit = max(1, remaining) + if can_overfetch and self.overfetch_factor > 1: + request_limit = max(1, remaining * self.overfetch_factor) + start_after: Optional[int] = int(next_after) if can_overfetch else None + return can_overfetch, request_limit, start_after + + def _iter_subfeeds(self, feed: BaseFeedConfigModel) -> Iterator[SubFeed]: + if isinstance(feed, SubFeed): + yield feed + return + + for attr_name in ("data", "positional", "default"): + inner = getattr(feed, attr_name, None) + if isinstance(inner, BaseFeedConfigModel): + yield from self._iter_subfeeds(inner) + + for attr_name in ("item_from", "item_to"): + wrapper = getattr(feed, attr_name, None) + inner = getattr(wrapper, "data", None) + if isinstance(inner, BaseFeedConfigModel): + yield from self._iter_subfeeds(inner) + + items = getattr(feed, "items", None) + if isinstance(items, list): + for item in items: + if isinstance(item, BaseFeedConfigModel): + yield from self._iter_subfeeds(item) + continue + inner = getattr(item, "data", None) + if isinstance(inner, BaseFeedConfigModel): + yield from self._iter_subfeeds(inner) + + def _register_wrapped_subfeed_method( + self, + *, + subfeed: SubFeed, + original_methods_dict: Dict[str, Callable], + rewritten_methods_dict: Dict[str, Callable], + dedup_state: _DedupState, + ) -> None: + original_name = subfeed.method_name + original_method = original_methods_dict[original_name] + unique_name = f"__dedup__{self.merger_id}__{subfeed.subfeed_id}" + + if unique_name in rewritten_methods_dict: + subfeed.method_name = unique_name + return + + subfeed.method_name = unique_name + leaf_priority = int(getattr(subfeed, "dedup_priority", 0)) + + wrapped = self._make_wrapped_leaf_method( + original_method=original_method, + dedup_state=dedup_state, + leaf_priority=leaf_priority, + ) + setattr(wrapped, "_smartfeed_original", original_method) + rewritten_methods_dict[unique_name] = wrapped + + def _make_wrapped_leaf_method( + self, + *, + original_method: Callable, + dedup_state: _DedupState, + leaf_priority: int, + ) -> Callable: + async def _wrapped_method( + user_id: Any, + limit: int, + next_page: FeedResultNextPageInside, + **kw: Any, + ) -> FeedResultClient: + collected: List[Any] = [] + upstream_has_next_page = False + + loops = 0 + while len(collected) < limit and loops < self.max_refill_loops: + loops += 1 + before_len = len(collected) + + remaining = limit - len(collected) + can_overfetch, request_limit, start_after = self._compute_overfetch_params( + remaining=remaining, + next_after=next_page.after, + ) + + method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) + if not isinstance(method_result, FeedResultClient): + raise TypeError('SubFeed function must return "FeedResultClient" instance.') + + upstream_has_next_page = upstream_has_next_page or method_result.has_next_page + + inspected_count = 0 + + keys_by_index: Optional[List[Optional[str]]] = None + if isinstance(dedup_state, _RedisDedupState): + keys_by_index = [] + batch_keys: List[str] = [] + for entity in method_result.data: + key = self._get_entity_key(entity) + keys_by_index.append(key) + if key is not None: + batch_keys.append(key) + await dedup_state.prefetch(batch_keys) + + for idx, entity in enumerate(method_result.data, start=1): + inspected_count = idx + + key = keys_by_index[idx - 
1] if keys_by_index is not None else self._get_entity_key(entity) + if key is None: + continue + + if not dedup_state.should_accept(key, leaf_priority): + continue + + collected.append(entity) + dedup_state.record(key, leaf_priority) + + if len(collected) >= limit: + break + + if len(collected) == before_len: + if not method_result.has_next_page: + break + + if can_overfetch and request_limit > remaining and start_after is not None: + end_after = next_page.after + if isinstance(end_after, int) and end_after == start_after + len(method_result.data): + next_page.after = start_after + inspected_count + + return FeedResultClient(data=collected, next_page=next_page, has_next_page=upstream_has_next_page) + + return _wrapped_method + + def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> Dict[str, int]: + entry = next_page.data.get(self.merger_id) + if not entry or entry.after is None: + return {} + + after = entry.after + if isinstance(after, dict) and "z" in after: + payload = base64.urlsafe_b64decode(after["z"].encode()) + raw = zlib.decompress(payload).decode() + decoded = json.loads(raw) + if isinstance(decoded, dict): + return {str(k): int(v) for k, v in decoded.items()} + if isinstance(decoded, list): + seen_map: Dict[str, int] = {} + for entry_item in decoded: + if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: + seen_map[str(entry_item[0])] = int(entry_item[1]) + else: + seen_map[str(entry_item)] = 0 + return seen_map + return {} + if isinstance(after, dict) and "seen" in after: + return {str(k): 0 for k in list(after["seen"])} + if isinstance(after, list): + return {str(k): 0 for k in list(after)} + if isinstance(after, dict): + return {str(k): int(v) for k, v in after.items() if k not in {"v", "c", "n"}} + return {} + + def _encode_seen_for_cursor(self, seen_updates_in_order: List[tuple[str, int]]) -> Any: + if self.cursor_max_keys is not None: + seen_updates_in_order = seen_updates_in_order[-self.cursor_max_keys :] + + if not self.cursor_compress: + return {"v": 2, "seen": [[k, p] for k, p in seen_updates_in_order]} + + raw = json.dumps([[k, p] for k, p in seen_updates_in_order]).encode() + compressed = zlib.compress(raw) + return { + "v": 2, + "c": "zlib+base64", + "n": len(seen_updates_in_order), + "z": base64.urlsafe_b64encode(compressed).decode(), + } + + async def _redis_zmscore( + self, + redis_client: Union[redis.Redis, AsyncRedis], + key: str, + members: List[str], + ) -> List[Optional[float]]: + if not members: + return [] + + zmscore_fn = getattr(redis_client, "zmscore", None) + if zmscore_fn is not None: + res = zmscore_fn(key, members) + if inspect.iscoroutine(res): + res = await res + return [None if v is None else float(v) for v in list(res)] + + if not _is_async_redis_client(redis_client): + + def _sync_pipeline_execute() -> Any: + pipe = redis_client.pipeline() + for m in members: + pipe.zscore(key, m) + return pipe.execute() + + res = await asyncio.to_thread(_sync_pipeline_execute) + return [None if v is None else float(v) for v in list(res)] + + pipe = redis_client.pipeline() + for m in members: + pipe.zscore(key, m) + res = pipe.execute() + if inspect.iscoroutine(res): + res = await res + return [None if v is None else float(v) for v in list(res)] + + async def _redis_zadd_and_expire( + self, + redis_client: Union[redis.Redis, AsyncRedis], + key: str, + member_scores: Dict[str, int], + ) -> None: + if not member_scores: + return + await _redis_call(redis_client, "zadd", key, mapping={m: float(s) for m, s in member_scores.items()}) + 
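+        # Refresh the TTL on every write so the deduplication window slides with activity.
+        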
await _redis_call(redis_client, "expire", key, self.state_ttl_seconds) + + def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: + suffix = params.get("custom_deduplication_key") or params.get("custom_view_session_key") + if suffix: + return f"dedup:{self.merger_id}:{user_id}:{suffix}" + return f"dedup:{self.merger_id}:{user_id}" + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + if limit <= 0: + return FeedResult(data=[], next_page=next_page, has_next_page=False) + + entry = next_page.data.get(self.merger_id) + requested_page = entry.page if entry is not None else None + is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) + + if self.state_backend == "redis" and not redis_client: + raise ValueError("Redis client must be provided if using MergerDeduplication with state_backend=redis") + + working_next_page = _pydantic_deep_copy(next_page) + + if is_fresh_session: + self._reset_descendant_cursors(working_next_page) + + seen_priority_map: Dict[str, int] = {} + seen_updates_in_order: List[tuple[str, int]] = [] + if self.state_backend == "cursor" and not is_fresh_session: + seen_priority_map = self._decode_seen_from_cursor(next_page) + + seen_request_set: set[str] = set(seen_priority_map.keys()) + + redis_state_key = "" + redis_new_scores: Dict[str, int] = {} + redis_seen_cache: Dict[str, Optional[int]] = {} + if self.state_backend == "redis" and redis_client: + redis_state_key = self._build_redis_state_key(user_id=user_id, params=params) + if is_fresh_session: + await _redis_call(redis_client, "delete", redis_state_key) + + if self.state_backend == "cursor": + dedup_state: _DedupState = _CursorDedupState( + seen_priority_map=seen_priority_map, + seen_updates_in_order=seen_updates_in_order, + seen_request_set=seen_request_set, + ) + else: + assert redis_client is not None + dedup_state = _RedisDedupState( + redis_client=redis_client, + redis_state_key=redis_state_key, + redis_seen_cache=redis_seen_cache, + redis_new_scores=redis_new_scores, + seen_request_set=seen_request_set, + zmscore=self._redis_zmscore, + ) + + original_methods_dict = methods_dict + + child = _pydantic_deep_copy(self.data) + + rewritten_methods_dict = dict(original_methods_dict) + + for sf in self._iter_subfeeds(child): + self._register_wrapped_subfeed_method( + subfeed=sf, + original_methods_dict=original_methods_dict, + rewritten_methods_dict=rewritten_methods_dict, + dedup_state=dedup_state, + ) + + child_result = await child.get_data( + methods_dict=rewritten_methods_dict, + user_id=user_id, + limit=limit, + next_page=working_next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + + if self.state_backend == "redis" and redis_client: + await self._redis_zadd_and_expire(redis_client, redis_state_key, redis_new_scores) + + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + merger_after: Any = None + if self.state_backend == "cursor": + merger_after = self._encode_seen_for_cursor(seen_updates_in_order) + + result_next_page = _pydantic_deep_copy(child_result.next_page) + result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) + + return FeedResult(data=child_result.data, next_page=result_next_page, has_next_page=child_result.has_next_page) diff --git 
a/smartfeed/mergers/percentage.py b/smartfeed/mergers/percentage.py new file mode 100644 index 0000000..76982fb --- /dev/null +++ b/smartfeed/mergers/percentage.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +from random import shuffle +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union, cast + +import redis +from pydantic import BaseModel +from redis.asyncio import Redis as AsyncRedis + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage + +if TYPE_CHECKING: + from ..schemas import FeedTypes + + +class MergerPercentageItem(BaseModel): + """One percentage slot.""" + + percentage: int + data: FeedTypes + + +class MergerPercentage(BaseFeedConfigModel): + """Percentage-based mixing merger.""" + + merger_id: str + type: Literal["merger_percentage"] + items: List[MergerPercentageItem] + shuffle: bool = False + + @staticmethod + async def _merge_items_data(items_data: List[List]) -> List: + result: List = [] + cursor: List[Dict] = [] + + min_length = min(len(item_data) for item_data in items_data) or 1 + for item_data in items_data: + cursor.append( + { + "items": item_data, + "current": 0, + "size": round(len(item_data) / min_length), + } + ) + + full_length = sum(len(item_data) for item_data in items_data) + while len(result) < full_length: + for item_cursor in cursor: + items = item_cursor["items"] + start = item_cursor["current"] + end = start + item_cursor["size"] if start + item_cursor["size"] < len(items) else len(items) + result.extend(items[start:end]) + item_cursor["current"] = end + + return result + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) + + dedup_active = bool(params.pop("_sf_dedup_active", False)) + + items_data: List[List[Any]] = [[] for _ in self.items] + results: List[Optional[FeedResult]] = [None for _ in self.items] + + indexed_items = list(enumerate(self.items)) + fetch_order = indexed_items + if dedup_active: + fetch_order = sorted( + indexed_items, + key=lambda p: (getattr(p[1].data, "dedup_priority", 0), -p[0]), + reverse=True, + ) + + for idx, item in fetch_order: + item_result = cast( + FeedResult, + await item.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit * item.percentage // 100, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ), + ) + + results[idx] = item_result + + for idx, result_item in enumerate(results): + assert result_item is not None + items_data[idx] = result_item.data + + if not result.has_next_page and result_item.has_next_page: + result.has_next_page = True + result.next_page.data.update(result_item.next_page.data) + + result.data = await self._merge_items_data(items_data=items_data) + + if self.shuffle: + shuffle(result.data) + + return result diff --git a/smartfeed/mergers/percentage_gradient.py b/smartfeed/mergers/percentage_gradient.py new file mode 100644 index 0000000..17ed530 --- /dev/null +++ b/smartfeed/mergers/percentage_gradient.py @@ -0,0 +1,164 @@ +from random import shuffle +from typing import Any, Callable, Dict, Literal, Optional, Union + +import redis +from pydantic import model_validator +from redis.asyncio import Redis as AsyncRedis + +from ..feed_models import BaseFeedConfigModel, FeedResult, 
FeedResultNextPage, FeedResultNextPageInside +from .percentage import MergerPercentageItem + + +class MergerPercentageGradient(BaseFeedConfigModel): + """Percentage-gradient merger.""" + + merger_id: str + type: Literal["merger_percentage_gradient"] + item_from: MergerPercentageItem + item_to: MergerPercentageItem + step: int + size_to_step: int + shuffle: bool = False + + @model_validator(mode="after") + def validate_merger_percentage_gradient(self) -> "MergerPercentageGradient": + if self.step < 1 or self.step > 100: + raise ValueError('"step" must be in range from 1 to 100') + if self.size_to_step < 1: + raise ValueError('"size_to_step" must be bigger than 1') + return self + + async def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: + result: Dict = { + "limit_from": 0, + "limit_to": 0, + "percentages": [], + } + + percentage_from = self.item_from.percentage + percentage_to = self.item_to.percentage + start_position = limit * (page - 1) + first_iter = True + + for i in range(self.size_to_step, limit * page + self.size_to_step, self.size_to_step): + if not first_iter and percentage_to < 100: + percentage_from -= self.step + percentage_to += self.step + + if percentage_to > 100 or percentage_from < 0: + percentage_from = 0 + percentage_to = 100 + + if i > start_position: + iter_limit = (limit * page - start_position) if i > limit * page else (i - start_position) + start_position = i + + if result["percentages"] and result["percentages"][-1]["to"] >= 100: + result["limit_to"] += iter_limit + result["percentages"][-1]["limit"] += iter_limit + else: + result["limit_from"] += iter_limit * percentage_from // 100 + result["limit_to"] += iter_limit * percentage_to // 100 + iter_result = {"limit": iter_limit, "from": percentage_from, "to": percentage_to} + result["percentages"].append(iter_result) + + if first_iter: + first_iter = False + + return result + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + result = FeedResult( + data=[], + next_page=FeedResultNextPage( + data={ + self.merger_id: FeedResultNextPageInside( + page=next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1, + after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, + ) + }, + ), + has_next_page=False, + ) + + limits_and_percents = await self._calculate_limits_and_percents( + page=result.next_page.data[self.merger_id].page, + limit=limit, + ) + + dedup_active = bool(params.pop("_sf_dedup_active", False)) + + from_priority = getattr(self.item_from.data, "dedup_priority", 0) + to_priority = getattr(self.item_to.data, "dedup_priority", 0) + + if dedup_active and to_priority > from_priority: + item_to = await self.item_to.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_to"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + item_from = await self.item_from.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_from"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + else: + item_from = await self.item_from.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_from"], + next_page=next_page, + redis_client=redis_client, + 
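+                # Forward the dedup flag unchanged so nested mergers keep the same mode.
+                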
_sf_dedup_active=dedup_active, + **params, + ) + item_to = await self.item_to.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limits_and_percents["limit_to"], + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) + + from_start_index = 0 + to_start_index = 0 + for lp_data in limits_and_percents["percentages"]: + from_end_index = (lp_data["limit"] * lp_data["from"] // 100) + from_start_index + to_end_index = (lp_data["limit"] * lp_data["to"] // 100) + to_start_index + + result.data.extend(item_from.data[from_start_index:from_end_index]) + result.data.extend(item_to.data[to_start_index:to_end_index]) + + from_start_index = from_end_index + to_start_index = to_end_index + + result.next_page.data.update(item_from.next_page.data) + result.next_page.data.update(item_to.next_page.data) + + if any([item_from.has_next_page, item_to.has_next_page]): + result.has_next_page = True + + if self.shuffle: + shuffle(result.data) + + result.next_page.data[self.merger_id].page += 1 + + return result diff --git a/smartfeed/mergers/positional.py b/smartfeed/mergers/positional.py new file mode 100644 index 0000000..a543ddb --- /dev/null +++ b/smartfeed/mergers/positional.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union + +import redis +from pydantic import model_validator +from redis.asyncio import Redis as AsyncRedis + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside + +if TYPE_CHECKING: + from ..schemas import FeedTypes + + +class MergerPositional(BaseFeedConfigModel): + """Positional merger.""" + + merger_id: str + type: Literal["merger_positional"] + positions: List[int] = [] + start: Optional[int] = None + end: Optional[int] = None + step: Optional[int] = None + positional: FeedTypes + default: FeedTypes + + @model_validator(mode="after") + def validate_merger_positional(self) -> "MergerPositional": + if not self.positions and not all((self.start, self.end, self.step)): + raise ValueError('Either "positions" or "start", "end", and "step" must be provided') + if self.start and self.positions: + if isinstance(self.start, int) and self.start <= max(self.positions): + raise ValueError('"start" must be bigger than maximum value of "positions"') + if isinstance(self.start, int) and isinstance(self.end, int): + if self.end <= self.start: + raise ValueError('"end" must be bigger than "start"') + return self + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + dedup_active = bool(params.pop("_sf_dedup_active", False)) + + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + + positional_has_next_page = True + page_positions: List[int] = [] + available_positions = range((page - 1) * limit, (page * limit) + 1) + for position in self.positions: + if position in available_positions: + page_positions.append(available_positions.index(position)) + + if max(available_positions) >= max(self.positions, default=0): + positional_has_next_page = False + + if self.start is not None and self.end is not None and self.step is not None: + positional_has_next_page = not max(available_positions) >= self.end + + for position in range(self.start, self.end, self.step): + if position in available_positions: 
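+                    # Map the absolute feed position onto an index within the current page window.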
+ page_positions.append(available_positions.index(position)) + + if dedup_active and getattr(self.positional, "dedup_priority", 0) > getattr(self.default, "dedup_priority", 0): + pos_res = await self.positional.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=len(page_positions), + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + default_res = await self.default.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=True, + **params, + ) + else: + default_res = await self.default.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) + pos_res = await self.positional.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=len(page_positions), + next_page=next_page, + redis_client=redis_client, + _sf_dedup_active=dedup_active, + **params, + ) + + result = FeedResult( + data=default_res.data, + next_page=FeedResultNextPage( + data={ + self.merger_id: FeedResultNextPageInside( + page=page, + after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, + ) + }, + ), + has_next_page=default_res.has_next_page, + ) + + if not result.has_next_page and all([positional_has_next_page, pos_res.has_next_page]): + result.has_next_page = True + + result.next_page.data.update(default_res.next_page.data) + result.next_page.data.update(pos_res.next_page.data) + + for i, post in enumerate(pos_res.data): + result.data = result.data[: page_positions[i] - 1] + [post] + result.data[page_positions[i] - 1 :] + + if len(result.data) > limit: + result.data = result.data[:limit] + + result.next_page.data[self.merger_id].page += 1 + + return result diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py new file mode 100644 index 0000000..cd3c2f8 --- /dev/null +++ b/smartfeed/mergers/view_session.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import logging +from random import shuffle +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union + +import redis +from redis.asyncio import Redis as AsyncRedis +from redis.asyncio import RedisCluster as AsyncRedisCluster + +from .. 
import jsonlib as json +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside, _redis_call + +if TYPE_CHECKING: + from ..schemas import FeedTypes + + +class MergerViewSession(BaseFeedConfigModel): + """Merger with view-session caching.""" + + merger_id: str + type: Literal["merger_view_session"] + session_size: int + session_live_time: int + data: "FeedTypes" + deduplicate: bool = False + dedup_key: str = None # type: ignore + shuffle: bool = False + + def _get_dedup_key_or_attr(self, item: Any) -> str: + if not self.dedup_key: + return item + + try: + dedup_value = item.get(self.dedup_key) + except AttributeError: + dedup_value = getattr(item, self.dedup_key, None) + + assert dedup_value is not None, f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}" + return dedup_value + + def _dedup_data(self, data: List[Any]) -> List[Any]: + deduplicated_data = {self._get_dedup_key_or_attr(item): item for item in data} + return list(deduplicated_data.values()) + + async def _set_cache( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + redis_client: Union[redis.Redis, AsyncRedis], + cache_key: str, + **params: Any, + ) -> List[Any]: + result = await self.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=self.session_size, + next_page=FeedResultNextPage(data={}), + **params, + ) + + data = result.data + if self.deduplicate: + data = self._dedup_data(data) + await _redis_call(redis_client, "set", cache_key, json.dumps(data), ex=self.session_live_time) + return data + + async def _set_cache_async( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + redis_client: AsyncRedis, + cache_key: str, + **params: Any, + ) -> List[Any]: + result = await self.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=self.session_size, + next_page=FeedResultNextPage(data={}), + **params, + ) + + data = result.data + if self.deduplicate: + data = self._dedup_data(data) + await redis_client.set(cache_key, json.dumps(data)) + await redis_client.expire(cache_key, self.session_live_time) + return data + + async def _get_cache( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Union[redis.Redis, AsyncRedis], + **params: Any, + ) -> FeedResult: + if session_cache_key := params.get("custom_view_session_key", None): + cache_key = f"{self.merger_id}_{user_id}_{session_cache_key}" + else: + cache_key = f"{self.merger_id}_{user_id}" + + logging.info("MergerViewSession cache request for %s", cache_key) + cache_exists = bool(await _redis_call(redis_client, "exists", cache_key)) + if not cache_exists or self.merger_id not in next_page.data: + logging.info("Cache miss or new session - generating fresh data for %s", cache_key) + session_data = await self._set_cache( + methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + ) + else: + logging.info("Cache exists - attempting read from Redis for %s", cache_key) + cached_data = await _redis_call(redis_client, "get", cache_key) + if cached_data is None: + logging.info( + "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key + ) + session_data = await self._set_cache( + methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + ) + else: + logging.info("Successfully read cached data for %s", cache_key) + session_data = json.loads(cached_data) + + page = 
next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + return FeedResult( + data=session_data[(page - 1) * limit :][:limit], + next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), + has_next_page=bool(len(session_data) > limit * page), + ) + + async def _get_cache_async( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: AsyncRedis, + **params: Any, + ) -> FeedResult: + if session_cache_key := params.get("custom_view_session_key", None): + cache_key = f"{self.merger_id}_{user_id}_{session_cache_key}" + else: + cache_key = f"{self.merger_id}_{user_id}" + + if not await redis_client.exists(cache_key) or self.merger_id not in next_page.data: + session_data = await self._set_cache_async( + methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + ) + else: + cached_data = await redis_client.get(cache_key) + if cached_data is None: + logging.info( + "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key + ) + session_data = await self._set_cache_async( + methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + ) + else: + logging.info("Successfully read cached data for %s", cache_key) + session_data = json.loads(cached_data) + + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + return FeedResult( + data=session_data[(page - 1) * limit :][:limit], + next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), + has_next_page=bool(len(session_data) > limit * page), + ) + + async def get_data( + self, + methods_dict: Dict[str, Callable], + user_id: Any, + limit: int, + next_page: FeedResultNextPage, + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + **params: Any, + ) -> FeedResult: + if not redis_client: + raise ValueError("Redis client must be provided if using Merger View Session") + + if isinstance(redis_client, (AsyncRedis, AsyncRedisCluster)): + result = await self._get_cache_async( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + **params, + ) + else: + result = await self._get_cache( + methods_dict=methods_dict, + user_id=user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + **params, + ) + + if self.shuffle: + shuffle(result.data) + + return result diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index ed04203..9863bb4 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -1,1875 +1,95 @@ -import asyncio -import base64 -import inspect -from . 
import jsonlib as json -import logging -import zlib -from abc import ABC, abstractmethod -from collections import defaultdict, deque -from dataclasses import dataclass -from random import shuffle -from typing import ( - Annotated, - Any, - Awaitable, - Callable, - Dict, - Iterator, - List, - Literal, - Optional, - Union, - cast, - no_type_check, -) - -import redis -from pydantic import BaseModel, Field, PrivateAttr, model_validator -from redis.asyncio import Redis as AsyncRedis -from redis.asyncio import RedisCluster as AsyncRedisCluster - - -def _is_async_redis_client(client: Any) -> bool: - return isinstance(client, (AsyncRedis, AsyncRedisCluster)) - - -async def _redis_call(client: Any, method_name: str, *args: Any, **kwargs: Any) -> Any: - """Call a Redis method without blocking the event loop. - - - For `redis.asyncio` clients, calls are awaited directly. - - For sync `redis.Redis`, calls are offloaded via `asyncio.to_thread()`. - """ - - method = getattr(client, method_name) - if _is_async_redis_client(client): - return await method(*args, **kwargs) - return await asyncio.to_thread(method, *args, **kwargs) - - -def _pydantic_deep_copy(model: Any) -> Any: - """Deep copy helper compatible with Pydantic v1 and v2.""" - - if hasattr(model, "model_copy"): - return model.model_copy(deep=True) - return model.copy(deep=True) - - -class _DedupState(ABC): - @abstractmethod - def should_accept(self, key: str, priority: int) -> bool: - raise NotImplementedError - - @abstractmethod - def record(self, key: str, priority: int) -> None: - raise NotImplementedError - - async def prefetch(self, keys: List[str]) -> None: - return - - -@dataclass -class _CursorDedupState(_DedupState): - seen_priority_map: Dict[str, int] - seen_updates_in_order: List[tuple[str, int]] - seen_request_set: set[str] - - def should_accept(self, key: str, priority: int) -> bool: - if key in self.seen_request_set: - return False - existing_priority = self.seen_priority_map.get(key) - if existing_priority is not None and priority <= existing_priority: - return False - return True - - def record(self, key: str, priority: int) -> None: - self.seen_priority_map[key] = priority - self.seen_updates_in_order.append((key, priority)) - self.seen_request_set.add(key) +"""Public schema surface. +This module keeps the public import path (`smartfeed.schemas`) stable while +moving merger implementations into `smartfeed.mergers.*`. 
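+
+Both import paths resolve to the same classes, e.g. (illustrative):
+
+    from smartfeed.schemas import MergerAppend   # historical import path
+    from smartfeed.mergers import MergerAppend   # new module layout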
+""" -@dataclass -class _RedisDedupState(_DedupState): - redis_client: Union[redis.Redis, AsyncRedis] - redis_state_key: str - redis_seen_cache: Dict[str, Optional[int]] - redis_new_scores: Dict[str, int] - seen_request_set: set[str] - zmscore: Callable[ - [Union[redis.Redis, AsyncRedis], str, List[str]], - Union[Awaitable[List[Optional[float]]], List[Optional[float]]], - ] +from __future__ import annotations - async def prefetch(self, keys: List[str]) -> None: - if not keys: - return - unique: List[str] = [] - seen: set[str] = set() - for k in keys: - if k in self.seen_request_set: - continue - if k in self.redis_seen_cache: - continue - if k in seen: - continue - seen.add(k) - unique.append(k) +from typing import Annotated, Any, Dict, Union - if not unique: - return - - scores_result = self.zmscore(self.redis_client, self.redis_state_key, unique) - if inspect.iscoroutine(scores_result): - scores = await cast(Awaitable[List[Optional[float]]], scores_result) - else: - scores = cast(List[Optional[float]], scores_result) - - for k, s in zip(unique, scores): - self.redis_seen_cache[k] = None if s is None else int(s) - - def should_accept(self, key: str, priority: int) -> bool: - if key in self.seen_request_set: - return False - existing_priority = self.redis_seen_cache.get(key) - if existing_priority is not None and priority <= existing_priority: - return False - return True - - def record(self, key: str, priority: int) -> None: - self.seen_request_set.add(key) - self.redis_seen_cache[key] = priority - self.redis_new_scores[key] = max(self.redis_new_scores.get(key, 0), priority) +from pydantic import BaseModel, Field +from .feed_models import ( + BaseFeedConfigModel, + FeedResult, + FeedResultClient, + FeedResultNextPage, + FeedResultNextPageInside, + SubFeed, +) +from .mergers import ( + MergerAppend, + MergerAppendDistribute, + MergerDeduplication, + MergerPercentage, + MergerPercentageGradient, + MergerPercentageItem, + MergerPositional, + MergerViewSession, +) FeedTypes = Annotated[ Union[ - "MergerDeduplication", - "MergerAppend", - "MergerAppendDistribute", - "MergerPositional", - "MergerPercentage", - "MergerPercentageGradient", - "MergerViewSession", - "SubFeed", + MergerDeduplication, + MergerAppend, + MergerAppendDistribute, + MergerPositional, + MergerPercentage, + MergerPercentageGradient, + MergerViewSession, + SubFeed, ], Field(discriminator="type"), ] -class FeedResultNextPageInside(BaseModel): - """ - Модель данных курсора пагинации конкретной позиции. - - Attributes: - page порядковый номер страницы. - after данные для пагинации клиентского метода. - """ - - page: int = 1 - after: Any = None - - -class FeedResultNextPage(BaseModel): - """ - Модель курсора пагинации. - - Attributes: - data словарь вида "ключ: данные по пагинации", где ключ - subfeed_id или merger_id. - """ - - data: Dict[str, FeedResultNextPageInside] - - -class FeedResult(BaseModel): - """ - Модель результата метода get_data() любой позиции / целого фида. - - Attributes: - data список данных, возвращенных мерджером / субфидом. - next_page курсор пагинации. - has_next_page флаг наличия следующей страницы данных. - """ - - data: List - next_page: FeedResultNextPage - has_next_page: bool - - -class FeedResultClient(BaseModel): - """ - Модель результата клиентского метода субфида. - - Attributes: - data список данных, возвращенных мерджером / субфидом. - next_page курсор пагинации клиентского метода. - has_next_page флаг наличия следующей страницы данных. 
- """ - - data: List - next_page: FeedResultNextPageInside - has_next_page: bool - - -class BaseFeedConfigModel(ABC, BaseModel): - """ - Абстрактный класс для мерджера / субфида конфигурации. - """ - - # Higher value means the item should "win" deduplication when duplicates exist. - # This is primarily used by MergerDeduplication and by mergers when a dedup wrapper is active. - dedup_priority: int = 0 - - @abstractmethod - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: параметры для метода. - :return: список данных. - """ - - -class MergerViewSession(BaseFeedConfigModel): - """ - Модель мерджера с кэшированием. - - Attributes: - merger_id уникальный ID мерджера. - type тип объекта - всегда "merger_view_session". - view_session флаг использования механизма расчета всего фида сразу и сохранения в кэш. - session_size размер кэшируемого фида (limit получения данных для сохранения в кэш). - session_live_time срок хранения в кэше для кэшируемого фида (в секундах). - data мерджер или субфид. - deduplicate флаг дедупликации (удаления дублей из сессии). - dedup_key название ключа или атрибута, по которому логика дедпликации найдет дубли. - shuffle флаг для перемешивания полученных данных мерджера. - """ - - merger_id: str - type: Literal["merger_view_session"] - session_size: int - session_live_time: int - data: FeedTypes - deduplicate: bool = False - dedup_key: str = None # type: ignore - shuffle: bool = False - - def _get_dedup_key_or_attr(self, item: Any) -> str: - """ - Метод для получения ключа объекта кешируемой сессии. - - Если указанное в конфиге сессии название ключа имеет значение None, - в качестве ключа вернется сам объект. - Если название ключа не None, и для одного из объектов ни найден ни ключ, ни атрибут, - метод выбросит AssertionError. - - :param item: объект, для которого нужен ключ. - :return: ключ объекта. - """ - - if not self.dedup_key: - return item - - try: - dedup_value = item.get(self.dedup_key) - except AttributeError: - dedup_value = getattr(item, self.dedup_key, None) - - assert dedup_value is not None, f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}" - return dedup_value - - def _dedup_data(self, data: List[Any]) -> List[Any]: - """ - Метод для удаления дублей в списке data с сохранением последовательности. - - :param data: список, в котором нужно удалить дубли. - :return: результат удаления дублей. - """ - - deduplicated_data = {self._get_dedup_key_or_attr(item): item for item in data} - return list(deduplicated_data.values()) - - async def _set_cache( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - redis_client: Union[redis.Redis, AsyncRedis], - cache_key: str, - **params: Any, - ) -> List[Any]: - """ - Метод для кэширования данных Merger View Session. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param redis_client: объект клиента Redis. - :param cache_key: ключ для кэширования. 
- :param params: любые внешние параметры, передаваемые в исполняемую функцию на клиентской стороне. - :return: обработанные данные, которые были записаны в кэш. - """ - - result = await self.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=self.session_size, - next_page=FeedResultNextPage(data={}), - **params, - ) - - data = result.data - if self.deduplicate: - data = self._dedup_data(data) - await _redis_call(redis_client, "set", cache_key, json.dumps(data), ex=self.session_live_time) - return data - - async def _set_cache_async( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - redis_client: AsyncRedis, - cache_key: str, - **params: Any, - ) -> List[Any]: - """ - Метод для кэширования данных Merger View Session. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param redis_client: объект клиента Redis. - :param cache_key: ключ для кэширования. - :param params: любые внешние параметры, передаваемые в исполняемую функцию на клиентской стороне. - :return: обработанные данные, которые были записаны в кэш. - """ - - result = await self.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=self.session_size, - next_page=FeedResultNextPage(data={}), - **params, - ) - - data = result.data - if self.deduplicate: - data = self._dedup_data(data) - await redis_client.set(cache_key, json.dumps(data)) - await redis_client.expire(cache_key, self.session_live_time) - return data - - async def _get_cache( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Union[redis.Redis, AsyncRedis], - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных Merger View Session из кэша Redis. - При отсутствии данных в кэше - получить и сохранить. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: лимит на выдачу данных. - :param next_page: курсор для пагинации в формате SmartFeedResultNextPage. - :param redis_client: объект клиента Redis. - :param params: любые внешние параметры, передаваемые в исполняемую функцию на клиентской стороне. - :return: результат получения данных согласно конфигурации фида. - """ - - # Формируем ключ для кэширования данных мерджера. - if session_cache_key := params.get("custom_view_session_key", None): - cache_key = f"{self.merger_id}_{user_id}_{session_cache_key}" - else: - cache_key = f"{self.merger_id}_{user_id}" - - logging.info("MergerViewSession cache request for %s", cache_key) - # Если кэш не найден или передан пустой курсор пагинации на мерджер, обновляем данные и записываем в кэш. 
- cache_exists = bool(await _redis_call(redis_client, "exists", cache_key)) - if not cache_exists or self.merger_id not in next_page.data: - logging.info("Cache miss or new session - generating fresh data for %s", cache_key) - # Получаем свежие данные и используем их напрямую (избегаем чтение из кэша) - session_data = await self._set_cache( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params - ) - else: - logging.info("Cache exists - attempting read from Redis for %s", cache_key) - # Читаем из кэша только если он уже существовал - cached_data = await _redis_call(redis_client, "get", cache_key) - if cached_data is None: - # Fallback: если кэш пропал, получаем свежие данные - logging.info( - "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key - ) - session_data = await self._set_cache( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params - ) - else: - logging.info("Successfully read cached data for %s", cache_key) - session_data = json.loads(cached_data) - page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - result = FeedResult( - data=session_data[(page - 1) * limit :][:limit], - next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), - has_next_page=bool(len(session_data) > limit * page), - ) - return result - - async def _get_cache_async( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: AsyncRedis, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных Merger View Session из кэша Redis. - При отсутствии данных в кэше - получить и сохранить. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: лимит на выдачу данных. - :param next_page: курсор для пагинации в формате SmartFeedResultNextPage. - :param redis_client: объект клиента Redis. - :param params: любые внешние параметры, передаваемые в исполняемую функцию на клиентской стороне. - :return: результат получения данных согласно конфигурации фида. - """ - - # Формируем ключ для кэширования данных мерджера. - if session_cache_key := params.get("custom_view_session_key", None): - cache_key = f"{self.merger_id}_{user_id}_{session_cache_key}" - else: - cache_key = f"{self.merger_id}_{user_id}" - - # Если кэш не найден или передан пустой курсор пагинации на мерджер, обновляем данные и записываем в кэш. 
- if not await redis_client.exists(cache_key) or self.merger_id not in next_page.data: - # Получаем свежие данные и используем их напрямую (избегаем чтение из кэша) - session_data = await self._set_cache_async( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params - ) - else: - # Читаем из кэша только если он уже существовал - cached_data = await redis_client.get(cache_key) - if cached_data is None: - # Fallback: если кэш пропал, получаем свежие данные - logging.info( - "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key - ) - session_data = await self._set_cache_async( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params - ) - else: - logging.info("Successfully read cached data for %s", cache_key) - session_data = json.loads(cached_data) - page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - result = FeedResult( - data=session_data[(page - 1) * limit :][:limit], - next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), - has_next_page=bool(len(session_data) > limit * page), - ) - return result - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных методом append. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: для метода класса. - :return: список данных методом append. - """ - - # Проверяем наличие клиента Redis в конфигурации фида. - if not redis_client: - raise ValueError("Redis client must be provided if using Merger View Session") - - # Формируем результат view session мерджера. - if isinstance(redis_client, (AsyncRedis, AsyncRedisCluster)): - result = await self._get_cache_async( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - else: - result = await self._get_cache( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - - # Если в конфигурации указано "смешать" данные. - if self.shuffle: - shuffle(result.data) - - return result - - -class MergerAppend(BaseFeedConfigModel): - """ - Модель append мерджера. - - Attributes: - merger_id уникальный ID мерджера. - type тип объекта - всегда "merger_append". - items позиции мерджера. - shuffle флаг для перемешивания полученных данных мерджера. - """ - - merger_id: str - type: Literal["merger_append"] - items: List[FeedTypes] - shuffle: bool = False - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных методом append. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. 
- :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: для метода класса. - :return: список данных методом append. - """ - - # When a MergerDeduplication wrapper is active, we may need to respect dedup_priority - # across children without changing the append output order. In that mode we fetch in - # priority order, then concatenate in the configured order and trim to `limit`. - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - - if dedup_active: - indexed_items = list(enumerate(self.items)) - fetched: Dict[int, FeedResult] = {} - - # Always-on parallelism in dedup mode: preserve dedup semantics by ensuring - # higher priority is recorded first; fetch same-priority children concurrently. - groups: Dict[int, List[tuple[int, FeedTypes]]] = defaultdict(list) - for idx, item in indexed_items: - prio = int(getattr(item, "dedup_priority", 0)) - groups[prio].append((idx, item)) - - for prio in sorted(groups.keys(), reverse=True): - group = groups[prio] - coros: List[Awaitable[FeedResult]] = [] - order: List[int] = [] - for idx, item in group: - order.append(idx) - coros.append( - item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=_pydantic_deep_copy(next_page), - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - ) - group_results = await asyncio.gather(*coros) - for idx, r in zip(order, group_results): - fetched[idx] = cast(FeedResult, r) - - for idx, _item in indexed_items: - item_result = fetched[idx] - result.data.extend(item_result.data) - result.next_page.data.update(item_result.next_page.data) - if item_result.has_next_page: - result.has_next_page = True - - if len(result.data) > limit: - result.data = result.data[:limit] - else: - result_limit = limit - for item in self.items: - item_result = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=result_limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - - result.data.extend(item_result.data) - result_limit -= len(item_result.data) - - if not result.has_next_page and item_result.has_next_page: - result.has_next_page = True - - result.next_page.data.update(item_result.next_page.data) - - if result_limit <= 0: - break - - # Если в конфигурации указано "смешать" данные. - if self.shuffle: - shuffle(result.data) - - return result - - -class MergerPositional(BaseFeedConfigModel): - """ - Модель позиционного мерджера. - - Attributes: - merger_id уникальный ID мерджера. - type тип объекта - всегда "merger_positional". - positions позиции для вставки из мерджера / субфида "positional" [обязателен, если нет start, end, step]. - start начальная позиция [обязателен, если нет positions]. - end завершающая позиция [обязателен, если нет positions]. - step шаг позиций между "start" и "end" [обязателен, если нет positions]. - positional мерджер / субфид из которого берутся позиционные данные. - default мерджер / субфид из которого берутся остальные данные. 
- """ - - merger_id: str - type: Literal["merger_positional"] - positions: List[int] = [] - start: Optional[int] = None - end: Optional[int] = None - step: Optional[int] = None - positional: FeedTypes - default: FeedTypes - - @model_validator(mode="after") - def validate_merger_positional(self) -> "MergerPositional": - if not self.positions and not all((self.start, self.end, self.step)): - raise ValueError('Either "positions" or "start", "end", and "step" must be provided') - if self.start and self.positions: - if isinstance(self.start, int) and self.start <= max(self.positions): - raise ValueError('"start" must be bigger than maximum value of "positions"') - if isinstance(self.start, int) and isinstance(self.end, int): - if self.end <= self.start: - raise ValueError('"end" must be bigger than "start"') - return self - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных в позиционном соотношении из данных позиций. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: для метода класса. - :return: список данных в процентном соотношении. - """ - - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - # Determine the merger page first (independent of children). - page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - - positional_has_next_page = True - page_positions: List[int] = [] - available_positions = range((page - 1) * limit, (page * limit) + 1) - for position in self.positions: - if position in available_positions: - page_positions.append(available_positions.index(position)) - - # Если конечная позиция текущей страницы больше или равна MAX позиции в конфигурации, то has_next_page = False - if max(available_positions) >= max(self.positions, default=0): - positional_has_next_page = False - - if self.start is not None and self.end is not None and self.step is not None: - # Если конечная позиция текущей страницы больше или равна конечной шаговой позиции, то has_next_page = False - positional_has_next_page = not max(available_positions) >= self.end - - for position in range(self.start, self.end, self.step): - if position in available_positions: - page_positions.append(available_positions.index(position)) - - default_res: FeedResult - pos_res: FeedResult - - if dedup_active and getattr(self.positional, "dedup_priority", 0) > getattr(self.default, "dedup_priority", 0): - pos_res = await self.positional.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=len(page_positions), - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - default_res = await self.default.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - else: - default_res = await self.default.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - pos_res = await self.positional.get_data( - methods_dict=methods_dict, - user_id=user_id, - 
limit=len(page_positions), - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - - result = FeedResult( - data=default_res.data, - next_page=FeedResultNextPage( - data={ - self.merger_id: FeedResultNextPageInside( - page=page, - after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, - ) - }, - ), - has_next_page=default_res.has_next_page, - ) - - # Если has_next_page = False, то проверяем has_next_page у позиции и, если необходимо, обновляем. - if not result.has_next_page and all([positional_has_next_page, pos_res.has_next_page]): - result.has_next_page = True - - # Обновляем next_page. - result.next_page.data.update(default_res.next_page.data) - result.next_page.data.update(pos_res.next_page.data) - - # Формируем общие данные позиционного мерджера. - for i, post in enumerate(pos_res.data): - result.data = result.data[: page_positions[i] - 1] + [post] + result.data[page_positions[i] - 1 :] - - # Проверка на возврат данных в количестве не более limit. - if len(result.data) > limit: - result.data = result.data[:limit] - - # Обновляем страницу для курсора пагинации мерджера. - result.next_page.data[self.merger_id].page += 1 - - return result - - -class MergerPercentageItem(BaseModel): - """ - Модель позиции процентного мерджера. - - Attributes: - percentage процент позиции в мерджере. - data мерджер / субфид. - """ - - percentage: int - data: FeedTypes - - -class MergerPercentage(BaseFeedConfigModel): - """ - Модель процентного мерджера. - - Attributes: - merger_id уникальный ID мерджера. - type тип объекта - всегда "merger_percentage". - shuffle флаг для перемешивания полученных данных мерджера. - items позиции мерджера. - """ - - merger_id: str - type: Literal["merger_percentage"] - items: List[MergerPercentageItem] - shuffle: bool = False - - @staticmethod - async def _merge_items_data(items_data: List[List]) -> List: - """ - Метод для получения максимально равномерно распределенных данных позиций процентного мерджера. - - :param items_data: список со списками данных из каждой позиции. - :return: максимально равномерно распределенные данные позиций процентного мерджера. - """ - - # Формируем возвращаемый результат и список курсоров для списка каждой позиции. - result: List = [] - cursor: List[Dict] = [] - - # Получаем длину самого маленького списка и формируем курсор для каждого списка. - min_length = min(len(item_data) for item_data in items_data) or 1 - for item_data in items_data: - cursor.append( - { - "items": item_data, - "current": 0, - "size": round(len(item_data) / min_length), - } - ) - - # Получаем общий размер всех элементов всех списков и пока не получаем результат такого же размера - # производим операции по распределению элементов. - full_length = sum(len(item_data) for item_data in items_data) - while len(result) < full_length: - for item_cursor in cursor: - items = item_cursor["items"] - start = item_cursor["current"] - end = start + item_cursor["size"] if start + item_cursor["size"] < len(items) else len(items) - result.extend(items[start:end]) - item_cursor["current"] = end - - return result - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных в процентном соотношении из данных позиций. - - :param methods_dict: словарь с используемыми методами. 
- :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: для метода класса. - :return: список данных в процентном соотношении. - """ - - # Формируем результат процентного мерджера. - result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - items_data: List[List[Any]] = [[] for _ in self.items] - results: List[Optional[FeedResult]] = [None for _ in self.items] - - indexed_items = list(enumerate(self.items)) - fetch_order = indexed_items - if dedup_active: - fetch_order = sorted( - indexed_items, - key=lambda p: (getattr(p[1].data, "dedup_priority", 0), -p[0]), - reverse=True, - ) - - for idx, item in fetch_order: - item_result = cast( - FeedResult, - await item.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit * item.percentage // 100, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ), - ) - - results[idx] = item_result - - for idx, result_item in enumerate(results): - assert result_item is not None - items_data[idx] = result_item.data - - if not result.has_next_page and result_item.has_next_page: - result.has_next_page = True - result.next_page.data.update(result_item.next_page.data) - - # Добавляем данные позиции к общему результату процентного мерджера. - result.data = await self._merge_items_data(items_data=items_data) - - # Если в конфигурации указано "смешать" данные. - if self.shuffle: - shuffle(result.data) - - return result - - -class MergerPercentageGradient(BaseFeedConfigModel): - """ - Модель процентного мерджера с градиентном. - - Attributes: - merger_id уникальный ID мерджера. - type тип объекта - всегда "merger_percentage_gradient". - item_from мерджер / субфид из которого начинается "перетекание" градиента. - item_to мерджер / субфид в который "перетекает" градиент. - step изменение в % соотношения из item_from в item_to. - size_to_step шаг для применения изменений % соотношения (например, через каждые 30 позиций). - shuffle флаг для перемешивания полученных данных мерджера. - """ - - merger_id: str - type: Literal["merger_percentage_gradient"] - item_from: MergerPercentageItem - item_to: MergerPercentageItem - step: int - size_to_step: int - shuffle: bool = False - - @model_validator(mode="after") - def validate_merger_percentage_gradient(self) -> "MergerPercentageGradient": - if self.step < 1 or self.step > 100: - raise ValueError('"step" must be in range from 1 to 100') - if self.size_to_step < 1: - raise ValueError('"size_to_step" must be bigger than 1') - return self - - async def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: - """ - Метод для получения списка лимитов данных с процентным соотношением позиций item_from & item_to, - учитывая градиентное изменение соотношений. - - :param page: порядковый номер страницы. - :param limit: общий лимит данных для страницы. - :return: список лимитов данных с процентным соотношением позиций item_from & item_to. 
- """ - - result: Dict = { - "limit_from": 0, - "limit_to": 0, - "percentages": [], - } - - percentage_from = self.item_from.percentage - percentage_to = self.item_to.percentage - start_position = limit * (page - 1) - first_iter = True - - for i in range(self.size_to_step, limit * page + self.size_to_step, self.size_to_step): - # При первой итерации и percentage_to >= 100 не меняем соотношение % между позициями. - if not first_iter and percentage_to < 100: - # Меняем процентное соотношение позиций на "шаг", указанный в конфигурации. - percentage_from -= self.step - percentage_to += self.step - - # Если процентное соотношение вышло за 100+, то устанавливаем предельные значения. - if percentage_to > 100 or percentage_from < 0: - percentage_from = 0 - percentage_to = 100 - - # Если индекс итерации по величине больше стартовой позиции согласно переданной странице, - # то начинаем обработку. - if i > start_position: - # Рассчитываем лимит получения данных для конкретной итерации. - iter_limit = (limit * page - start_position) if i > limit * page else (i - start_position) - start_position = i - - # Формируем результат для каждой итерации и добавляем в возвращаемый список, но если процентное - # соотношение у последней итерации 0 - 100, то добавляем лимит к ней. - if result["percentages"] and result["percentages"][-1]["to"] >= 100: - result["limit_to"] += iter_limit - result["percentages"][-1]["limit"] += iter_limit - else: - result["limit_from"] += iter_limit * percentage_from // 100 - result["limit_to"] += iter_limit * percentage_to // 100 - iter_result = {"limit": iter_limit, "from": percentage_from, "to": percentage_to} - result["percentages"].append(iter_result) - - # Если первая итерация цикла - if first_iter: - first_iter = False - - return result - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных в процентном соотношении с градиентом из данных позиций. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: для метода класса. - :return: список данных в процентном соотношении. - """ - - # Формируем результат процентного мерджера с градиентом. - result = FeedResult( - data=[], - next_page=FeedResultNextPage( - data={ - self.merger_id: FeedResultNextPageInside( - page=next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1, - after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, - ) - }, - ), - has_next_page=False, - ) - - # Получаем список лимитов данных и соотношений согласно странице и градиенту. 
- limits_and_percents = await self._calculate_limits_and_percents( - page=result.next_page.data[self.merger_id].page, - limit=limit, - ) - - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - from_priority = getattr(self.item_from.data, "dedup_priority", 0) - to_priority = getattr(self.item_to.data, "dedup_priority", 0) - - if dedup_active and to_priority > from_priority: - item_to = await self.item_to.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_to"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - item_from = await self.item_from.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_from"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - else: - item_from = await self.item_from.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_from"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - item_to = await self.item_to.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_to"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - - from_start_index = 0 - to_start_index = 0 - for lp_data in limits_and_percents["percentages"]: - # Высчитываем лимиты для каждой позиции исходя из процентного соотношения. - from_end_index = (lp_data["limit"] * lp_data["from"] // 100) + from_start_index - to_end_index = (lp_data["limit"] * lp_data["to"] // 100) + to_start_index - - # Добавляем данные позиции к общему результату процентного мерджера с градиентом. - result.data.extend(item_from.data[from_start_index:from_end_index]) - result.data.extend(item_to.data[to_start_index:to_end_index]) - - # Обновляем стартовые индексы. - from_start_index = from_end_index - to_start_index = to_end_index - - # Обновляем next_page. - result.next_page.data.update(item_from.next_page.data) - result.next_page.data.update(item_to.next_page.data) - - # Если has_next_page = False, то проверяем has_next_page у позиций и, если необходимо, обновляем. - if any([item_from.has_next_page, item_to.has_next_page]): - result.has_next_page = True - - # Если в конфигурации указано "смешать" данные. - if self.shuffle: - shuffle(result.data) - - # Обновляем страницу для курсора пагинации мерджера. - result.next_page.data[self.merger_id].page += 1 - - return result - - -class MergerAppendDistribute(BaseFeedConfigModel): - """ - Модель мерджера, равномерно распределяющего данные по ключу. - - Attributes: - merger_id уникальный ID мерджера. - type тип объекта - всегда "merger_distribute". - items позиции мерджера. - distribution_key ключ для распределения данных мерджера. - sorting_key ключ сортировки. - sorting_desc флаг сортировки по убыванию. 
- """ - - merger_id: str - type: Literal["merger_distribute"] - items: List[FeedTypes] - distribution_key: str - sorting_key: Optional[str] = None - sorting_desc: bool = False - - @no_type_check - async def _uniform_distribute(self, data: list) -> list: - # Сортируем записи глобально по `created_at` в порядке убывания - if self.sorting_key: - data = sorted(data, key=lambda x: x[self.sorting_key], reverse=self.sorting_desc) - - # Группируем записи по `distribution_key` - grouped_entries = defaultdict(deque) - for entry in data: - grouped_entries[entry[self.distribution_key]].append(entry) - result = [] - prev_profile_id = None - while any(grouped_entries.values()): - for profile_id in list(grouped_entries.keys()): - if grouped_entries[profile_id]: - # Если текущий `distribution_key` отличается от предыдущего или он последний, берем его - if profile_id != prev_profile_id or len(grouped_entries) == 1: - result.append(grouped_entries[profile_id].popleft()) - prev_profile_id = profile_id - if not grouped_entries[profile_id]: # Если записи закончились, удаляем ключ из группы - del grouped_entries[profile_id] - else: - del grouped_entries[profile_id] - - return result - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных методом append. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: для метода класса. - :return: список данных методом append. - """ - - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - - if dedup_active: - indexed_items = list(enumerate(self.items)) - fetch_order = sorted(indexed_items, key=lambda p: (getattr(p[1], "dedup_priority", 0), -p[0]), reverse=True) - fetched: Dict[int, FeedResult] = {} - - for idx, item in fetch_order: - fetched[idx] = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - - for idx, _item in indexed_items: - item_result = fetched[idx] - result.data.extend(item_result.data) - result.next_page.data.update(item_result.next_page.data) - if item_result.has_next_page: - result.has_next_page = True - - if len(result.data) > limit: - result.data = result.data[:limit] - else: - result_limit = limit - for item in self.items: - item_result = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=result_limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - - result.data.extend(item_result.data) - result_limit -= len(item_result.data) - - if not result.has_next_page and item_result.has_next_page: - result.has_next_page = True - - result.next_page.data.update(item_result.next_page.data) - - if result_limit <= 0: - break - - # Распределяем данные равномерно по ключу. - result.data = await self._uniform_distribute(result.data) - return result - - -class MergerDeduplication(BaseFeedConfigModel): - """Merger that deduplicates while preserving child mixing/position semantics. 
- - This merger acts as a wrapper around exactly one child feed node. - Deduplication is applied at the leaf SubFeed method level with a shared seen-set. - This lets nested mergers (positional/percentage/gradient/etc.) keep their slot rules: - duplicates are skipped by fetching additional items from the *same* leaf source. - """ - - merger_id: str - type: Literal["merger_deduplication"] - data: FeedTypes - - dedup_key: Optional[str] = None - missing_key_policy: Literal["error", "keep", "drop"] = "error" - - state_backend: Literal["cursor", "redis"] = "cursor" - state_ttl_seconds: int = 3600 - cursor_compress: bool = True - cursor_max_keys: Optional[int] = None - - overfetch_factor: int = 1 - - max_refill_loops: int = 20 - - _descendant_cursor_keys_cache: Optional[set[str]] = PrivateAttr(default=None) - - @model_validator(mode="after") - def validate_merger_deduplication(self) -> "MergerDeduplication": - if self.overfetch_factor < 1: - raise ValueError('"overfetch_factor" must be >= 1') - if self.max_refill_loops < 1: - raise ValueError('"max_refill_loops" must be >= 1') - return self - - def _collect_descendant_cursor_keys(self, feed: BaseFeedConfigModel) -> set[str]: - keys: set[str] = set() - - subfeed_id = getattr(feed, "subfeed_id", None) - if isinstance(subfeed_id, str) and subfeed_id: - keys.add(subfeed_id) - - merger_id = getattr(feed, "merger_id", None) - if isinstance(merger_id, str) and merger_id: - keys.add(merger_id) - - # Recurse into known child containers across existing feed types. - child: Any - for attr_name in ("data", "positional", "default"): - child = getattr(feed, attr_name, None) - if isinstance(child, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(child)) - - for attr_name in ("item_from", "item_to"): - child = getattr(feed, attr_name, None) - inner = getattr(child, "data", None) - if isinstance(inner, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(inner)) - - items = getattr(feed, "items", None) - if isinstance(items, list): - for item in items: - if isinstance(item, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(item)) - continue - - inner = getattr(item, "data", None) - if isinstance(inner, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(inner)) - - return keys - - def _get_descendant_cursor_keys_cached(self) -> set[str]: - cached = self._descendant_cursor_keys_cache - if cached is None: - cached = self._collect_descendant_cursor_keys(self.data) - self._descendant_cursor_keys_cache = cached - return cached - - def _reset_descendant_cursors(self, next_page: FeedResultNextPage) -> None: - descendant_keys = self._get_descendant_cursor_keys_cached() - for key in descendant_keys: - next_page.data.pop(key, None) - - def _normalize_key(self, value: Any) -> str: - if isinstance(value, (str, int)): - return str(value) - if isinstance(value, (dict, list)): - return json.dumps(value, sort_keys=True, default=str) - return str(value) - - def _extract_dedup_value(self, item: Any) -> Any: - if not self.dedup_key: - return item - - try: - value = item.get(self.dedup_key) - except AttributeError: - value = getattr(item, self.dedup_key, None) - - if value is None and self.missing_key_policy == "error": - raise AssertionError(f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}") - return value - - def _get_entity_key(self, entity: Any) -> Optional[str]: - """Return normalized dedup key for entity, or None if entity should be skipped.""" - - raw_value = 
self._extract_dedup_value(entity) - if raw_value is None: - if self.missing_key_policy == "drop": - return None - if self.missing_key_policy == "keep": - raw_value = ("__missing__", id(entity)) - return self._normalize_key(raw_value) - - def _compute_overfetch_params(self, *, remaining: int, next_after: Any) -> tuple[bool, int, Optional[int]]: - """Compute safe overfetch params. - - Overfetch is only safe when `after` is an integer offset (so we can rewind). - - Returns: (can_overfetch, request_limit, start_after) - """ - - can_overfetch = isinstance(next_after, int) - request_limit = max(1, remaining) - if can_overfetch and self.overfetch_factor > 1: - request_limit = max(1, remaining * self.overfetch_factor) - start_after: Optional[int] = int(next_after) if can_overfetch else None - return can_overfetch, request_limit, start_after - - def _iter_subfeeds(self, feed: BaseFeedConfigModel) -> Iterator["SubFeed"]: - if isinstance(feed, SubFeed): - yield feed - return - - for attr_name in ("data", "positional", "default"): - inner = getattr(feed, attr_name, None) - if isinstance(inner, BaseFeedConfigModel): - yield from self._iter_subfeeds(inner) - - for attr_name in ("item_from", "item_to"): - wrapper = getattr(feed, attr_name, None) - inner = getattr(wrapper, "data", None) - if isinstance(inner, BaseFeedConfigModel): - yield from self._iter_subfeeds(inner) - - items = getattr(feed, "items", None) - if isinstance(items, list): - for item in items: - if isinstance(item, BaseFeedConfigModel): - yield from self._iter_subfeeds(item) - continue - inner = getattr(item, "data", None) - if isinstance(inner, BaseFeedConfigModel): - yield from self._iter_subfeeds(inner) - - def _register_wrapped_subfeed_method( - self, - *, - subfeed: "SubFeed", - original_methods_dict: Dict[str, Callable], - rewritten_methods_dict: Dict[str, Callable], - dedup_state: "_DedupState", - ) -> None: - original_name = subfeed.method_name - original_method = original_methods_dict[original_name] - unique_name = f"__dedup__{self.merger_id}__{subfeed.subfeed_id}" - - # Idempotency: if the same subfeed id appears multiple times, don't re-wrap. 
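# (Editor's note; the ids below are illustrative.) The wrapper name combines the merger and
# leaf ids, e.g. merger_id="dedup_main" with subfeed_id="ads" becomes
# "__dedup__dedup_main__ads", so a subfeed id that occurs more than once in the tree maps to
# the same key in rewritten_methods_dict, which is what the check below relies on.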
- if unique_name in rewritten_methods_dict: - subfeed.method_name = unique_name - return - - subfeed.method_name = unique_name - leaf_priority = int(getattr(subfeed, "dedup_priority", 0)) - - wrapped = self._make_wrapped_leaf_method( - original_method=original_method, - dedup_state=dedup_state, - leaf_priority=leaf_priority, - ) - setattr(wrapped, "_smartfeed_original", original_method) - rewritten_methods_dict[unique_name] = wrapped - - def _make_wrapped_leaf_method( - self, - *, - original_method: Callable, - dedup_state: "_DedupState", - leaf_priority: int, - ) -> Callable: - async def _wrapped_method( - user_id: Any, - limit: int, - next_page: FeedResultNextPageInside, - **kw: Any, - ) -> FeedResultClient: - collected: List[Any] = [] - upstream_has_next_page = False - - loops = 0 - while len(collected) < limit and loops < self.max_refill_loops: - loops += 1 - before_len = len(collected) - - remaining = limit - len(collected) - can_overfetch, request_limit, start_after = self._compute_overfetch_params( - remaining=remaining, - next_after=next_page.after, - ) - - method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) - if not isinstance(method_result, FeedResultClient): - raise TypeError('SubFeed function must return "FeedResultClient" instance.') - - upstream_has_next_page = upstream_has_next_page or method_result.has_next_page - - inspected_count = 0 - - # Backend-specific optimization: Redis batches zmscore. - # For cursor backend, prefetch is a no-op and we avoid the extra pass entirely. - keys_by_index: Optional[List[Optional[str]]] = None - if isinstance(dedup_state, _RedisDedupState): - keys_by_index = [] - batch_keys: List[str] = [] - for entity in method_result.data: - key = self._get_entity_key(entity) - keys_by_index.append(key) - if key is not None: - batch_keys.append(key) - await dedup_state.prefetch(batch_keys) - - for idx, entity in enumerate(method_result.data, start=1): - inspected_count = idx - - key = keys_by_index[idx - 1] if keys_by_index is not None else self._get_entity_key(entity) - if key is None: - continue - - if not dedup_state.should_accept(key, leaf_priority): - continue - - collected.append(entity) - dedup_state.record(key, leaf_priority) - - if len(collected) >= limit: - break - - if len(collected) == before_len: - # No progress this loop. Stop if upstream is exhausted. - if not method_result.has_next_page: - break - - # If we oversampled with a simple integer cursor, rewind to the point we actually consumed. - # This prevents skipping un-inspected items that were fetched but not needed. 
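# (Editor's note, illustrative numbers.) With start_after=100, remaining=5 and
# overfetch_factor=3 the wrapper requests 15 items; if the page fills after inspecting only
# 9 of them, the integer cursor has advanced to 115, so the block below rewinds
# next_page.after to 100 + 9 = 109 and the 6 uninspected items are served again on the next
# call instead of being skipped.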
- if can_overfetch and request_limit > remaining and start_after is not None: - end_after = next_page.after - if isinstance(end_after, int) and end_after == start_after + len(method_result.data): - next_page.after = start_after + inspected_count - - return FeedResultClient(data=collected, next_page=next_page, has_next_page=upstream_has_next_page) - - return _wrapped_method - - def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> Dict[str, int]: - entry = next_page.data.get(self.merger_id) - if not entry or entry.after is None: - return {} - - after = entry.after - if isinstance(after, dict) and "z" in after: - payload = base64.urlsafe_b64decode(after["z"].encode()) - raw = zlib.decompress(payload).decode() - decoded = json.loads(raw) - if isinstance(decoded, dict): - return {str(k): int(v) for k, v in decoded.items()} - if isinstance(decoded, list): - # v2: list of [key, priority] entries - seen_map: Dict[str, int] = {} - for entry_item in decoded: - if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: - seen_map[str(entry_item[0])] = int(entry_item[1]) - else: - seen_map[str(entry_item)] = 0 - return seen_map - return {} - if isinstance(after, dict) and "seen" in after: - return {str(k): 0 for k in list(after["seen"])} - if isinstance(after, list): - return {str(k): 0 for k in list(after)} - if isinstance(after, dict): - # v2 uncompressed map - return {str(k): int(v) for k, v in after.items() if k not in {"v", "c", "n"}} - return {} - - def _encode_seen_for_cursor(self, seen_updates_in_order: List[tuple[str, int]]) -> Any: - if self.cursor_max_keys is not None: - seen_updates_in_order = seen_updates_in_order[-self.cursor_max_keys :] - - if not self.cursor_compress: - return {"v": 2, "seen": [[k, p] for k, p in seen_updates_in_order]} - - raw = json.dumps([[k, p] for k, p in seen_updates_in_order]).encode() - compressed = zlib.compress(raw) - return { - "v": 2, - "c": "zlib+base64", - "n": len(seen_updates_in_order), - "z": base64.urlsafe_b64encode(compressed).decode(), - } - - async def _redis_zmscore( - self, - redis_client: Union[redis.Redis, AsyncRedis], - key: str, - members: List[str], - ) -> List[Optional[float]]: - """Batch zscore for multiple members. - - Falls back to pipelined zscore when zmscore isn't available. - """ - - if not members: - return [] - - zmscore_fn = getattr(redis_client, "zmscore", None) - if zmscore_fn is not None: - res = zmscore_fn(key, members) - if inspect.iscoroutine(res): - res = await res - # redis-py returns list[Optional[float]] - return [None if v is None else float(v) for v in list(res)] - - # Fallback: pipelined zscore. For sync clients, run the whole pipeline in a thread. 
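# (Editor's note.) ZMSCORE is only available on Redis servers 6.2+ (and correspondingly
# recent redis-py clients), so older deployments take this pipelined-ZSCORE path;
# asyncio.to_thread keeps the blocking sync pipeline off the event loop.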
- if not _is_async_redis_client(redis_client): - def _sync_pipeline_execute() -> Any: - pipe = redis_client.pipeline() - for m in members: - pipe.zscore(key, m) - return pipe.execute() - - res = await asyncio.to_thread(_sync_pipeline_execute) - return [None if v is None else float(v) for v in list(res)] - - pipe = redis_client.pipeline() - for m in members: - pipe.zscore(key, m) - res = pipe.execute() - if inspect.iscoroutine(res): - res = await res - return [None if v is None else float(v) for v in list(res)] - - async def _redis_zadd_and_expire( - self, - redis_client: Union[redis.Redis, AsyncRedis], - key: str, - member_scores: Dict[str, int], - ) -> None: - if not member_scores: - return - await _redis_call(redis_client, "zadd", key, mapping={m: float(s) for m, s in member_scores.items()}) - await _redis_call(redis_client, "expire", key, self.state_ttl_seconds) - - def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: - suffix = params.get("custom_deduplication_key") or params.get("custom_view_session_key") - if suffix: - return f"dedup:{self.merger_id}:{user_id}:{suffix}" - return f"dedup:{self.merger_id}:{user_id}" - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - if limit <= 0: - return FeedResult(data=[], next_page=next_page, has_next_page=False) - - # Treat an explicit "page 0" (or missing cursor for this merger) as a fresh session. - # This allows clients to restart the feed (e.g., full reload) without carrying over seen state. - entry = next_page.data.get(self.merger_id) - requested_page = entry.page if entry is not None else None - is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) - - if self.state_backend == "redis" and not redis_client: - raise ValueError("Redis client must be provided if using MergerDeduplication with state_backend=redis") - - working_next_page = _pydantic_deep_copy(next_page) - - if is_fresh_session: - # Reset cursors for all descendants under this merger so upstream nodes also restart. - self._reset_descendant_cursors(working_next_page) - - # Shared dedup state (cross-page) - seen_priority_map: Dict[str, int] = {} - seen_updates_in_order: List[tuple[str, int]] = [] - if self.state_backend == "cursor" and not is_fresh_session: - seen_priority_map = self._decode_seen_from_cursor(next_page) - - # Always maintain a per-request seen set to prevent duplicates within a single get_data() call. - seen_request_set: set[str] = set(seen_priority_map.keys()) - - redis_state_key = "" - redis_new_scores: Dict[str, int] = {} - redis_seen_cache: Dict[str, Optional[int]] = {} - if self.state_backend == "redis" and redis_client: - redis_state_key = self._build_redis_state_key(user_id=user_id, params=params) - if is_fresh_session: - # Drop state for a full restart. - await _redis_call(redis_client, "delete", redis_state_key) - - # Create a single state helper shared across all leaf wrappers. 
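# (Editor's note.) The main difference between the two backends is where the seen-set lives:
# the "cursor" backend serializes (key, priority) pairs into this merger's `after` field
# (zlib + urlsafe base64 when cursor_compress is enabled), while the "redis" backend keeps
# them in a sorted set keyed dedup:<merger_id>:<user_id> with priority as the score and a TTL
# of state_ttl_seconds.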
- if self.state_backend == "cursor": - dedup_state: _DedupState = _CursorDedupState( - seen_priority_map=seen_priority_map, - seen_updates_in_order=seen_updates_in_order, - seen_request_set=seen_request_set, - ) - else: - assert redis_client is not None - dedup_state = _RedisDedupState( - redis_client=redis_client, - redis_state_key=redis_state_key, - redis_seen_cache=redis_seen_cache, - redis_new_scores=redis_new_scores, - seen_request_set=seen_request_set, - zmscore=self._redis_zmscore, - ) - - # Preserve inner merger ordering/mixing semantics by deduplicating at the leaf method level - # with a shared seen-set. - original_methods_dict = methods_dict - - # Create a deep copy of the child tree and rewrite each SubFeed to call a unique wrapper - # so we can associate a dedup_priority with each leaf. - child = self.data - child = _pydantic_deep_copy(child) - - rewritten_methods_dict = dict(original_methods_dict) - - for sf in self._iter_subfeeds(child): - self._register_wrapped_subfeed_method( - subfeed=sf, - original_methods_dict=original_methods_dict, - rewritten_methods_dict=rewritten_methods_dict, - dedup_state=dedup_state, - ) - - child_result = await child.get_data( - methods_dict=rewritten_methods_dict, - user_id=user_id, - limit=limit, - next_page=working_next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - - if self.state_backend == "redis" and redis_client: - await self._redis_zadd_and_expire(redis_client, redis_state_key, redis_new_scores) - - page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - merger_after: Any = None - if self.state_backend == "cursor": - merger_after = self._encode_seen_for_cursor(seen_updates_in_order) - - result_next_page = _pydantic_deep_copy(child_result.next_page) - result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) - - return FeedResult(data=child_result.data, next_page=result_next_page, has_next_page=child_result.has_next_page) - - -class SubFeed(BaseFeedConfigModel): - """ - Модель субфида. - - Attributes: - subfeed_id уникальный ID субфида. - type тип объекта - всегда "subfeed". - method_name название клиентского метода для получения данных субфида. - subfeed_params статичные параметры для метода субфида. - shuffle флаг для перемешивания полученных данных мерджера. - """ - - subfeed_id: str - type: Literal["subfeed"] - method_name: str - subfeed_params: Dict[str, Any] = {} - raise_error: Optional[bool] = True - shuffle: bool = False - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - **params: Any, - ) -> FeedResult: - """ - Метод для получения данных из метода субфида. - - :param methods_dict: словарь с используемыми методами. - :param user_id: ID объекта для получения данных (например, ID пользователя). - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param redis_client: объект клиента Redis (для конфигурации с view_session мерджером). - :param params: параметры для метода. - :return: список данных. - """ - - # Формируем next_page конкретного субфида. - subfeed_next_page = FeedResultNextPageInside( - page=next_page.data[self.subfeed_id].page if self.subfeed_id in next_page.data else 1, - after=next_page.data[self.subfeed_id].after if self.subfeed_id in next_page.data else None, - ) - - # Формируем params для функции субфида. 
- method = methods_dict[self.method_name] - method_spec = getattr(method, "_smartfeed_original", method) - method_args = inspect.getfullargspec(method_spec).args - method_params: Dict[str, Any] = {} - for arg in method_args: - if arg in params: - method_params[arg] = params[arg] - - # Получаем результат функции клиента в формате SubFeedResult. - try: - method_result = await methods_dict[self.method_name]( - user_id=user_id, - limit=limit, - next_page=subfeed_next_page, - **method_params, - **self.subfeed_params, - ) - except (Exception,) as _: - if self.raise_error: - raise - - method_result = FeedResultClient( - data=[], - next_page=subfeed_next_page, - has_next_page=False, - ) - - if not isinstance(method_result, FeedResultClient): - raise TypeError('SubFeed function must return "FeedResultClient" instance.') - - # Если в конфигурации указано "смешать" данные. - if self.shuffle: - shuffle(method_result.data) - - result = FeedResult( - data=method_result.data, - next_page=FeedResultNextPage(data={self.subfeed_id: method_result.next_page}), - has_next_page=method_result.has_next_page, - ) - return result - - class FeedConfig(BaseModel): - """ - Модель конфигурации фида. - - Attributes: - version версия конфигурации. - view_session флаг использования механизма расчета всего фида сразу и сохранения в кэш. - session_size размер кэшируемого фида (limit получения данных для сохранения в кэш). - session_live_time срок хранения в кэше для кэшируемого фида (в секундах). - feed мерджер или субфид. - """ + """Top-level feed config model.""" version: str feed: FeedTypes -# Update Forward Refs def _rebuild_model(model: Any) -> None: + """Resolve forward refs across modules (Pydantic v1/v2 compatible).""" + if hasattr(model, "model_rebuild"): - model.model_rebuild() + model.model_rebuild(force=True, _types_namespace={"FeedTypes": FeedTypes}) else: - model.update_forward_refs() - - -_rebuild_model(MergerPositional) -_rebuild_model(MergerPercentage) -_rebuild_model(SubFeed) -_rebuild_model(MergerPercentageItem) -_rebuild_model(MergerAppend) -_rebuild_model(MergerAppendDistribute) -_rebuild_model(MergerPercentageGradient) -_rebuild_model(MergerViewSession) -_rebuild_model(MergerDeduplication) + model.update_forward_refs(FeedTypes=FeedTypes) + + +for _m in ( + MergerPositional, + MergerPercentage, + MergerPercentageItem, + MergerAppend, + MergerAppendDistribute, + MergerPercentageGradient, + MergerViewSession, + MergerDeduplication, + SubFeed, + FeedConfig, +): + _rebuild_model(_m) + + +__all__ = [ + "BaseFeedConfigModel", + "FeedResult", + "FeedResultClient", + "FeedResultNextPage", + "FeedResultNextPageInside", + "SubFeed", + "MergerAppend", + "MergerAppendDistribute", + "MergerDeduplication", + "MergerPercentage", + "MergerPercentageGradient", + "MergerPercentageItem", + "MergerPositional", + "MergerViewSession", + "FeedTypes", + "FeedConfig", +] From e3fdefbc9938e24364b00a244774450317a3de5d Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sat, 7 Feb 2026 13:05:35 +0000 Subject: [PATCH 15/33] Refactor continues. 
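This patch moves recursion, concurrency and deduplication refill out of the individual
merger models and into a shared execution layer (ExecutionContext, Executor and plans).
Below is a rough sketch of how a caller is expected to drive that layer; it is an editor's
illustration only, and the get_page helper and exact wiring are assumptions rather than
part of the patch itself.

    from smartfeed.execution.context import ExecutionContext, RefillExecutionSettings
    from smartfeed.execution.executor import Executor
    from smartfeed.feed_models import FeedResultNextPage

    async def get_page(feed_config, methods_dict, user_id, limit, cursor=None):
        # The executor holds no per-request state, so one instance can be reused.
        executor = Executor()
        ctx = ExecutionContext(
            methods_dict=methods_dict,
            user_id=user_id,
            redis_client=None,  # only needed for view_session / redis-backed dedup state
            executor=executor,
            refill_settings=RefillExecutionSettings(overfetch_factor=2, max_refill_loops=10),
        )
        next_page = cursor or FeedResultNextPage(data={})
        return await executor.run(feed_config.feed, ctx, limit, next_page)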
--- smartfeed/execution/context.py | 38 ++ smartfeed/execution/cursors.py | 63 +++ smartfeed/execution/executor.py | 616 +++++++++++++++++++++++ smartfeed/execution/plans.py | 60 +++ smartfeed/feed_models.py | 12 +- smartfeed/manager.py | 13 +- smartfeed/mergers/append.py | 110 ++-- smartfeed/mergers/append_distribute.py | 92 ++-- smartfeed/mergers/deduplication.py | 509 ++++--------------- smartfeed/mergers/percentage.py | 96 ++-- smartfeed/mergers/percentage_gradient.py | 169 +++---- smartfeed/mergers/positional.py | 137 ++--- smartfeed/mergers/view_session.py | 147 ++++-- smartfeed/policies/dedup.py | 220 ++++++++ smartfeed/policies/dedup_utils.py | 112 +++++ smartfeed/policies/seen_store.py | 158 ++++++ tests/fixtures/redis.py | 4 +- tests/test_manager_params.py | 40 ++ tests/test_merger_deduplication.py | 74 ++- 19 files changed, 1875 insertions(+), 795 deletions(-) create mode 100644 smartfeed/execution/context.py create mode 100644 smartfeed/execution/cursors.py create mode 100644 smartfeed/execution/executor.py create mode 100644 smartfeed/execution/plans.py create mode 100644 smartfeed/policies/dedup.py create mode 100644 smartfeed/policies/dedup_utils.py create mode 100644 smartfeed/policies/seen_store.py create mode 100644 tests/test_manager_params.py diff --git a/smartfeed/execution/context.py b/smartfeed/execution/context.py new file mode 100644 index 0000000..4a118ea --- /dev/null +++ b/smartfeed/execution/context.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Callable, Dict, Optional, Union + +import redis +from redis.asyncio import Redis as AsyncRedis + + +@dataclass +class ExecutionContext: + """Execution context propagated through the feed tree. + + Keeps internal state (policies, backends) out of user params. + """ + + methods_dict: Dict[str, Callable] + user_id: Any + redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None + + # Assigned by the caller (FeedManager / tests) to avoid circular imports. 
+ executor: Any = None + + # Policies (optional) + dedup: Optional[object] = None + + # Execution settings (optional) + refill_settings: Optional["RefillExecutionSettings"] = None + dedup_settings: Optional["DedupExecutionSettings"] = None + + +@dataclass(frozen=True) +class RefillExecutionSettings: + overfetch_factor: int = 1 + max_refill_loops: int = 20 + + +DedupExecutionSettings = RefillExecutionSettings diff --git a/smartfeed/execution/cursors.py b/smartfeed/execution/cursors.py new file mode 100644 index 0000000..eef2fd0 --- /dev/null +++ b/smartfeed/execution/cursors.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterable + +from ..feed_models import BaseFeedConfigModel, FeedResultNextPage + + +@dataclass +class CursorMap: + next_page: FeedResultNextPage + + def merge_delta(self, *, base_next_page: FeedResultNextPage, owner_next_page: FeedResultNextPage) -> None: + """Merge only the cursor keys that actually changed.""" + + for key, value in owner_next_page.data.items(): + base_value = base_next_page.data.get(key) + if base_value == value: + continue + self.next_page.data[key] = value + + def reset_keys(self, keys: Iterable[str]) -> None: + for key in keys: + self.next_page.data.pop(key, None) + + @staticmethod + def can_overfetch(*, node: BaseFeedConfigModel, base_next_page: FeedResultNextPage) -> bool: + sub_id = getattr(node, "subfeed_id", None) + if not isinstance(sub_id, str): + return False + entry = base_next_page.data.get(sub_id) + if entry is None: + return False + return isinstance(entry.after, int) + + @staticmethod + def rewind_overfetch( + *, + node: BaseFeedConfigModel, + base_next_page: FeedResultNextPage, + result_next_page: FeedResultNextPage, + inspected_count: int, + batch_size: int, + ) -> None: + sub_id = getattr(node, "subfeed_id", None) + if not isinstance(sub_id, str): + return + if sub_id not in result_next_page.data: + return + + entry = result_next_page.data[sub_id] + end_after = entry.after + if not isinstance(end_after, int): + return + + base_entry = base_next_page.data.get(sub_id) + prev_after = base_entry.after if base_entry is not None else None + if not isinstance(prev_after, int): + return + + expected_end = prev_after + batch_size + if end_after == expected_end: + entry.after = prev_after + inspected_count diff --git a/smartfeed/execution/executor.py b/smartfeed/execution/executor.py new file mode 100644 index 0000000..20e3cbc --- /dev/null +++ b/smartfeed/execution/executor.py @@ -0,0 +1,616 @@ +from __future__ import annotations + +import asyncio +import inspect +from typing import Any, Dict, List, Optional, Tuple + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, _pydantic_deep_copy +from .context import ExecutionContext +from .cursors import CursorMap +from .plans import CallablePlan, Plan, SlotSpec, SlotsPlan + + +class Executor: + """Shared execution engine. + + Owns recursion and concurrency. Nodes can optionally expose `build_plan(...)`. 
+ """ + + async def run( + self, + node: BaseFeedConfigModel, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> FeedResult: + result, plan = await self._run_node_raw(node, ctx, limit, next_page, params) + + dedup = getattr(ctx, "dedup", None) + if dedup is None: + return result + + if isinstance(plan, SlotsPlan): + return result + + return await self._run_node_with_dedup_refill(node, ctx, limit, next_page, params, result) + + async def execute_plan(self, plan: Plan) -> FeedResult: + """Interpret and execute a declarative plan. + + Plans must not perform execution themselves; they are data structures. + """ + + if isinstance(plan, SlotsPlan): + return await self._execute_slots_plan(plan) + if isinstance(plan, CallablePlan): + return await plan.fn(self) + raise TypeError(f"Unknown plan type: {type(plan)!r}") + + async def gather(self, *coros: Any) -> List[Any]: + """Execute coroutines concurrently. + + Centralizes concurrency in the executor layer. + """ + + return list(await asyncio.gather(*coros)) + + async def _maybe_await(self, value: Any) -> Any: + if inspect.isawaitable(value): + return await value + return value + + async def _run_node_raw( + self, + node: BaseFeedConfigModel, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + params: Dict[str, Any], + ) -> Tuple[FeedResult, Optional[Plan]]: + build_plan = getattr(node, "build_plan", None) + if callable(build_plan): + plan: Plan = build_plan(ctx=ctx, limit=limit, next_page=next_page, **params) + result = await self.execute_plan(plan) + return result, plan + + result = await node.get_data( + methods_dict=ctx.methods_dict, + user_id=ctx.user_id, + limit=limit, + next_page=next_page, + redis_client=ctx.redis_client, + ctx=ctx, + **params, + ) + return result, None + + async def _execute_slots_plan(self, plan: SlotsPlan) -> FeedResult: + if plan.limit <= 0: + assembled = await self._maybe_await(plan.assemble([], plan.next_page, {})) + return assembled + + working_next_page = _pydantic_deep_copy(plan.next_page) + cursor = CursorMap(working_next_page) + owners, owner_index = self._collect_plan_owners(plan) + dedup_policy = getattr(plan.ctx, "dedup", None) + refill_settings = getattr(plan.ctx, "refill_settings", None) or getattr(plan.ctx, "dedup_settings", None) + dedup_active = dedup_policy is not None + + owner_max_demand = self._owner_slot_demand(plan) + owner_buffers, owner_results = await self._run_plan_owners( + plan=plan, + owners=owners, + owner_max_demand=owner_max_demand, + dedup_active=dedup_active, + cursor=cursor, + ) + + if dedup_policy is not None: + owner_buffers, owner_results = await self._arbitrate_owner_buffers( + owners=owners, + owner_index=owner_index, + owner_buffers=owner_buffers, + owner_results=owner_results, + dedup_policy=dedup_policy, + ) + + deficits = self._compute_slot_deficits( + plan=plan, + owner_buffers=owner_buffers, + ) + if deficits: + await self._refill_deficits( + plan=plan, + deficits=deficits, + owners=owners, + owner_index=owner_index, + owner_buffers=owner_buffers, + owner_results=owner_results, + dedup_policy=dedup_policy, + refill_settings=refill_settings, + cursor=cursor, + ) + + output = self._consume_slots(plan=plan, owner_buffers=owner_buffers) + assembled = await self._maybe_await(plan.assemble(output, cursor.next_page, owner_results)) + return assembled + + def _owner_slot_demand(self, plan: SlotsPlan) -> Dict[int, int]: + """Compute a per-owner maximum demand based on the slot schedule.""" + + demand: Dict[int, int] = 
{} + for slot in plan.slots: + owner_id = id(slot.owner) + demand[owner_id] = demand.get(owner_id, 0) + int(slot.max_count) + return demand + + def _collect_plan_owners(self, plan: SlotsPlan) -> tuple[List[Any], Dict[int, int]]: + owners: List[Any] = [] + owner_index: Dict[int, int] = {} + for slot in plan.slots: + owner_id = id(slot.owner) + if owner_id in owner_index: + continue + owner_index[owner_id] = len(owners) + owners.append(slot.owner) + return owners, owner_index + + async def _run_owner( + self, + *, + plan: SlotsPlan, + owner: Any, + demand: int, + base_next_page: FeedResultNextPage, + dedup_active: bool, + ) -> FeedResult: + isolated_next_page = _pydantic_deep_copy(base_next_page) + owner_ctx = plan.ctx + if dedup_active: + owner_ctx = ExecutionContext( + methods_dict=plan.ctx.methods_dict, + user_id=plan.ctx.user_id, + redis_client=plan.ctx.redis_client, + executor=plan.ctx.executor, + dedup=None, + refill_settings=None, + dedup_settings=None, + ) + return await self.run(owner, owner_ctx, demand, isolated_next_page, **plan.params) + + async def _run_plan_owners( + self, + *, + plan: SlotsPlan, + owners: List[Any], + owner_max_demand: Dict[int, int], + dedup_active: bool, + cursor: CursorMap, + ) -> tuple[Dict[int, List[Any]], Dict[int, FeedResult]]: + owner_buffers: Dict[int, List[Any]] = {id(o): [] for o in owners} + owner_results: Dict[int, FeedResult] = {} + + ops: List[tuple[Any, int]] = [] + for owner in owners: + if plan.owner_fetch_limits is not None and id(owner) in plan.owner_fetch_limits: + demand = int(plan.owner_fetch_limits[id(owner)]) + else: + demand = min(plan.limit, int(owner_max_demand.get(id(owner), 0))) + if demand > 0: + ops.append((owner, demand)) + + if not ops: + return owner_buffers, owner_results + + results = await self.gather( + *[ + self._run_owner( + plan=plan, + owner=owner, + demand=demand, + base_next_page=plan.next_page, + dedup_active=dedup_active, + ) + for owner, demand in ops + ] + ) + for (owner, _demand), owner_result in zip(ops, results): + owner_results[id(owner)] = owner_result + owner_buffers[id(owner)] = list(owner_result.data) + cursor.merge_delta( + base_next_page=plan.next_page, + owner_next_page=owner_result.next_page, + ) + + return owner_buffers, owner_results + + async def _arbitrate_owner_buffers( + self, + *, + owners: List[Any], + owner_index: Dict[int, int], + owner_buffers: Dict[int, List[Any]], + owner_results: Dict[int, FeedResult], + dedup_policy: Any, + ) -> tuple[Dict[int, List[Any]], Dict[int, FeedResult]]: + owner_buffers = await dedup_policy.arbitrate_owner_buffers( + owners=owners, + owner_buffers=owner_buffers, + owner_rank=owner_index, + ) + + for owner in owners: + owner_id = id(owner) + if owner_id not in owner_results: + continue + old = owner_results[owner_id] + owner_results[owner_id] = FeedResult( + data=list(owner_buffers.get(owner_id, [])), + next_page=old.next_page, + has_next_page=old.has_next_page, + ) + + return owner_buffers, owner_results + + def _compute_slot_deficits( + self, + *, + plan: SlotsPlan, + owner_buffers: Dict[int, List[Any]], + ) -> Dict[int, int]: + total_max = sum(int(s.max_count) for s in plan.slots) + quota_schedule = total_max <= int(plan.limit) + + consumed: Dict[int, int] = {} + remaining = int(plan.limit) + deficit_slots: List[int] = [] + + for slot in plan.slots: + if remaining <= 0: + break + + owner_id = id(slot.owner) + want = min(int(slot.max_count), remaining) + if want <= 0: + continue + + have_total = len(owner_buffers.get(owner_id, [])) + already = 
int(consumed.get(owner_id, 0)) + available = max(0, have_total - already) + take = min(want, available) + if take < want: + deficit_slots.append(owner_id) + consumed[owner_id] = already + take + remaining -= take + + page_underfilled = remaining > 0 + + if not quota_schedule and not page_underfilled: + return {} + + deficits: Dict[int, int] = {} + + if quota_schedule: + return self._compute_quota_deficits(plan=plan, owner_buffers=owner_buffers) + + return self._compute_fill_deficits( + plan=plan, + remaining=remaining, + deficit_slots=deficit_slots, + ) + + def _compute_quota_deficits( + self, + *, + plan: SlotsPlan, + owner_buffers: Dict[int, List[Any]], + ) -> Dict[int, int]: + deficits: Dict[int, int] = {} + remaining = int(plan.limit) + consumed: Dict[int, int] = {} + for slot in plan.slots: + if remaining <= 0: + break + + owner_id = id(slot.owner) + want = min(int(slot.max_count), remaining) + if want <= 0: + continue + + have_total = len(owner_buffers.get(owner_id, [])) + already = int(consumed.get(owner_id, 0)) + available = max(0, have_total - already) + take = min(want, available) + missing = max(0, want - take) + if missing: + deficits[owner_id] = deficits.get(owner_id, 0) + missing + consumed[owner_id] = already + take + remaining -= take + + return deficits + + def _compute_fill_deficits( + self, + *, + plan: SlotsPlan, + remaining: int, + deficit_slots: List[int], + ) -> Dict[int, int]: + deficits: Dict[int, int] = {} + to_fill = int(remaining) + if to_fill <= 0: + return deficits + + if deficit_slots: + deficits[deficit_slots[-1]] = deficits.get(deficit_slots[-1], 0) + to_fill + return deficits + + if plan.slots: + last_owner_id = id(plan.slots[-1].owner) + deficits[last_owner_id] = deficits.get(last_owner_id, 0) + to_fill + return deficits + + return deficits + + async def _refill_deficits( + self, + *, + plan: SlotsPlan, + deficits: Dict[int, int], + owners: List[Any], + owner_index: Dict[int, int], + owner_buffers: Dict[int, List[Any]], + owner_results: Dict[int, FeedResult], + dedup_policy: Any, + refill_settings: Any, + cursor: CursorMap, + ) -> None: + overfetch_factor = max(1, int(getattr(refill_settings, "overfetch_factor", 1))) + max_refill_loops = max(1, int(getattr(refill_settings, "max_refill_loops", 20))) + + deficit_owners: List[Any] = [o for o in owners if id(o) in deficits] + deficit_owners = sorted( + deficit_owners, + key=lambda o: ( + int(getattr(o, "dedup_priority", 0)), + owner_index.get(id(o), 0), + ), + ) + + state: Dict[int, Dict[str, Any]] = {} + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + missing_total = int(deficits.get(refill_owner_id, 0)) + if missing_total <= 0: + continue + + base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page + state[refill_owner_id] = { + "owner": refill_owner, + "missing_total": missing_total, + "remaining": int(missing_total), + "accepted": [], + "loops": 0, + "current_next_page": base_np, + "has_next_page": True, + "last_result": None, + "last_request_limit": 0, + "last_can_overfetch": False, + "last_base_next_page": base_np, + } + + if not state: + return + + while True: + wave_ops: List[Tuple[Any, int, FeedResultNextPage, int, bool]] = [] + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + if owner_state["remaining"] <= 0: + continue + if not owner_state["has_next_page"]: + continue + if owner_state["loops"] >= max_refill_loops: + 
continue + + base_np = owner_state["current_next_page"] + request_limit = max(1, int(owner_state["remaining"])) + can_overfetch = CursorMap.can_overfetch(node=refill_owner, base_next_page=base_np) + if can_overfetch and overfetch_factor > 1: + request_limit = max(1, int(owner_state["remaining"]) * overfetch_factor) + + wave_ops.append((refill_owner, refill_owner_id, base_np, request_limit, can_overfetch)) + + if not wave_ops: + break + + results = await self.gather( + *[ + self._run_owner( + plan=plan, + owner=owner, + demand=request_limit, + base_next_page=base_np, + dedup_active=True, + ) + for owner, _owner_id, base_np, request_limit, _can_overfetch in wave_ops + ] + ) + + for (owner, owner_id, base_np, request_limit, can_overfetch), result in zip(wave_ops, results): + owner_state = state[owner_id] + owner_state["last_result"] = result + owner_state["last_request_limit"] = request_limit + owner_state["last_can_overfetch"] = can_overfetch + owner_state["last_base_next_page"] = base_np + owner_state["current_next_page"] = result.next_page + owner_state["has_next_page"] = bool(result.has_next_page) + + cursor.merge_delta( + base_next_page=plan.next_page, + owner_next_page=result.next_page, + ) + + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + if owner_state["remaining"] <= 0: + continue + last_result = owner_state["last_result"] + if last_result is None: + continue + + refill_prio = int(getattr(refill_owner, "dedup_priority", 0)) + wave_accepted, inspected_count = await dedup_policy.accept_batch( + items=list(last_result.data), + priority=refill_prio, + limit=int(owner_state["remaining"]), + ) + + if owner_state["last_can_overfetch"] and owner_state["last_request_limit"] > owner_state["remaining"]: + CursorMap.rewind_overfetch( + node=refill_owner, + base_next_page=owner_state["last_base_next_page"], + result_next_page=owner_state["current_next_page"], + inspected_count=inspected_count, + batch_size=len(last_result.data), + ) + + if wave_accepted: + owner_state["accepted"].extend(wave_accepted) + owner_state["remaining"] = int(owner_state["missing_total"]) - len(owner_state["accepted"]) + + if owner_state["remaining"] > 0 and owner_state["has_next_page"]: + owner_state["loops"] += 1 + + owner_state["last_result"] = None + + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + + accepted = owner_state["accepted"] + if accepted: + owner_buffers.setdefault(refill_owner_id, []) + owner_buffers[refill_owner_id].extend(accepted) + + owner_results[refill_owner_id] = FeedResult( + data=list(owner_buffers.get(refill_owner_id, [])), + next_page=owner_state["current_next_page"], + has_next_page=owner_state["has_next_page"], + ) + + def _consume_slots(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> List[Any]: + output: List[Any] = [] + for slot in plan.slots: + if len(output) >= plan.limit: + break + + remaining = plan.limit - len(output) + take = min(int(slot.max_count), remaining) + if take <= 0: + continue + + owner_buffer = owner_buffers.get(id(slot.owner), []) + if not owner_buffer: + continue + + chunk = owner_buffer[:take] + del owner_buffer[: len(chunk)] + output.extend(chunk) + + return output + + async def _run_node_with_dedup_refill( + self, + node: BaseFeedConfigModel, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + params: Dict[str, Any], 
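+        # First-page result already fetched by run() for this node; it is deduplicated
+        # below and, if it falls short of `limit`, topped up by the refill loop.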
+ initial_result: FeedResult, + ) -> FeedResult: + dedup = getattr(ctx, "dedup", None) + if dedup is None: + return initial_result + + settings = getattr(ctx, "refill_settings", None) or getattr(ctx, "dedup_settings", None) + overfetch_factor = max(1, int(getattr(settings, "overfetch_factor", 1))) + max_refill_loops = max(1, int(getattr(settings, "max_refill_loops", 20))) + priority = int(getattr(node, "dedup_priority", 0)) + + collected: List[Any] = [] + remaining = int(limit) + loops = 0 + + current_result = initial_result + current_next_page = current_result.next_page + current_request_limit = max(1, remaining) + has_next_page = bool(current_result.has_next_page) + base_next_page = next_page + + while remaining > 0: + can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) + + accepted, inspected_count = await dedup.accept_batch( + items=list(current_result.data), + priority=priority, + limit=remaining, + ) + + if can_overfetch and current_request_limit > remaining: + CursorMap.rewind_overfetch( + node=node, + base_next_page=base_next_page, + result_next_page=current_next_page, + inspected_count=inspected_count, + batch_size=len(current_result.data), + ) + + if accepted: + collected.extend(accepted) + remaining = limit - len(collected) + + if remaining <= 0 or not has_next_page or loops >= max_refill_loops: + break + loops += 1 + + base_next_page = current_next_page + next_request_limit = max(1, remaining) + can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) + if can_overfetch and overfetch_factor > 1: + next_request_limit = max(1, remaining * overfetch_factor) + + current_result, _plan = await self._run_node_raw( + node, + ctx, + next_request_limit, + base_next_page, + params, + ) + current_next_page = current_result.next_page + current_request_limit = next_request_limit + has_next_page = bool(current_result.has_next_page) + + return FeedResult( + data=collected, + next_page=current_next_page, + has_next_page=has_next_page, + ) + + +__all__ = [ + "Executor", + "Plan", + "CallablePlan", + "SlotSpec", + "SlotsPlan", +] diff --git a/smartfeed/execution/plans.py b/smartfeed/execution/plans.py new file mode 100644 index 0000000..8a23a29 --- /dev/null +++ b/smartfeed/execution/plans.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Awaitable, Callable, Dict, List, Optional, Protocol + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage +from .context import ExecutionContext + + +class Plan(Protocol): + """Declarative execution plan. + + Plans describe what to run; the `Executor` is responsible for interpreting + and executing them. + """ + + +@dataclass(frozen=True) +class CallablePlan: + """A plan implemented as an async callable. + + Useful for mergers whose child limits depend on previous child results. + """ + + fn: Callable[["Executor"], Awaitable[FeedResult]] + + +@dataclass(frozen=True) +class SlotSpec: + """A slot segment owned by a child node. + + Output order is defined by the sequence of SlotSpecs. + """ + + owner: BaseFeedConfigModel + max_count: int + + +@dataclass(frozen=True) +class SlotsPlan: + """Plan expressed as slot ownership + an assembly function. + + The executor will fetch children (possibly in priority order) and then assemble + results in the slot schedule order. 
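+
+    A minimal, illustrative sketch (child_a, child_b, ctx and executor are assumed
+    to be two existing feed nodes, an ExecutionContext and an Executor; the assembly
+    callback shown here simply returns the slot output unchanged):
+
+        plan = SlotsPlan(
+            ctx=ctx,
+            limit=10,
+            next_page=FeedResultNextPage(data={}),
+            params={},
+            slots=[
+                SlotSpec(owner=child_a, max_count=6),
+                SlotSpec(owner=child_b, max_count=4),
+            ],
+            assemble=lambda items, np, owner_results: FeedResult(
+                data=items,
+                next_page=np,
+                has_next_page=any(r.has_next_page for r in owner_results.values()),
+            ),
+        )
+        result = await executor.execute_plan(plan)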
+ """ + + ctx: ExecutionContext + limit: int + next_page: FeedResultNextPage + params: Dict[str, Any] + slots: List[SlotSpec] + assemble: Callable[[List[Any], FeedResultNextPage, Dict[int, FeedResult]], Any] + owner_fetch_limits: Optional[Dict[int, int]] = None + + +# NOTE: `Executor` is imported only for typing to avoid an import cycle. +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .executor import Executor diff --git a/smartfeed/feed_models.py b/smartfeed/feed_models.py index b98f11f..50abcbe 100644 --- a/smartfeed/feed_models.py +++ b/smartfeed/feed_models.py @@ -3,13 +3,16 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from random import shuffle -from typing import Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast import redis from pydantic import BaseModel from redis.asyncio import Redis as AsyncRedis from redis.asyncio import RedisCluster as AsyncRedisCluster +if TYPE_CHECKING: + from .execution.context import ExecutionContext + def _is_async_redis_client(client: Any) -> bool: return isinstance(client, (AsyncRedis, AsyncRedisCluster)) @@ -80,6 +83,7 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional["ExecutionContext"] = None, **params: Any, ) -> FeedResult: """Fetch data according to this node config.""" @@ -114,8 +118,14 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional["ExecutionContext"] = None, **params: Any, ) -> FeedResult: + if ctx is None: + from .execution.context import ExecutionContext as _ExecutionContext + + ctx = _ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + subfeed_next_page = FeedResultNextPageInside( page=next_page.data[self.subfeed_id].page if self.subfeed_id in next_page.data else 1, after=next_page.data[self.subfeed_id].after if self.subfeed_id in next_page.data else None, diff --git a/smartfeed/manager.py b/smartfeed/manager.py index 5ef6eb1..f42e95f 100644 --- a/smartfeed/manager.py +++ b/smartfeed/manager.py @@ -3,6 +3,8 @@ import redis from redis.asyncio import Redis as AsyncRedis +from .execution.context import ExecutionContext +from .execution.executor import Executor from .schemas import FeedConfig, FeedResult, FeedResultNextPage @@ -39,12 +41,7 @@ async def get_data(self, user_id: Any, limit: int, next_page: FeedResultNextPage :return: результат получения данных согласно конфигурации фида. 
""" - result = await self.feed_config.feed.get_data( - methods_dict=self.methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=self.redis_client, - **params, - ) + ctx = ExecutionContext(methods_dict=self.methods_dict, user_id=user_id, redis_client=self.redis_client) + ctx.executor = Executor() + result = await ctx.executor.run(self.feed_config.feed, ctx, limit, next_page, **params) return result diff --git a/smartfeed/mergers/append.py b/smartfeed/mergers/append.py index bfe0718..e4fc609 100644 --- a/smartfeed/mergers/append.py +++ b/smartfeed/mergers/append.py @@ -1,14 +1,14 @@ from __future__ import annotations -import asyncio -from collections import defaultdict from random import shuffle -from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union, cast import redis from redis.asyncio import Redis as AsyncRedis -from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, _pydantic_deep_copy +from ..execution.context import ExecutionContext +from ..execution.executor import SlotSpec, SlotsPlan +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage if TYPE_CHECKING: from ..schemas import FeedTypes @@ -22,6 +22,34 @@ class MergerAppend(BaseFeedConfigModel): items: List[FeedTypes] shuffle: bool = False + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> SlotsPlan: + slots = [SlotSpec(owner=cast(BaseFeedConfigModel, item), max_count=limit) for item in self.items] + + def _assemble( + output: List[Any], merged_next_page: FeedResultNextPage, owner_results: Dict[int, FeedResult] + ) -> FeedResult: + has_next_page = any(r.has_next_page for r in owner_results.values()) + result = FeedResult(data=output, next_page=merged_next_page, has_next_page=has_next_page) + if self.shuffle: + shuffle(result.data) + return result + + return SlotsPlan( + ctx=ctx, + limit=limit, + next_page=next_page, + params=dict(params), + slots=slots, + assemble=_assemble, + ) + async def get_data( self, methods_dict: Dict[str, Callable], @@ -29,75 +57,15 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - - if dedup_active: - indexed_items = list(enumerate(self.items)) - fetched: Dict[int, FeedResult] = {} - - groups: Dict[int, List[tuple[int, "FeedTypes"]]] = defaultdict(list) - for idx, item in indexed_items: - prio = int(getattr(item, "dedup_priority", 0)) - groups[prio].append((idx, item)) - - for prio in sorted(groups.keys(), reverse=True): - group = groups[prio] - coros: List[Awaitable[FeedResult]] = [] - order: List[int] = [] - for idx, item in group: - order.append(idx) - coros.append( - item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=_pydantic_deep_copy(next_page), - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - ) - group_results = await asyncio.gather(*coros) - for idx, r in zip(order, group_results): - fetched[idx] = cast(FeedResult, r) - - for idx, _item in indexed_items: - item_result = fetched[idx] - result.data.extend(item_result.data) - 
result.next_page.data.update(item_result.next_page.data) - if item_result.has_next_page: - result.has_next_page = True - - if len(result.data) > limit: - result.data = result.data[:limit] - else: - result_limit = limit - for item in self.items: - item_result = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=result_limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - - result.data.extend(item_result.data) - result_limit -= len(item_result.data) - - if not result.has_next_page and item_result.has_next_page: - result.has_next_page = True - - result.next_page.data.update(item_result.next_page.data) + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - if result_limit <= 0: - break + if ctx.executor is None: + from ..execution.executor import Executor - if self.shuffle: - shuffle(result.data) + ctx.executor = Executor() - return result + return await ctx.executor.run(self, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/append_distribute.py b/smartfeed/mergers/append_distribute.py index 5f07d66..442ee00 100644 --- a/smartfeed/mergers/append_distribute.py +++ b/smartfeed/mergers/append_distribute.py @@ -7,6 +7,8 @@ from redis.asyncio import Redis as AsyncRedis from typing_extensions import no_type_check +from ..execution.context import ExecutionContext +from ..execution.executor import SlotSpec, SlotsPlan from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage if TYPE_CHECKING: @@ -46,6 +48,32 @@ async def _uniform_distribute(self, data: list) -> list: return result + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> SlotsPlan: + slots = [SlotSpec(owner=item, max_count=limit) for item in self.items] + + async def _assemble( + output: List[Any], merged_next_page: FeedResultNextPage, owner_results: Dict[int, FeedResult] + ) -> FeedResult: + has_next_page = any(r.has_next_page for r in owner_results.values()) + distributed = await self._uniform_distribute(output) + return FeedResult(data=distributed, next_page=merged_next_page, has_next_page=has_next_page) + + return SlotsPlan( + ctx=ctx, + limit=limit, + next_page=next_page, + params=dict(params), + slots=slots, + assemble=_assemble, + ) + async def get_data( self, methods_dict: Dict[str, Callable], @@ -53,59 +81,15 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) - - if dedup_active: - indexed_items = list(enumerate(self.items)) - fetch_order = sorted(indexed_items, key=lambda p: (getattr(p[1], "dedup_priority", 0), -p[0]), reverse=True) - fetched: Dict[int, FeedResult] = {} - - for idx, item in fetch_order: - fetched[idx] = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - - for idx, _item in indexed_items: - item_result = fetched[idx] - result.data.extend(item_result.data) - result.next_page.data.update(item_result.next_page.data) - if item_result.has_next_page: - result.has_next_page = True - - if len(result.data) > limit: - result.data = result.data[:limit] - else: - result_limit = limit - 
for item in self.items: - item_result = await item.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=result_limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - - result.data.extend(item_result.data) - result_limit -= len(item_result.data) - - if not result.has_next_page and item_result.has_next_page: - result.has_next_page = True - - result.next_page.data.update(item_result.next_page.data) - - if result_limit <= 0: - break - - result.data = await self._uniform_distribute(result.data) - return result + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + + if ctx.executor is None: + from ..execution.executor import Executor + + ctx.executor = Executor() + + return await ctx.executor.run(self, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/deduplication.py b/smartfeed/mergers/deduplication.py index a9902a5..94c9841 100644 --- a/smartfeed/mergers/deduplication.py +++ b/smartfeed/mergers/deduplication.py @@ -1,120 +1,28 @@ from __future__ import annotations -import asyncio -import base64 -import inspect -import zlib -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, Iterator, List, Literal, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Optional, Union import redis from pydantic import PrivateAttr, model_validator from redis.asyncio import Redis as AsyncRedis -from .. import jsonlib as json +from ..execution.context import ExecutionContext, RefillExecutionSettings +from ..execution.cursors import CursorMap +from ..execution.executor import CallablePlan from ..feed_models import ( BaseFeedConfigModel, FeedResult, - FeedResultClient, FeedResultNextPage, FeedResultNextPageInside, - SubFeed, - _is_async_redis_client, _pydantic_deep_copy, - _redis_call, ) +from ..policies.dedup import DeduplicationPolicy +from ..policies.seen_store import CursorSeenStore, RedisSeenStore, SeenStore if TYPE_CHECKING: from ..schemas import FeedTypes -class _DedupState(ABC): - @abstractmethod - def should_accept(self, key: str, priority: int) -> bool: - raise NotImplementedError - - @abstractmethod - def record(self, key: str, priority: int) -> None: - raise NotImplementedError - - async def prefetch(self, keys: List[str]) -> None: - return - - -@dataclass -class _CursorDedupState(_DedupState): - seen_priority_map: Dict[str, int] - seen_updates_in_order: List[tuple[str, int]] - seen_request_set: set[str] - - def should_accept(self, key: str, priority: int) -> bool: - if key in self.seen_request_set: - return False - existing_priority = self.seen_priority_map.get(key) - if existing_priority is not None and priority <= existing_priority: - return False - return True - - def record(self, key: str, priority: int) -> None: - self.seen_priority_map[key] = priority - self.seen_updates_in_order.append((key, priority)) - self.seen_request_set.add(key) - - -@dataclass -class _RedisDedupState(_DedupState): - redis_client: Union[redis.Redis, AsyncRedis] - redis_state_key: str - redis_seen_cache: Dict[str, Optional[int]] - redis_new_scores: Dict[str, int] - seen_request_set: set[str] - zmscore: Callable[ - [Union[redis.Redis, AsyncRedis], str, List[str]], - Union[Awaitable[List[Optional[float]]], List[Optional[float]]], - ] - - async def prefetch(self, keys: List[str]) -> None: - if not keys: - return - unique: List[str] = [] - seen: set[str] = set() - for k in keys: - if k in 
self.seen_request_set: - continue - if k in self.redis_seen_cache: - continue - if k in seen: - continue - seen.add(k) - unique.append(k) - - if not unique: - return - - scores_result = self.zmscore(self.redis_client, self.redis_state_key, unique) - if inspect.iscoroutine(scores_result): - scores = await cast(Awaitable[List[Optional[float]]], scores_result) - else: - scores = cast(List[Optional[float]], scores_result) - - for k, s in zip(unique, scores): - self.redis_seen_cache[k] = None if s is None else int(s) - - def should_accept(self, key: str, priority: int) -> bool: - if key in self.seen_request_set: - return False - existing_priority = self.redis_seen_cache.get(key) - if existing_priority is not None and priority <= existing_priority: - return False - return True - - def record(self, key: str, priority: int) -> None: - self.seen_request_set.add(key) - self.redis_seen_cache[key] = priority - self.redis_new_scores[key] = max(self.redis_new_scores.get(key, 0), priority) - - class MergerDeduplication(BaseFeedConfigModel): """Merger that deduplicates while preserving child mixing/position semantics.""" @@ -189,264 +97,7 @@ def _get_descendant_cursor_keys_cached(self) -> set[str]: def _reset_descendant_cursors(self, next_page: FeedResultNextPage) -> None: descendant_keys = self._get_descendant_cursor_keys_cached() - for key in descendant_keys: - next_page.data.pop(key, None) - - def _normalize_key(self, value: Any) -> str: - if isinstance(value, (str, int)): - return str(value) - if isinstance(value, (dict, list)): - return json.dumps(value, sort_keys=True, default=str) - return str(value) - - def _extract_dedup_value(self, item: Any) -> Any: - if not self.dedup_key: - return item - - try: - value = item.get(self.dedup_key) - except AttributeError: - value = getattr(item, self.dedup_key, None) - - if value is None and self.missing_key_policy == "error": - raise AssertionError(f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}") - return value - - def _get_entity_key(self, entity: Any) -> Optional[str]: - raw_value = self._extract_dedup_value(entity) - if raw_value is None: - if self.missing_key_policy == "drop": - return None - if self.missing_key_policy == "keep": - raw_value = ("__missing__", id(entity)) - return self._normalize_key(raw_value) - - def _compute_overfetch_params(self, *, remaining: int, next_after: Any) -> tuple[bool, int, Optional[int]]: - can_overfetch = isinstance(next_after, int) - request_limit = max(1, remaining) - if can_overfetch and self.overfetch_factor > 1: - request_limit = max(1, remaining * self.overfetch_factor) - start_after: Optional[int] = int(next_after) if can_overfetch else None - return can_overfetch, request_limit, start_after - - def _iter_subfeeds(self, feed: BaseFeedConfigModel) -> Iterator[SubFeed]: - if isinstance(feed, SubFeed): - yield feed - return - - for attr_name in ("data", "positional", "default"): - inner = getattr(feed, attr_name, None) - if isinstance(inner, BaseFeedConfigModel): - yield from self._iter_subfeeds(inner) - - for attr_name in ("item_from", "item_to"): - wrapper = getattr(feed, attr_name, None) - inner = getattr(wrapper, "data", None) - if isinstance(inner, BaseFeedConfigModel): - yield from self._iter_subfeeds(inner) - - items = getattr(feed, "items", None) - if isinstance(items, list): - for item in items: - if isinstance(item, BaseFeedConfigModel): - yield from self._iter_subfeeds(item) - continue - inner = getattr(item, "data", None) - if isinstance(inner, BaseFeedConfigModel): - yield from 
self._iter_subfeeds(inner) - - def _register_wrapped_subfeed_method( - self, - *, - subfeed: SubFeed, - original_methods_dict: Dict[str, Callable], - rewritten_methods_dict: Dict[str, Callable], - dedup_state: _DedupState, - ) -> None: - original_name = subfeed.method_name - original_method = original_methods_dict[original_name] - unique_name = f"__dedup__{self.merger_id}__{subfeed.subfeed_id}" - - if unique_name in rewritten_methods_dict: - subfeed.method_name = unique_name - return - - subfeed.method_name = unique_name - leaf_priority = int(getattr(subfeed, "dedup_priority", 0)) - - wrapped = self._make_wrapped_leaf_method( - original_method=original_method, - dedup_state=dedup_state, - leaf_priority=leaf_priority, - ) - setattr(wrapped, "_smartfeed_original", original_method) - rewritten_methods_dict[unique_name] = wrapped - - def _make_wrapped_leaf_method( - self, - *, - original_method: Callable, - dedup_state: _DedupState, - leaf_priority: int, - ) -> Callable: - async def _wrapped_method( - user_id: Any, - limit: int, - next_page: FeedResultNextPageInside, - **kw: Any, - ) -> FeedResultClient: - collected: List[Any] = [] - upstream_has_next_page = False - - loops = 0 - while len(collected) < limit and loops < self.max_refill_loops: - loops += 1 - before_len = len(collected) - - remaining = limit - len(collected) - can_overfetch, request_limit, start_after = self._compute_overfetch_params( - remaining=remaining, - next_after=next_page.after, - ) - - method_result = await original_method(user_id=user_id, limit=request_limit, next_page=next_page, **kw) - if not isinstance(method_result, FeedResultClient): - raise TypeError('SubFeed function must return "FeedResultClient" instance.') - - upstream_has_next_page = upstream_has_next_page or method_result.has_next_page - - inspected_count = 0 - - keys_by_index: Optional[List[Optional[str]]] = None - if isinstance(dedup_state, _RedisDedupState): - keys_by_index = [] - batch_keys: List[str] = [] - for entity in method_result.data: - key = self._get_entity_key(entity) - keys_by_index.append(key) - if key is not None: - batch_keys.append(key) - await dedup_state.prefetch(batch_keys) - - for idx, entity in enumerate(method_result.data, start=1): - inspected_count = idx - - key = keys_by_index[idx - 1] if keys_by_index is not None else self._get_entity_key(entity) - if key is None: - continue - - if not dedup_state.should_accept(key, leaf_priority): - continue - - collected.append(entity) - dedup_state.record(key, leaf_priority) - - if len(collected) >= limit: - break - - if len(collected) == before_len: - if not method_result.has_next_page: - break - - if can_overfetch and request_limit > remaining and start_after is not None: - end_after = next_page.after - if isinstance(end_after, int) and end_after == start_after + len(method_result.data): - next_page.after = start_after + inspected_count - - return FeedResultClient(data=collected, next_page=next_page, has_next_page=upstream_has_next_page) - - return _wrapped_method - - def _decode_seen_from_cursor(self, next_page: FeedResultNextPage) -> Dict[str, int]: - entry = next_page.data.get(self.merger_id) - if not entry or entry.after is None: - return {} - - after = entry.after - if isinstance(after, dict) and "z" in after: - payload = base64.urlsafe_b64decode(after["z"].encode()) - raw = zlib.decompress(payload).decode() - decoded = json.loads(raw) - if isinstance(decoded, dict): - return {str(k): int(v) for k, v in decoded.items()} - if isinstance(decoded, list): - seen_map: Dict[str, int] = {} - 
for entry_item in decoded: - if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: - seen_map[str(entry_item[0])] = int(entry_item[1]) - else: - seen_map[str(entry_item)] = 0 - return seen_map - return {} - if isinstance(after, dict) and "seen" in after: - return {str(k): 0 for k in list(after["seen"])} - if isinstance(after, list): - return {str(k): 0 for k in list(after)} - if isinstance(after, dict): - return {str(k): int(v) for k, v in after.items() if k not in {"v", "c", "n"}} - return {} - - def _encode_seen_for_cursor(self, seen_updates_in_order: List[tuple[str, int]]) -> Any: - if self.cursor_max_keys is not None: - seen_updates_in_order = seen_updates_in_order[-self.cursor_max_keys :] - - if not self.cursor_compress: - return {"v": 2, "seen": [[k, p] for k, p in seen_updates_in_order]} - - raw = json.dumps([[k, p] for k, p in seen_updates_in_order]).encode() - compressed = zlib.compress(raw) - return { - "v": 2, - "c": "zlib+base64", - "n": len(seen_updates_in_order), - "z": base64.urlsafe_b64encode(compressed).decode(), - } - - async def _redis_zmscore( - self, - redis_client: Union[redis.Redis, AsyncRedis], - key: str, - members: List[str], - ) -> List[Optional[float]]: - if not members: - return [] - - zmscore_fn = getattr(redis_client, "zmscore", None) - if zmscore_fn is not None: - res = zmscore_fn(key, members) - if inspect.iscoroutine(res): - res = await res - return [None if v is None else float(v) for v in list(res)] - - if not _is_async_redis_client(redis_client): - - def _sync_pipeline_execute() -> Any: - pipe = redis_client.pipeline() - for m in members: - pipe.zscore(key, m) - return pipe.execute() - - res = await asyncio.to_thread(_sync_pipeline_execute) - return [None if v is None else float(v) for v in list(res)] - - pipe = redis_client.pipeline() - for m in members: - pipe.zscore(key, m) - res = pipe.execute() - if inspect.iscoroutine(res): - res = await res - return [None if v is None else float(v) for v in list(res)] - - async def _redis_zadd_and_expire( - self, - redis_client: Union[redis.Redis, AsyncRedis], - key: str, - member_scores: Dict[str, int], - ) -> None: - if not member_scores: - return - await _redis_call(redis_client, "zadd", key, mapping={m: float(s) for m, s in member_scores.items()}) - await _redis_call(redis_client, "expire", key, self.state_ttl_seconds) + CursorMap(next_page).reset_keys(descendant_keys) def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: suffix = params.get("custom_deduplication_key") or params.get("custom_view_session_key") @@ -461,88 +112,102 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - if limit <= 0: - return FeedResult(data=[], next_page=next_page, has_next_page=False) + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + elif ctx.redis_client is None and redis_client is not None: + ctx.redis_client = redis_client + + if ctx.executor is None: + from ..execution.executor import Executor - entry = next_page.data.get(self.merger_id) - requested_page = entry.page if entry is not None else None - is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) + ctx.executor = Executor() - if self.state_backend == "redis" and not redis_client: - raise ValueError("Redis client must be provided if using MergerDeduplication with 
state_backend=redis") + return await ctx.executor.run(self, ctx, limit, next_page, **params) - working_next_page = _pydantic_deep_copy(next_page) + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> CallablePlan: + async def _run(executor: Any) -> FeedResult: + if limit <= 0: + return FeedResult(data=[], next_page=next_page, has_next_page=False) - if is_fresh_session: - self._reset_descendant_cursors(working_next_page) + if ctx.executor is None: + ctx.executor = executor - seen_priority_map: Dict[str, int] = {} - seen_updates_in_order: List[tuple[str, int]] = [] - if self.state_backend == "cursor" and not is_fresh_session: - seen_priority_map = self._decode_seen_from_cursor(next_page) + entry = next_page.data.get(self.merger_id) + requested_page = entry.page if entry is not None else None + is_fresh_session = requested_page is None or (isinstance(requested_page, int) and requested_page <= 0) - seen_request_set: set[str] = set(seen_priority_map.keys()) + redis_client = ctx.redis_client + if self.state_backend == "redis" and not redis_client: + raise ValueError("Redis client must be provided if using MergerDeduplication with state_backend=redis") - redis_state_key = "" - redis_new_scores: Dict[str, int] = {} - redis_seen_cache: Dict[str, Optional[int]] = {} - if self.state_backend == "redis" and redis_client: - redis_state_key = self._build_redis_state_key(user_id=user_id, params=params) + working_next_page = _pydantic_deep_copy(next_page) if is_fresh_session: - await _redis_call(redis_client, "delete", redis_state_key) + self._reset_descendant_cursors(working_next_page) + + seen_request_set: set[str] = set() + store: SeenStore + if self.state_backend == "cursor": + cursor_entry = next_page.data.get(self.merger_id) + store = CursorSeenStore.from_after( + cursor_entry.after if (cursor_entry is not None and not is_fresh_session) else None, + cursor_compress=self.cursor_compress, + cursor_max_keys=self.cursor_max_keys, + ) + else: + assert redis_client is not None + redis_state_key = self._build_redis_state_key(user_id=ctx.user_id, params=params) + store = RedisSeenStore.create( + redis_client=redis_client, + redis_key=redis_state_key, + ttl_seconds=self.state_ttl_seconds, + ) + if is_fresh_session: + await store.reset() - if self.state_backend == "cursor": - dedup_state: _DedupState = _CursorDedupState( - seen_priority_map=seen_priority_map, - seen_updates_in_order=seen_updates_in_order, + policy = DeduplicationPolicy( + dedup_key=self.dedup_key, + missing_key_policy=self.missing_key_policy, + store=store, seen_request_set=seen_request_set, ) - else: - assert redis_client is not None - dedup_state = _RedisDedupState( - redis_client=redis_client, - redis_state_key=redis_state_key, - redis_seen_cache=redis_seen_cache, - redis_new_scores=redis_new_scores, - seen_request_set=seen_request_set, - zmscore=self._redis_zmscore, - ) - - original_methods_dict = methods_dict - child = _pydantic_deep_copy(self.data) - - rewritten_methods_dict = dict(original_methods_dict) - - for sf in self._iter_subfeeds(child): - self._register_wrapped_subfeed_method( - subfeed=sf, - original_methods_dict=original_methods_dict, - rewritten_methods_dict=rewritten_methods_dict, - dedup_state=dedup_state, + refill_settings = RefillExecutionSettings( + overfetch_factor=self.overfetch_factor, + max_refill_loops=self.max_refill_loops, + ) + child_ctx = ExecutionContext( + methods_dict=ctx.methods_dict, + user_id=ctx.user_id, + 
redis_client=ctx.redis_client, + executor=ctx.executor, + dedup=policy, + refill_settings=refill_settings, + dedup_settings=refill_settings, ) - child_result = await child.get_data( - methods_dict=rewritten_methods_dict, - user_id=user_id, - limit=limit, - next_page=working_next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - - if self.state_backend == "redis" and redis_client: - await self._redis_zadd_and_expire(redis_client, redis_state_key, redis_new_scores) + child = _pydantic_deep_copy(self.data) + child_result = await executor.run(child, child_ctx, limit, working_next_page, **params) - page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - merger_after: Any = None - if self.state_backend == "cursor": - merger_after = self._encode_seen_for_cursor(seen_updates_in_order) + commit_result: Any = await store.commit() + merger_after: Any = commit_result if self.state_backend == "cursor" else None - result_next_page = _pydantic_deep_copy(child_result.next_page) - result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + result_next_page = _pydantic_deep_copy(child_result.next_page) + result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=merger_after) + return FeedResult( + data=child_result.data, + next_page=result_next_page, + has_next_page=child_result.has_next_page, + ) - return FeedResult(data=child_result.data, next_page=result_next_page, has_next_page=child_result.has_next_page) + return CallablePlan(fn=_run) diff --git a/smartfeed/mergers/percentage.py b/smartfeed/mergers/percentage.py index 76982fb..1da51ea 100644 --- a/smartfeed/mergers/percentage.py +++ b/smartfeed/mergers/percentage.py @@ -7,6 +7,8 @@ from pydantic import BaseModel from redis.asyncio import Redis as AsyncRedis +from ..execution.context import ExecutionContext +from ..execution.executor import SlotSpec, SlotsPlan from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage if TYPE_CHECKING: @@ -61,51 +63,61 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - result = FeedResult(data=[], next_page=FeedResultNextPage(data={}), has_next_page=False) + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - dedup_active = bool(params.pop("_sf_dedup_active", False)) + if ctx.executor is None: + from ..execution.executor import Executor - items_data: List[List[Any]] = [[] for _ in self.items] - results: List[Optional[FeedResult]] = [None for _ in self.items] + ctx.executor = Executor() - indexed_items = list(enumerate(self.items)) - fetch_order = indexed_items - if dedup_active: - fetch_order = sorted( - indexed_items, - key=lambda p: (getattr(p[1].data, "dedup_priority", 0), -p[0]), - reverse=True, - ) - - for idx, item in fetch_order: - item_result = cast( - FeedResult, - await item.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit * item.percentage // 100, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ), - ) - - results[idx] = item_result - - for idx, result_item in enumerate(results): - assert result_item is not None - items_data[idx] = result_item.data - - if not result.has_next_page and 
result_item.has_next_page: - result.has_next_page = True - result.next_page.data.update(result_item.next_page.data) - - result.data = await self._merge_items_data(items_data=items_data) + return await ctx.executor.run(self, ctx, limit, next_page, **params) - if self.shuffle: - shuffle(result.data) - - return result + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> SlotsPlan: + owners: List[BaseFeedConfigModel] = [cast(BaseFeedConfigModel, item.data) for item in self.items] + + slots: List[SlotSpec] = [] + for item, owner in zip(self.items, owners): + child_limit = limit * int(item.percentage) // 100 + slots.append(SlotSpec(owner=owner, max_count=max(0, child_limit))) + + async def _assemble( + output: List[Any], + merged_next_page: FeedResultNextPage, + owner_results: Dict[int, FeedResult], + ) -> FeedResult: + items_data: List[List[Any]] = [] + has_next_page = False + + for owner in owners: + child_res = owner_results.get(id(owner)) + if child_res is None: + items_data.append([]) + continue + items_data.append(list(child_res.data)) + has_next_page = has_next_page or bool(child_res.has_next_page) + + data = await self._merge_items_data(items_data=items_data) + if self.shuffle: + shuffle(data) + + return FeedResult(data=data, next_page=merged_next_page, has_next_page=has_next_page) + + return SlotsPlan( + ctx=ctx, + limit=limit, + next_page=next_page, + params=dict(params), + slots=slots, + assemble=_assemble, + ) diff --git a/smartfeed/mergers/percentage_gradient.py b/smartfeed/mergers/percentage_gradient.py index 17ed530..460e9f0 100644 --- a/smartfeed/mergers/percentage_gradient.py +++ b/smartfeed/mergers/percentage_gradient.py @@ -1,10 +1,12 @@ from random import shuffle -from typing import Any, Callable, Dict, Literal, Optional, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast import redis from pydantic import model_validator from redis.asyncio import Redis as AsyncRedis +from ..execution.context import ExecutionContext +from ..execution.executor import SlotSpec, SlotsPlan from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside from .percentage import MergerPercentageItem @@ -28,7 +30,7 @@ def validate_merger_percentage_gradient(self) -> "MergerPercentageGradient": raise ValueError('"size_to_step" must be bigger than 1') return self - async def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: + def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: result: Dict = { "limit_from": 0, "limit_to": 0, @@ -74,91 +76,90 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - result = FeedResult( - data=[], - next_page=FeedResultNextPage( - data={ - self.merger_id: FeedResultNextPageInside( - page=next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1, - after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, - ) - }, - ), - has_next_page=False, - ) + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - limits_and_percents = await self._calculate_limits_and_percents( - page=result.next_page.data[self.merger_id].page, - limit=limit, + if ctx.executor is None: + from ..execution.executor import Executor + + ctx.executor = Executor() + + 
return await ctx.executor.run(self, ctx, limit, next_page, **params) + + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> SlotsPlan: + start_page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + start_after = next_page.data[self.merger_id].after if self.merger_id in next_page.data else None + + plan_next_page = FeedResultNextPage( + data={ + **next_page.data, + self.merger_id: FeedResultNextPageInside(page=start_page, after=start_after), + } ) - dedup_active = bool(params.pop("_sf_dedup_active", False)) - - from_priority = getattr(self.item_from.data, "dedup_priority", 0) - to_priority = getattr(self.item_to.data, "dedup_priority", 0) - - if dedup_active and to_priority > from_priority: - item_to = await self.item_to.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_to"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - item_from = await self.item_from.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_from"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - else: - item_from = await self.item_from.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_from"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - item_to = await self.item_to.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limits_and_percents["limit_to"], - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - - from_start_index = 0 - to_start_index = 0 - for lp_data in limits_and_percents["percentages"]: - from_end_index = (lp_data["limit"] * lp_data["from"] // 100) + from_start_index - to_end_index = (lp_data["limit"] * lp_data["to"] // 100) + to_start_index - - result.data.extend(item_from.data[from_start_index:from_end_index]) - result.data.extend(item_to.data[to_start_index:to_end_index]) - - from_start_index = from_end_index - to_start_index = to_end_index - - result.next_page.data.update(item_from.next_page.data) - result.next_page.data.update(item_to.next_page.data) - - if any([item_from.has_next_page, item_to.has_next_page]): - result.has_next_page = True - - if self.shuffle: - shuffle(result.data) - - result.next_page.data[self.merger_id].page += 1 + limits_and_percents = self._calculate_limits_and_percents(page=start_page, limit=limit) - return result + owner_from = cast(BaseFeedConfigModel, self.item_from.data) + owner_to = cast(BaseFeedConfigModel, self.item_to.data) + + slots = [ + SlotSpec(owner=owner_from, max_count=int(limits_and_percents["limit_from"])), + SlotSpec(owner=owner_to, max_count=int(limits_and_percents["limit_to"])), + ] + + async def _assemble( + output: List[Any], + merged_next_page: FeedResultNextPage, + owner_results: Dict[int, FeedResult], + ) -> FeedResult: + from_res = owner_results.get(id(owner_from)) + to_res = owner_results.get(id(owner_to)) + + from_data = list(from_res.data) if from_res is not None else [] + to_data = list(to_res.data) if to_res is not None else [] + + data: List[Any] = [] + from_start_index = 0 + to_start_index = 0 + for lp_data in limits_and_percents["percentages"]: + from_end_index = (lp_data["limit"] * lp_data["from"] // 100) + from_start_index + to_end_index = (lp_data["limit"] * lp_data["to"] // 100) + 
to_start_index + + data.extend(from_data[from_start_index:from_end_index]) + data.extend(to_data[to_start_index:to_end_index]) + + from_start_index = from_end_index + to_start_index = to_end_index + + has_next_page = False + if from_res is not None and from_res.has_next_page: + has_next_page = True + if to_res is not None and to_res.has_next_page: + has_next_page = True + + if self.shuffle: + shuffle(data) + + if self.merger_id in merged_next_page.data: + merged_next_page.data[self.merger_id].page += 1 + + return FeedResult(data=data, next_page=merged_next_page, has_next_page=has_next_page) + + return SlotsPlan( + ctx=ctx, + limit=limit, + next_page=plan_next_page, + params=dict(params), + slots=slots, + assemble=_assemble, + ) diff --git a/smartfeed/mergers/positional.py b/smartfeed/mergers/positional.py index a543ddb..bc3a095 100644 --- a/smartfeed/mergers/positional.py +++ b/smartfeed/mergers/positional.py @@ -6,6 +6,8 @@ from pydantic import model_validator from redis.asyncio import Redis as AsyncRedis +from ..execution.context import ExecutionContext +from ..execution.executor import SlotSpec, SlotsPlan from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside if TYPE_CHECKING: @@ -43,10 +45,27 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - dedup_active = bool(params.pop("_sf_dedup_active", False)) + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + if ctx.executor is None: + from ..execution.executor import Executor + + ctx.executor = Executor() + + return await ctx.executor.run(self, ctx, limit, next_page, **params) + + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> SlotsPlan: page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 positional_has_next_page = True @@ -66,70 +85,54 @@ async def get_data( if position in available_positions: page_positions.append(available_positions.index(position)) - if dedup_active and getattr(self.positional, "dedup_priority", 0) > getattr(self.default, "dedup_priority", 0): - pos_res = await self.positional.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=len(page_positions), - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - default_res = await self.default.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=True, - **params, - ) - else: - default_res = await self.default.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - pos_res = await self.positional.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=len(page_positions), - next_page=next_page, - redis_client=redis_client, - _sf_dedup_active=dedup_active, - **params, - ) - - result = FeedResult( - data=default_res.data, - next_page=FeedResultNextPage( - data={ - self.merger_id: FeedResultNextPageInside( - page=page, - after=next_page.data[self.merger_id].after if self.merger_id in next_page.data else None, - ) - }, - ), - has_next_page=default_res.has_next_page, + pos_limit = len(page_positions) + + # Build a slot ownership schedule by 
applying the same sequential insert + # semantics as the legacy assembly logic. + schedule: List[BaseFeedConfigModel] = [self.default for _ in range(limit)] + for insert_index in [p - 1 for p in page_positions[:pos_limit]]: + schedule.insert(insert_index, self.positional) + schedule = schedule[:limit] + + # Compress the schedule into contiguous segments. + slots: List[SlotSpec] = [] + if schedule: + current_owner = schedule[0] + count = 1 + for owner in schedule[1:]: + if owner is current_owner: + count += 1 + continue + slots.append(SlotSpec(owner=current_owner, max_count=count)) + current_owner = owner + count = 1 + slots.append(SlotSpec(owner=current_owner, max_count=count)) + + after = next_page.data[self.merger_id].after if self.merger_id in next_page.data else None + + def _assemble( + output: List[Any], merged_next_page: FeedResultNextPage, owner_results: Dict[int, FeedResult] + ) -> FeedResult: + default_res = owner_results.get(id(self.default)) + pos_res = owner_results.get(id(self.positional)) + + has_next_page = bool(default_res.has_next_page) if default_res is not None else False + if not has_next_page and positional_has_next_page and pos_res is not None and pos_res.has_next_page: + has_next_page = True + + result_next_page = merged_next_page + result_next_page.data[self.merger_id] = FeedResultNextPageInside(page=page + 1, after=after) + return FeedResult(data=output, next_page=result_next_page, has_next_page=has_next_page) + + return SlotsPlan( + ctx=ctx, + limit=limit, + next_page=next_page, + params=dict(params), + slots=slots, + owner_fetch_limits={ + id(self.default): limit, + id(self.positional): pos_limit, + }, + assemble=_assemble, ) - - if not result.has_next_page and all([positional_has_next_page, pos_res.has_next_page]): - result.has_next_page = True - - result.next_page.data.update(default_res.next_page.data) - result.next_page.data.update(pos_res.next_page.data) - - for i, post in enumerate(pos_res.data): - result.data = result.data[: page_positions[i] - 1] + [post] + result.data[page_positions[i] - 1 :] - - if len(result.data) > limit: - result.data = result.data[:limit] - - result.next_page.data[self.merger_id].page += 1 - - return result diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index cd3c2f8..b32fdf3 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -9,6 +9,8 @@ from redis.asyncio import RedisCluster as AsyncRedisCluster from .. 
import jsonlib as json +from ..execution.context import ExecutionContext +from ..execution.executor import CallablePlan from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside, _redis_call if TYPE_CHECKING: @@ -49,15 +51,21 @@ async def _set_cache( user_id: Any, redis_client: Union[redis.Redis, AsyncRedis], cache_key: str, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> List[Any]: - result = await self.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=self.session_size, - next_page=FeedResultNextPage(data={}), - **params, - ) + if ctx is not None and ctx.executor is not None: + result = await ctx.executor.run(self.data, ctx, self.session_size, FeedResultNextPage(data={}), **params) + else: + result = await self.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=self.session_size, + next_page=FeedResultNextPage(data={}), + redis_client=ctx.redis_client if ctx is not None else None, + ctx=ctx, + **params, + ) data = result.data if self.deduplicate: @@ -71,15 +79,21 @@ async def _set_cache_async( user_id: Any, redis_client: AsyncRedis, cache_key: str, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> List[Any]: - result = await self.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=self.session_size, - next_page=FeedResultNextPage(data={}), - **params, - ) + if ctx is not None and ctx.executor is not None: + result = await ctx.executor.run(self.data, ctx, self.session_size, FeedResultNextPage(data={}), **params) + else: + result = await self.data.get_data( + methods_dict=methods_dict, + user_id=user_id, + limit=self.session_size, + next_page=FeedResultNextPage(data={}), + redis_client=ctx.redis_client if ctx is not None else None, + ctx=ctx, + **params, + ) data = result.data if self.deduplicate: @@ -95,6 +109,7 @@ async def _get_cache( limit: int, next_page: FeedResultNextPage, redis_client: Union[redis.Redis, AsyncRedis], + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: if session_cache_key := params.get("custom_view_session_key", None): @@ -107,7 +122,12 @@ async def _get_cache( if not cache_exists or self.merger_id not in next_page.data: logging.info("Cache miss or new session - generating fresh data for %s", cache_key) session_data = await self._set_cache( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + methods_dict=methods_dict, + user_id=user_id, + redis_client=redis_client, + cache_key=cache_key, + ctx=ctx, + **params, ) else: logging.info("Cache exists - attempting read from Redis for %s", cache_key) @@ -117,7 +137,12 @@ async def _get_cache( "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key ) session_data = await self._set_cache( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + methods_dict=methods_dict, + user_id=user_id, + redis_client=redis_client, + cache_key=cache_key, + ctx=ctx, + **params, ) else: logging.info("Successfully read cached data for %s", cache_key) @@ -137,6 +162,7 @@ async def _get_cache_async( limit: int, next_page: FeedResultNextPage, redis_client: AsyncRedis, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: if session_cache_key := params.get("custom_view_session_key", None): @@ -146,7 +172,12 @@ async def _get_cache_async( if not await redis_client.exists(cache_key) or self.merger_id not in next_page.data: session_data = 
await self._set_cache_async( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + methods_dict=methods_dict, + user_id=user_id, + redis_client=redis_client, + cache_key=cache_key, + ctx=ctx, + **params, ) else: cached_data = await redis_client.get(cache_key) @@ -155,7 +186,12 @@ async def _get_cache_async( "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key ) session_data = await self._set_cache_async( - methods_dict=methods_dict, user_id=user_id, redis_client=redis_client, cache_key=cache_key, **params + methods_dict=methods_dict, + user_id=user_id, + redis_client=redis_client, + cache_key=cache_key, + ctx=ctx, + **params, ) else: logging.info("Successfully read cached data for %s", cache_key) @@ -175,31 +211,60 @@ async def get_data( limit: int, next_page: FeedResultNextPage, redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, + ctx: Optional[ExecutionContext] = None, **params: Any, ) -> FeedResult: - if not redis_client: - raise ValueError("Redis client must be provided if using Merger View Session") + if ctx is None: + ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + elif ctx.redis_client is None and redis_client is not None: + ctx.redis_client = redis_client - if isinstance(redis_client, (AsyncRedis, AsyncRedisCluster)): - result = await self._get_cache_async( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) - else: - result = await self._get_cache( - methods_dict=methods_dict, - user_id=user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - **params, - ) + if ctx.executor is None: + from ..execution.executor import Executor + + ctx.executor = Executor() + + return await ctx.executor.run(self, ctx, limit, next_page, **params) + + def build_plan( + self, + *, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + **params: Any, + ) -> CallablePlan: + async def _run(executor: Any) -> FeedResult: + if ctx.redis_client is None: + raise ValueError("Redis client must be provided if using Merger View Session") + + if ctx.executor is None: + ctx.executor = executor + + redis_client = ctx.redis_client + if isinstance(redis_client, (AsyncRedis, AsyncRedisCluster)): + result = await self._get_cache_async( + methods_dict=ctx.methods_dict, + user_id=ctx.user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + ctx=ctx, + **params, + ) + else: + result = await self._get_cache( + methods_dict=ctx.methods_dict, + user_id=ctx.user_id, + limit=limit, + next_page=next_page, + redis_client=redis_client, + ctx=ctx, + **params, + ) - if self.shuffle: - shuffle(result.data) + if self.shuffle: + shuffle(result.data) + return result - return result + return CallablePlan(fn=_run) diff --git a/smartfeed/policies/dedup.py b/smartfeed/policies/dedup.py new file mode 100644 index 0000000..b040c47 --- /dev/null +++ b/smartfeed/policies/dedup.py @@ -0,0 +1,220 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, List, Literal, Optional, Tuple + +from .. 
import jsonlib as json +from .seen_store import SeenStore + +MissingKeyPolicy = Literal["error", "keep", "drop"] + + +def normalize_key(value: Any) -> str: + if isinstance(value, (str, int)): + return str(value) + if isinstance(value, (dict, list)): + return json.dumps(value, sort_keys=True, default=str) + return str(value) + + +def extract_dedup_value(item: Any, dedup_key: Optional[str], missing_key_policy: MissingKeyPolicy) -> Any: + if not dedup_key: + return item + + try: + value = item.get(dedup_key) + except AttributeError: + value = getattr(item, dedup_key, None) + + if value is None and missing_key_policy == "error": + raise AssertionError(f"Deduplication failed: entity {item} has no key or attr {dedup_key}") + return value + + +def entity_key(item: Any, dedup_key: Optional[str], missing_key_policy: MissingKeyPolicy) -> Optional[str]: + raw_value = extract_dedup_value(item, dedup_key, missing_key_policy) + if raw_value is None: + if missing_key_policy == "drop": + return None + if missing_key_policy == "keep": + raw_value = ("__missing__", id(item)) + return normalize_key(raw_value) + + +@dataclass +class DeduplicationPolicy: + """Deduplication policy applied during execution. + + This keeps dedup logic out of merger implementations and plan interpreters. + """ + + dedup_key: Optional[str] + missing_key_policy: MissingKeyPolicy + + # Store backend (cursor or redis) + store: SeenStore + + # Keys encountered/accepted within this request. Prevents duplicates inside one response. + seen_request_set: set[str] + + def key_for(self, item: Any) -> Optional[str]: + return entity_key(item, self.dedup_key, self.missing_key_policy) + + async def prefetch_keys(self, keys: List[str]) -> None: + if not keys: + return + + filtered: List[str] = [] + seen: set[str] = set() + for k in keys: + if k in self.seen_request_set: + continue + if k in seen: + continue + seen.add(k) + filtered.append(k) + + if not filtered: + return + + await self.store.prefetch(filtered) + + def should_accept(self, key: str, priority: int) -> bool: + if key in self.seen_request_set: + return False + + existing_priority = self.store.get(key) + if existing_priority is not None and priority <= existing_priority: + return False + return True + + def record(self, key: str, priority: int) -> None: + self.seen_request_set.add(key) + self.store.set_max(key, priority) + + async def arbitrate_owner_buffers( + self, + *, + owners: List[Any], + owner_buffers: Dict[int, List[Any]], + owner_rank: Dict[int, int], + ) -> Dict[int, List[Any]]: + """Arbitrate winners across multiple owners. 
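+
+        Each owner's buffered candidates compete per dedup key; at most one owner
+        wins a given key, chosen by the tie-break below, and only winning items
+        survive in the returned buffers.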
+ + - Deterministic tie-break: (-priority, owner_rank, item_rank) + - Records winners into the store + - Returns per-owner buffers containing only accepted winners + """ + + keys_to_prefetch: List[str] = [] + keys_seen_local: set[str] = set() + for owner in owners: + owner_id = id(owner) + for item in owner_buffers.get(owner_id, []): + key = self.key_for(item) + if key is None: + continue + if key in keys_seen_local: + continue + keys_seen_local.add(key) + keys_to_prefetch.append(key) + + if keys_to_prefetch: + await self.prefetch_keys(keys_to_prefetch) + + winners: Dict[str, int] = {} + winner_prio: Dict[str, int] = {} + winner_tie: Dict[str, Tuple[int, int, int]] = {} + + for owner in owners: + owner_id = id(owner) + prio = int(getattr(owner, "dedup_priority", 0)) + rank = int(owner_rank.get(owner_id, 0)) + for item_rank, item in enumerate(owner_buffers.get(owner_id, [])): + key = self.key_for(item) + if key is None: + continue + tie = (-prio, rank, item_rank) + existing = winner_tie.get(key) + if existing is None or tie < existing: + winners[key] = owner_id + winner_prio[key] = prio + winner_tie[key] = tie + + for key, _tie in sorted(winner_tie.items(), key=lambda kv: kv[1]): + winner_owner_id = winners.get(key) + if winner_owner_id is None: + continue + prio = int(winner_prio.get(key, 0)) + if not self.should_accept(key, prio): + continue + self.record(key, prio) + + request_set = self.seen_request_set + per_owner_accepted: Dict[int, List[Any]] = {id(o): [] for o in owners} + for owner in owners: + owner_id = id(owner) + accepted: List[Any] = [] + for item in owner_buffers.get(owner_id, []): + key = self.key_for(item) + if key is None: + continue + if winners.get(key) != owner_id: + continue + if key not in request_set: + continue + accepted.append(item) + per_owner_accepted[owner_id] = accepted + + return per_owner_accepted + + async def accept_batch( + self, + *, + items: List[Any], + priority: int, + limit: Optional[int] = None, + ) -> Tuple[List[Any], int]: + """Accept items from a single stream in order. + + Returns accepted items and the number of inspected items. + """ + + if not items: + return [], 0 + + keys_to_prefetch: List[str] = [] + keys_seen_local: set[str] = set() + for item in items: + key = self.key_for(item) + if key is None: + continue + if key in self.seen_request_set: + continue + if key in keys_seen_local: + continue + keys_seen_local.add(key) + keys_to_prefetch.append(key) + + if keys_to_prefetch: + await self.prefetch_keys(keys_to_prefetch) + + accepted: List[Any] = [] + inspected_count = 0 + max_accept = int(limit) if limit is not None else len(items) + + for idx, item in enumerate(items, start=1): + inspected_count = idx + if len(accepted) >= max_accept: + break + key = self.key_for(item) + if key is None: + continue + if not self.should_accept(key, priority): + continue + accepted.append(item) + self.record(key, priority) + if len(accepted) >= max_accept: + break + + return accepted, inspected_count diff --git a/smartfeed/policies/dedup_utils.py b/smartfeed/policies/dedup_utils.py new file mode 100644 index 0000000..e12aed1 --- /dev/null +++ b/smartfeed/policies/dedup_utils.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import asyncio +import base64 +import inspect +import zlib +from typing import Any, Dict, List, Optional, Tuple, Union + +import redis +from redis.asyncio import Redis as AsyncRedis + +from .. 
import jsonlib as json +from ..feed_models import _is_async_redis_client, _redis_call + + +def decode_seen_from_cursor(after: Any) -> Dict[str, int]: + if after is None: + return {} + + if isinstance(after, dict) and "z" in after: + payload = base64.urlsafe_b64decode(str(after["z"]).encode()) + raw = zlib.decompress(payload).decode() + decoded = json.loads(raw) + if isinstance(decoded, dict): + return {str(k): int(v) for k, v in decoded.items()} + if isinstance(decoded, list): + seen_map: Dict[str, int] = {} + for entry_item in decoded: + if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: + seen_map[str(entry_item[0])] = int(entry_item[1]) + else: + seen_map[str(entry_item)] = 0 + return seen_map + return {} + + if isinstance(after, dict) and "seen" in after: + return {str(k): 0 for k in list(after["seen"])} + if isinstance(after, list): + return {str(k): 0 for k in list(after)} + if isinstance(after, dict): + return {str(k): int(v) for k, v in after.items() if k not in {"v", "c", "n"}} + return {} + + +def encode_seen_for_cursor( + seen_updates_in_order: List[Tuple[str, int]], + *, + cursor_compress: bool, + cursor_max_keys: Optional[int], +) -> Any: + if cursor_max_keys is not None: + seen_updates_in_order = seen_updates_in_order[-cursor_max_keys:] + + if not cursor_compress: + return {"v": 2, "seen": [[k, p] for k, p in seen_updates_in_order]} + + raw = json.dumps([[k, p] for k, p in seen_updates_in_order]).encode() + compressed = zlib.compress(raw) + return { + "v": 2, + "c": "zlib+base64", + "n": len(seen_updates_in_order), + "z": base64.urlsafe_b64encode(compressed).decode(), + } + + +async def redis_zmscore( + redis_client: Union[redis.Redis, AsyncRedis], + key: str, + members: List[str], +) -> List[Optional[float]]: + if not members: + return [] + + zmscore_fn = getattr(redis_client, "zmscore", None) + if zmscore_fn is not None: + res = zmscore_fn(key, members) + if inspect.iscoroutine(res): + res = await res + return [None if v is None else float(v) for v in list(res)] + + if not _is_async_redis_client(redis_client): + + def _sync_pipeline_execute() -> Any: + pipe = redis_client.pipeline() + for m in members: + pipe.zscore(key, m) + return pipe.execute() + + res = await asyncio.to_thread(_sync_pipeline_execute) + return [None if v is None else float(v) for v in list(res)] + + pipe = redis_client.pipeline() + for m in members: + pipe.zscore(key, m) + res = pipe.execute() + if inspect.iscoroutine(res): + res = await res + return [None if v is None else float(v) for v in list(res)] + + +async def redis_zadd_and_expire( + redis_client: Union[redis.Redis, AsyncRedis], + key: str, + member_scores: Dict[str, int], + *, + ttl_seconds: int, +) -> None: + if not member_scores: + return + await _redis_call(redis_client, "zadd", key, mapping={m: float(s) for m, s in member_scores.items()}) + await _redis_call(redis_client, "expire", key, ttl_seconds) diff --git a/smartfeed/policies/seen_store.py b/smartfeed/policies/seen_store.py new file mode 100644 index 0000000..a867346 --- /dev/null +++ b/smartfeed/policies/seen_store.py @@ -0,0 +1,158 @@ +from __future__ import annotations + +import inspect +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Protocol, Tuple, Union, cast + +import redis +from redis.asyncio import Redis as AsyncRedis + +from ..feed_models import _redis_call +from .dedup_utils import decode_seen_from_cursor, encode_seen_for_cursor, redis_zadd_and_expire, redis_zmscore + + +class SeenStore(Protocol): + async def prefetch(self, 
keys: List[str]) -> None: + raise NotImplementedError + + def get(self, key: str) -> Optional[int]: + raise NotImplementedError + + def set_max(self, key: str, priority: int) -> None: + raise NotImplementedError + + async def reset(self) -> None: + raise NotImplementedError + + async def commit(self) -> Any: + raise NotImplementedError + + +@dataclass +class CursorSeenStore: + """Seen-store that persists in the cursor (`after`).""" + + cursor_compress: bool + cursor_max_keys: Optional[int] + + seen_priority_map: Dict[str, int] + seen_updates_in_order: List[Tuple[str, int]] + + @classmethod + def from_after( + cls, + after: Any, + *, + cursor_compress: bool, + cursor_max_keys: Optional[int], + ) -> "CursorSeenStore": + seen_priority_map = decode_seen_from_cursor(after) + return cls( + cursor_compress=cursor_compress, + cursor_max_keys=cursor_max_keys, + seen_priority_map=seen_priority_map, + seen_updates_in_order=[], + ) + + async def prefetch(self, keys: List[str]) -> None: + return None + + def get(self, key: str) -> Optional[int]: + return self.seen_priority_map.get(key) + + def set_max(self, key: str, priority: int) -> None: + existing = self.seen_priority_map.get(key) + if existing is not None and priority <= existing: + return + self.seen_priority_map[key] = priority + self.seen_updates_in_order.append((key, priority)) + + async def reset(self) -> None: + self.seen_priority_map.clear() + self.seen_updates_in_order.clear() + + async def commit(self) -> Any: + return encode_seen_for_cursor( + self.seen_updates_in_order, + cursor_compress=self.cursor_compress, + cursor_max_keys=self.cursor_max_keys, + ) + + +@dataclass +class RedisSeenStore: + """Seen-store backed by a Redis zset (member=key, score=priority).""" + + redis_client: Union[redis.Redis, AsyncRedis] + redis_key: str + ttl_seconds: int + + redis_seen_cache: Dict[str, Optional[int]] + redis_new_scores: Dict[str, int] + + @classmethod + def create( + cls, + *, + redis_client: Union[redis.Redis, AsyncRedis], + redis_key: str, + ttl_seconds: int, + ) -> "RedisSeenStore": + return cls( + redis_client=redis_client, + redis_key=redis_key, + ttl_seconds=ttl_seconds, + redis_seen_cache={}, + redis_new_scores={}, + ) + + async def prefetch(self, keys: List[str]) -> None: + if not keys: + return + + unique: List[str] = [] + seen: set[str] = set() + for k in keys: + if k in self.redis_seen_cache: + continue + if k in seen: + continue + seen.add(k) + unique.append(k) + + if not unique: + return + + scores_result = redis_zmscore(self.redis_client, self.redis_key, unique) + if inspect.iscoroutine(scores_result): + scores = await cast(Any, scores_result) + else: + scores = scores_result + + for k, s in zip(unique, scores): + self.redis_seen_cache[k] = None if s is None else int(s) + + def get(self, key: str) -> Optional[int]: + return self.redis_seen_cache.get(key) + + def set_max(self, key: str, priority: int) -> None: + existing = self.redis_seen_cache.get(key) + if existing is not None and priority <= existing: + return + self.redis_seen_cache[key] = priority + self.redis_new_scores[key] = max(self.redis_new_scores.get(key, 0), priority) + + async def reset(self) -> None: + await _redis_call(self.redis_client, "delete", self.redis_key) + self.redis_seen_cache.clear() + self.redis_new_scores.clear() + + async def commit(self) -> Any: + await redis_zadd_and_expire( + self.redis_client, + self.redis_key, + self.redis_new_scores, + ttl_seconds=self.ttl_seconds, + ) + self.redis_new_scores.clear() + return None diff --git 
a/tests/fixtures/redis.py b/tests/fixtures/redis.py index 2c07678..5c9af72 100644 --- a/tests/fixtures/redis.py +++ b/tests/fixtures/redis.py @@ -15,7 +15,7 @@ async def redis_client(request): client = AsyncRedis(host="localhost", port=6379) try: await client.ping() - except Exception: # pragma: no cover + except Exception: pytest.skip("Redis is not available on localhost:6379") yield client await client.aclose() @@ -24,7 +24,7 @@ async def redis_client(request): client = redis.Redis(host="localhost", port=6379, db=0) try: client.ping() - except Exception: # pragma: no cover + except Exception: pytest.skip("Redis is not available on localhost:6379") yield client client.close() diff --git a/tests/test_manager_params.py b/tests/test_manager_params.py new file mode 100644 index 0000000..a62d160 --- /dev/null +++ b/tests/test_manager_params.py @@ -0,0 +1,40 @@ +import pytest + +from smartfeed.manager import FeedManager +from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside + + +async def meta_method( + user_id: str, + limit: int, + next_page: FeedResultNextPageInside, + meta: dict, +) -> FeedResultClient: + assert meta["tag"] == "alpha" + take = int(meta.get("take", limit)) + data = [f"{user_id}:{meta['tag']}"] * min(limit, take) + next_page.after = None + next_page.page += 1 + return FeedResultClient(data=data, next_page=next_page, has_next_page=False) + + +@pytest.mark.asyncio +async def test_manager_passes_params_to_subfeed() -> None: + config = { + "version": "1", + "feed": { + "subfeed_id": "sf_meta", + "type": "subfeed", + "method_name": "meta_method", + }, + } + + manager = FeedManager(config=config, methods_dict={"meta_method": meta_method}) + result = await manager.get_data( + user_id="u1", + limit=5, + next_page=FeedResultNextPage(data={}), + meta={"tag": "alpha", "take": 2}, + ) + + assert result.data == ["u1:alpha", "u1:alpha"] diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 9bda09f..2ef210a 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -1,3 +1,4 @@ +import asyncio import inspect import pytest @@ -789,11 +790,12 @@ async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf ) assert _ids(res_1.data) == [1, 2, 3, 4, 5] + assert res_1.next_page.data["dedup_append_pages"].page == 2 - # In dedup-active append mode, each child is requested with the full page limit (5). - # B must therefore collect 5 unique items while skipping 2 duplicates -> scan ids 1..7. - assert res_1.next_page.data["sf_b"].after == 7 + # In default arbitrate mode, B only needs to scan far enough to fill the remaining + # portion of the page after arbitration (here: 3 items: ids 3..5). 
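+    # (A contributes ids 1..2; B skips those two duplicates and accepts ids 3..5,
+    # so its offset cursor ends exactly at 5 rather than 7.)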
+ assert res_1.next_page.data["sf_b"].after == 5 res_2 = await merger.get_data( methods_dict=methods_dict, @@ -807,6 +809,72 @@ async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf _assert_pages_no_overlap(res_1, res_2) +@pytest.mark.asyncio +async def test_dedup_arbitrate_mode_runs_parallel_prefetch_and_arbitrates_winners() -> None: + started_a = asyncio.Event() + started_b = asyncio.Event() + release = asyncio.Event() + + items_a = [{"id": i, "src": "A"} for i in range(1, 200)] + items_b = [{"id": i, "src": "B"} for i in range(1, 200)] + + async def method_a(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + started_a.set() + await release.wait() + offset = int(next_page.after or 0) + data = items_a[offset : offset + limit] + next_page.after = offset + len(data) + next_page.page += 1 + return FeedResultClient(data=data, next_page=next_page, has_next_page=True) + + async def method_b(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + started_b.set() + await release.wait() + offset = int(next_page.after or 0) + data = items_b[offset : offset + limit] + next_page.after = offset + len(data) + next_page.page += 1 + return FeedResultClient(data=data, next_page=next_page, has_next_page=True) + + methods_dict = {"a": method_a, "b": method_b} + + config = { + "merger_id": "dedup_arbitrate", + "type": "merger_deduplication", + "dedup_key": "id", + "state_backend": "cursor", + "cursor_compress": True, + "data": { + "merger_id": "pct", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, + {"percentage": 50, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, + ], + }, + } + + merger = parse_model(MergerDeduplication, config) + + task = asyncio.create_task( + merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={})) + ) + + # If execution is sequential, one of these would time out. + await asyncio.wait_for(started_a.wait(), timeout=1) + await asyncio.wait_for(started_b.wait(), timeout=1) + release.set() + + res = await asyncio.wait_for(task, timeout=2) + + assert len(res.data) == 10 + _assert_no_dupes_in_page(res.data) + # With equal priorities, stable tie-breaker should pick A (first branch) for overlapping keys. + winning = {item["id"]: item["src"] for item in res.data} + assert all(winning[i] == "A" for i in range(1, 6)) + + @pytest.mark.asyncio async def test_dedup_refill_loops_advance_dict_after_cursor_not_just_page() -> None: """Dedup refill loops must correctly advance dict-shaped `after` cursors.""" From 076c1017f584500592d69bf6518fc98ff99c0f3f Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sat, 7 Feb 2026 14:40:54 +0000 Subject: [PATCH 16/33] Test cleanup. 
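
Extract the shared deduplication test scaffolding (paged-method factories,
config builders, and page/no-duplicate assertions) into
tests/fixtures/dedup_helpers.py and rewrite tests/test_merger_deduplication.py
on top of it. Replace the parse_obj fallbacks in example_client.py and
manager.py with the shared parse_model helper.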
--- smartfeed/examples/example_client.py | 4 +- smartfeed/manager.py | 3 +- tests/fixtures/dedup_helpers.py | 441 ++++++++ tests/test_merger_deduplication.py | 1396 +++++++------------------- 4 files changed, 782 insertions(+), 1062 deletions(-) create mode 100644 tests/fixtures/dedup_helpers.py diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index 9b48842..d11e00a 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -5,6 +5,7 @@ from smartfeed import jsonlib as json from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside +from tests.utils import parse_model class TestClientRequest(BaseModel): @@ -28,7 +29,8 @@ def validate_next_page(cls, value: Union[str, FeedResultNextPage]) -> Union[str, validate = getattr(FeedResultNextPage, "model_validate", None) if validate is not None: return validate(payload) - return FeedResultNextPage.parse_obj(payload) + return parse_model(FeedResultNextPage, payload) # type: ignore + return value diff --git a/smartfeed/manager.py b/smartfeed/manager.py index f42e95f..3c0ea05 100644 --- a/smartfeed/manager.py +++ b/smartfeed/manager.py @@ -6,6 +6,7 @@ from .execution.context import ExecutionContext from .execution.executor import Executor from .schemas import FeedConfig, FeedResult, FeedResultNextPage +from tests.utils import parse_model class FeedManager: @@ -26,7 +27,7 @@ def __init__(self, config: Dict, methods_dict: Dict, redis_client: Optional[Unio if validate is not None: self.feed_config = validate(config) else: - self.feed_config = FeedConfig.parse_obj(config) + self.feed_config = parse_model(FeedConfig, config) # type: ignore self.methods_dict = methods_dict self.redis_client = redis_client diff --git a/tests/fixtures/dedup_helpers.py b/tests/fixtures/dedup_helpers.py new file mode 100644 index 0000000..eb3fbdc --- /dev/null +++ b/tests/fixtures/dedup_helpers.py @@ -0,0 +1,441 @@ +from smartfeed.schemas import FeedResultClient, FeedResultNextPage + + +def _effective_limit(limit, max_per_call): + effective_limit = limit + if isinstance(max_per_call, int) and max_per_call > 0: + effective_limit = min(effective_limit, max_per_call) + return effective_limit + + +def make_offset_paged_method(items, *, max_per_call=None): + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + offset = int(next_page.after or 0) + effective_limit = _effective_limit(limit, max_per_call) + result_data = items[offset : offset + effective_limit] + next_page.after = offset + len(result_data) + next_page.page += 1 + has_next_page = (offset + len(result_data)) < len(items) + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) + + return _method + + +def make_string_after_paged_method(items, *, max_per_call=None, after_field="created_at"): + """A subfeed method whose cursor is a string (e.g. timestamp). + + Cursor semantics: `after` is the last returned `created_at` value (monotonic). 
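+
+    Minimal usage sketch (items are assumed to be pre-sorted by `after_field`):
+
+        method = make_string_after_paged_method(
+            [{"id": 1, "created_at": "t1"}, {"id": 2, "created_at": "t2"}]
+        )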
+ """ + + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + effective_limit = _effective_limit(limit, max_per_call) + + after = next_page.after + start_idx = 0 + if isinstance(after, str) and after: + # Find first item with created_at > after + for i, item in enumerate(items): + if str(item[after_field]) > after: + start_idx = i + break + else: + start_idx = len(items) + + result_data = items[start_idx : start_idx + effective_limit] + has_next_page = (start_idx + len(result_data)) < len(items) + + if result_data: + next_page.after = str(result_data[-1][after_field]) + next_page.page += 1 + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) + + return _method + + +def make_profile_dict_after_method( + profiles_to_items, + *, + max_per_call=None, + after_key="after", +): + """A subfeed method whose cursor is a dict of per-profile offsets. + + Example shape: after = {"p1": 0, "p2": 0} + Cursor semantics: each profile offset increments as items are *read*. + """ + + profile_ids = list(profiles_to_items.keys()) + + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + effective_limit = _effective_limit(limit, max_per_call) + + after = next_page.after + if not isinstance(after, dict): + after = {pid: 0 for pid in profile_ids} + else: + after = dict(after) + for pid in profile_ids: + after.setdefault(pid, 0) + + result = [] + has_next_page = False + + # Build a cyclic iteration over profiles. + active_profiles = [pid for pid in profile_ids] + + i = 0 + while active_profiles and len(result) < effective_limit: + pid = active_profiles[i % len(active_profiles)] + idx = after.get(pid, 0) + items = profiles_to_items.get(pid, []) + + if idx >= len(items): + # This profile is exhausted. + active_profiles.remove(pid) + continue + + result.append(items[idx]) + after[pid] = idx + 1 + i += 1 + + # Determine if any profile still has unread items. 
+ for pid in profile_ids: + if after.get(pid, 0) < len(profiles_to_items.get(pid, [])): + has_next_page = True + break + + next_page.after = after + next_page.page += 1 + return FeedResultClient(data=result, next_page=next_page, has_next_page=has_next_page) + + return _method + + +def _assert_cursor_monotonic_if_present(res_1, res_2, keys): + for key in keys: + if key not in res_1.next_page.data: + continue + + assert key in res_2.next_page.data + + after_1 = res_1.next_page.data[key].after + after_2 = res_2.next_page.data[key].after + + if after_1 is None or after_2 is None: + continue + + if isinstance(after_1, int) and isinstance(after_2, int): + assert after_2 >= after_1 + continue + + if isinstance(after_1, dict) and isinstance(after_2, dict): + continue + + try: + assert after_2 >= after_1 + except TypeError: + pass + + +def _sources(data): + return [x.get("src") for x in data] + + +def _ids(data): + return [x.get("id") for x in data] + + +def _assert_no_dupes_in_page(data): + ids = _ids(data) + assert len(ids) == len(set(ids)) + + +def _assert_pages_no_overlap(res_1, res_2): + assert not (set(_ids(res_1.data)) & set(_ids(res_2.data))) + + +def _assert_two_pages_no_dupes(res_1, res_2): + _assert_no_dupes_in_page(res_1.data) + _assert_no_dupes_in_page(res_2.data) + _assert_pages_no_overlap(res_1, res_2) + + +def _assert_sources_at_positions(data, positions, expected_src): + sources = _sources(data) + for pos in positions: + assert sources[pos - 1] == expected_src + + +def make_items(src, start, end, *, user_id_mod=None, id_offset=0, extra=None): + items = [] + for i in range(start, end): + item_id = id_offset + i + item = {"id": item_id, "src": src} + if user_id_mod is not None: + item["user_id"] = f"u{item_id % user_id_mod}" + if extra: + item.update(extra) + items.append(item) + return items + + +def _subfeed(subfeed_id, method_name, *, dedup_priority=None): + data = {"subfeed_id": subfeed_id, "type": "subfeed", "method_name": method_name} + if dedup_priority is not None: + data["dedup_priority"] = dedup_priority + return data + + +def _dedup_config(merger_id, data, *, dedup_key="id", state_backend="cursor", cursor_compress=True, **kwargs): + config = { + "merger_id": merger_id, + "type": "merger_deduplication", + "dedup_key": dedup_key, + "state_backend": state_backend, + "cursor_compress": cursor_compress, + "data": data, + } + config.update(kwargs) + return config + + +def _percentage_config(merger_id, items, *, shuffle=False): + return {"merger_id": merger_id, "type": "merger_percentage", "shuffle": shuffle, "items": items} + + +def _append_config(merger_id, items, *, shuffle=False): + return {"merger_id": merger_id, "type": "merger_append", "shuffle": shuffle, "items": items} + + +def _distribute_config(merger_id, items, *, distribution_key="user_id"): + return { + "merger_id": merger_id, + "type": "merger_distribute", + "distribution_key": distribution_key, + "items": items, + } + + +def _positional_config(merger_id, *, positions, positional, default): + return { + "merger_id": merger_id, + "type": "merger_positional", + "positions": positions, + "positional": positional, + "default": default, + } + + +def _gradient_config( + merger_id, + *, + item_from, + item_to, + step, + size_to_step, + shuffle=False, +): + return { + "merger_id": merger_id, + "type": "merger_percentage_gradient", + "item_from": item_from, + "item_to": item_to, + "step": step, + "size_to_step": size_to_step, + "shuffle": shuffle, + } + + +async def _run_two_pages(merger, methods_dict, limit, *, 
next_page=None, **kwargs): + if next_page is None: + next_page = FeedResultNextPage(data={}) + res_1 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=limit, next_page=next_page, **kwargs) + res_2 = await merger.get_data( + methods_dict=methods_dict, user_id="u", limit=limit, next_page=res_1.next_page, **kwargs + ) + return res_1, res_2 + + +def _percentage_items(first, second, *, first_pct=50, second_pct=50): + return [ + {"percentage": first_pct, "data": first}, + {"percentage": second_pct, "data": second}, + ] + + +def _two_subfeed_spec(*, name="a", subfeed_id="sf_a", max_per_call=None, dedup_priority=None): + return { + "name": name, + "subfeed_id": subfeed_id, + "max_per_call": max_per_call, + "dedup_priority": dedup_priority, + } + + +def _build_two_subfeed_methods(items_a, items_b, *, spec_a=None, spec_b=None): + if spec_a is None: + spec_a = _two_subfeed_spec() + if spec_b is None: + spec_b = _two_subfeed_spec(name="b", subfeed_id="sf_b") + + methods_dict = { + spec_a["name"]: make_offset_paged_method(items_a, max_per_call=spec_a["max_per_call"]), + spec_b["name"]: make_offset_paged_method(items_b, max_per_call=spec_b["max_per_call"]), + } + subfeed_a = _subfeed(spec_a["subfeed_id"], spec_a["name"], dedup_priority=spec_a["dedup_priority"]) + subfeed_b = _subfeed(spec_b["subfeed_id"], spec_b["name"], dedup_priority=spec_b["dedup_priority"]) + return methods_dict, subfeed_a, subfeed_b + + +def _build_two_subfeed_dedup_merger( + *, + items_a, + items_b, + child_builder, + merger_id, + spec_a=None, + spec_b=None, + dedup_kwargs=None, +): + methods_dict, subfeed_a, subfeed_b = _build_two_subfeed_methods( + items_a, + items_b, + spec_a=spec_a, + spec_b=spec_b, + ) + config = _dedup_config(merger_id, child_builder(subfeed_a, subfeed_b), **(dedup_kwargs or {})) + return config, methods_dict, subfeed_a, subfeed_b + + +def _build_deep_positional_pct_dedup_merger( + *, + items_p, + items_d1, + items_d2, + dedup_merger_id, + pos_merger_id, + pct_merger_id, + positions, + overfetch_factor=None, + max_refill_loops=None, +): + methods_dict = { + "p": make_offset_paged_method(items_p), + "d1": make_offset_paged_method(items_d1), + "d2": make_offset_paged_method(items_d2), + } + + dedup_kwargs = {} + if overfetch_factor is not None: + dedup_kwargs["overfetch_factor"] = overfetch_factor + if max_refill_loops is not None: + dedup_kwargs["max_refill_loops"] = max_refill_loops + + config = _dedup_config( + dedup_merger_id, + _positional_config( + pos_merger_id, + positions=positions, + positional=_subfeed("sf_p", "p"), + default=_percentage_config( + pct_merger_id, + items=_percentage_items(_subfeed("sf_d1", "d1"), _subfeed("sf_d2", "d2")), + ), + ), + **dedup_kwargs, + ) + return config, methods_dict + + +def _inner_append_config(*, merger_id: str, subfeed_id: str, method_name: str, dedup_priority: int): + return { + "merger_id": merger_id, + "type": "merger_append", + # Important: dedup deletion priority must be visible at this node so parent mergers + # can fetch higher-priority subtrees first when a dedup wrapper is active. + "dedup_priority": dedup_priority, + "shuffle": False, + "items": [ + { + "subfeed_id": subfeed_id, + "type": "subfeed", + "method_name": method_name, + "dedup_priority": dedup_priority, + } + ], + } + + +def _build_deep_priority_tree_for_merger_type(*, merger_type: str): + """Return a deep tree config where low/high leaves overlap by id. + + The inner leaves are wrapped into an append merger to ensure a "deep" tree even + when the outer merger is flat. 
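+
+    Supported merger_type values: merger_append, merger_distribute, merger_percentage,
+    merger_percentage_gradient and merger_positional; any other value raises AssertionError.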
+ """ + + low = _inner_append_config(merger_id="inner_low", subfeed_id="sf_low", method_name="low", dedup_priority=0) + high = _inner_append_config(merger_id="inner_high", subfeed_id="sf_high", method_name="high", dedup_priority=100) + + if merger_type == "merger_append": + return { + "merger_id": "outer_append", + "type": "merger_append", + "shuffle": False, + # Put low first intentionally; priority must still make high win for overlapping ids. + "items": [low, high], + } + + if merger_type == "merger_distribute": + return { + "merger_id": "outer_dist", + "type": "merger_distribute", + "distribution_key": "user_id", + # Put low first intentionally. + "items": [low, high], + } + + if merger_type == "merger_percentage": + return { + "merger_id": "outer_pct", + "type": "merger_percentage", + "shuffle": False, + "items": [ + {"percentage": 50, "data": low}, + {"percentage": 50, "data": high}, + ], + } + + if merger_type == "merger_percentage_gradient": + return { + "merger_id": "outer_grad", + "type": "merger_percentage_gradient", + "item_from": {"percentage": 60, "data": low}, + "item_to": {"percentage": 40, "data": high}, + "step": 20, + "size_to_step": 5, + "shuffle": False, + } + + if merger_type == "merger_positional": + # High priority on positional branch so it must win duplicates. + high_pos = _inner_append_config( + merger_id="inner_pos_high", + subfeed_id="sf_high", + method_name="high", + dedup_priority=100, + ) + low_def = _inner_append_config( + merger_id="inner_def_low", + subfeed_id="sf_low", + method_name="low", + dedup_priority=0, + ) + return { + "merger_id": "outer_pos", + "type": "merger_positional", + "positions": [1, 3, 5, 7, 9, 11], + "positional": high_pos, + "default": low_def, + } + + raise AssertionError(f"Unknown merger_type: {merger_type}") diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 2ef210a..ca83885 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -4,254 +4,24 @@ import pytest from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside, MergerDeduplication +from tests.fixtures import dedup_helpers as dh from tests.fixtures.redis import redis_client # noqa: F401 from tests.utils import parse_model -def make_offset_paged_method(items, *, max_per_call=None): - async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument - offset = int(next_page.after or 0) - effective_limit = limit - if isinstance(max_per_call, int) and max_per_call > 0: - effective_limit = min(effective_limit, max_per_call) - result_data = items[offset : offset + effective_limit] - next_page.after = offset + len(result_data) - next_page.page += 1 - has_next_page = (offset + len(result_data)) < len(items) - return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) +PROFILES_B_1_TO_8 = { + "p0": [{"id": 1, "src": "B"}, {"id": 3, "src": "B"}, {"id": 5, "src": "B"}, {"id": 7, "src": "B"}], + "p1": [{"id": 2, "src": "B"}, {"id": 4, "src": "B"}, {"id": 6, "src": "B"}, {"id": 8, "src": "B"}], +} - return _method +def _assert_winning_src_for_ids(data, ids, expected_src: str) -> None: + winning = {item["id"]: item["src"] for item in data} + assert all(winning[i] == expected_src for i in ids if i in winning) -def make_string_after_paged_method(items, *, max_per_call=None, after_field="created_at"): - """A subfeed method whose cursor is a string (e.g. timestamp). 
- Cursor semantics: `after` is the last returned `created_at` value (monotonic). - """ - - async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument - effective_limit = limit - if isinstance(max_per_call, int) and max_per_call > 0: - effective_limit = min(effective_limit, max_per_call) - - after = next_page.after - start_idx = 0 - if isinstance(after, str) and after: - # Find first item with created_at > after - for i, item in enumerate(items): - if str(item[after_field]) > after: - start_idx = i - break - else: - start_idx = len(items) - - result_data = items[start_idx : start_idx + effective_limit] - has_next_page = (start_idx + len(result_data)) < len(items) - - if result_data: - next_page.after = str(result_data[-1][after_field]) - next_page.page += 1 - return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) - - return _method - - -def make_profile_dict_after_method( - profiles_to_items, - *, - max_per_call=None, - after_key="after", -): - """A subfeed method whose cursor is a dict of per-profile offsets. - - Example shape: after = {"p1": 0, "p2": 0} - Cursor semantics: each profile offset increments as items are *read*. - """ - - profile_ids = list(profiles_to_items.keys()) - - async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument - effective_limit = limit - if isinstance(max_per_call, int) and max_per_call > 0: - effective_limit = min(effective_limit, max_per_call) - - after = next_page.after - if not isinstance(after, dict): - after = {pid: 0 for pid in profile_ids} - else: - after = dict(after) - for pid in profile_ids: - after.setdefault(pid, 0) - - result = [] - has_next_page = False - - # Build a cyclic iteration over profiles. - active_profiles = [pid for pid in profile_ids] - - i = 0 - while active_profiles and len(result) < effective_limit: - pid = active_profiles[i % len(active_profiles)] - idx = after.get(pid, 0) - items = profiles_to_items.get(pid, []) - - if idx >= len(items): - # This profile is exhausted. - active_profiles.remove(pid) - continue - - result.append(items[idx]) - after[pid] = idx + 1 - i += 1 - - # Determine if any profile still has unread items. 
- for pid in profile_ids: - if after.get(pid, 0) < len(profiles_to_items.get(pid, [])): - has_next_page = True - break - - next_page.after = after - next_page.page += 1 - return FeedResultClient(data=result, next_page=next_page, has_next_page=has_next_page) - - return _method - - -def _assert_cursor_monotonic_if_present(res_1, res_2, keys): - for key in keys: - if key not in res_1.next_page.data: - continue - - assert key in res_2.next_page.data - - after_1 = res_1.next_page.data[key].after - after_2 = res_2.next_page.data[key].after - - if after_1 is None or after_2 is None: - continue - - if isinstance(after_1, int) and isinstance(after_2, int): - assert after_2 >= after_1 - continue - - if isinstance(after_1, dict) and isinstance(after_2, dict): - continue - - try: - assert after_2 >= after_1 - except TypeError: - pass - - -def _sources(data): - return [x.get("src") for x in data] - - -def _ids(data): - return [x.get("id") for x in data] - - -def _assert_no_dupes_in_page(data): - ids = _ids(data) - assert len(ids) == len(set(ids)) - - -def _assert_pages_no_overlap(res_1, res_2): - assert not (set(_ids(res_1.data)) & set(_ids(res_2.data))) - - -def _inner_append_config(*, merger_id: str, subfeed_id: str, method_name: str, dedup_priority: int): - return { - "merger_id": merger_id, - "type": "merger_append", - # Important: dedup deletion priority must be visible at this node so parent mergers - # can fetch higher-priority subtrees first when a dedup wrapper is active. - "dedup_priority": dedup_priority, - "shuffle": False, - "items": [ - { - "subfeed_id": subfeed_id, - "type": "subfeed", - "method_name": method_name, - "dedup_priority": dedup_priority, - } - ], - } - - -def _build_deep_priority_tree_for_merger_type(*, merger_type: str): - """Return a deep tree config where low/high leaves overlap by id. - - The inner leaves are wrapped into an append merger to ensure a "deep" tree even - when the outer merger is flat. - """ - - low = _inner_append_config(merger_id="inner_low", subfeed_id="sf_low", method_name="low", dedup_priority=0) - high = _inner_append_config(merger_id="inner_high", subfeed_id="sf_high", method_name="high", dedup_priority=100) - - if merger_type == "merger_append": - return { - "merger_id": "outer_append", - "type": "merger_append", - "shuffle": False, - # Put low first intentionally; priority must still make high win for overlapping ids. - "items": [low, high], - } - - if merger_type == "merger_distribute": - return { - "merger_id": "outer_dist", - "type": "merger_distribute", - "distribution_key": "user_id", - # Put low first intentionally. - "items": [low, high], - } - - if merger_type == "merger_percentage": - return { - "merger_id": "outer_pct", - "type": "merger_percentage", - "shuffle": False, - "items": [ - {"percentage": 50, "data": low}, - {"percentage": 50, "data": high}, - ], - } - - if merger_type == "merger_percentage_gradient": - return { - "merger_id": "outer_grad", - "type": "merger_percentage_gradient", - "item_from": {"percentage": 60, "data": low}, - "item_to": {"percentage": 40, "data": high}, - "step": 20, - "size_to_step": 5, - "shuffle": False, - } - - if merger_type == "merger_positional": - # High priority on positional branch so it must win duplicates. 
- high_pos = _inner_append_config( - merger_id="inner_pos_high", - subfeed_id="sf_high", - method_name="high", - dedup_priority=100, - ) - low_def = _inner_append_config( - merger_id="inner_def_low", - subfeed_id="sf_low", - method_name="low", - dedup_priority=0, - ) - return { - "merger_id": "outer_pos", - "type": "merger_positional", - "positions": [1, 3, 5, 7, 9, 11], - "positional": high_pos, - "default": low_def, - } - - raise AssertionError(f"Unknown merger_type: {merger_type}") +def _make_items_by_ids(src: str, ids, *, user_id_mod: int): + return [{"id": i, "user_id": f"u{i % user_id_mod}", "src": src} for i in ids] @pytest.mark.asyncio @@ -263,68 +33,45 @@ async def test_dedup_positional_slot_ownership_cursor_backend() -> None: """ # Default branch has early ids 1..3, which will be seen first. - default_items = [{"id": i, "src": "default"} for i in range(1, 300)] + default_items = dh.make_items("default", 1, 300) # Positional branch starts with duplicates 1..3; it must skip them and fetch 4.. instead. - positional_items = [{"id": i, "src": "pos"} for i in range(1, 300)] + positional_items = dh.make_items("pos", 1, 300) methods_dict = { - "default": make_offset_paged_method(default_items), - "pos": make_offset_paged_method(positional_items), + "default": dh.make_offset_paged_method(default_items), + "pos": dh.make_offset_paged_method(positional_items), } - config = { - "merger_id": "dedup_wrapper", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "max_refill_loops": 20, - "data": { - "merger_id": "positional_mix", - "type": "merger_positional", + config = dh._dedup_config( + "dedup_wrapper", + dh._positional_config( + "positional_mix", # Ensure positional inserts exist on both pages for limit=6: # page1 uses (1,3,5), page2 uses (7,9,11) which map to the same in-page slots. - "positions": [1, 3, 5, 7, 9, 11], - "positional": {"subfeed_id": "sf_pos", "type": "subfeed", "method_name": "pos"}, - "default": {"subfeed_id": "sf_default", "type": "subfeed", "method_name": "default"}, - }, - } + positions=[1, 3, 5, 7, 9, 11], + positional=dh._subfeed("sf_pos", "pos"), + default=dh._subfeed("sf_default", "default"), + ), + max_refill_loops=20, + ) merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=6, - next_page=FeedResultNextPage(data={}), - ) + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 6) assert len(res_1.data) == 6 - _assert_no_dupes_in_page(res_1.data) + dh._assert_no_dupes_in_page(res_1.data) # Slot ownership: configured positions [1,3,5] are the positional branch. - assert _sources(res_1.data)[0] == "pos" - assert _sources(res_1.data)[2] == "pos" - assert _sources(res_1.data)[4] == "pos" + dh._assert_sources_at_positions(res_1.data, [1, 3, 5], "pos") # Next page: still no overlap across pages, and positional slots remain owned. 
- res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=6, - next_page=res_1.next_page, - ) - assert len(res_2.data) == 6 - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) + dh._assert_sources_at_positions(res_2.data, [1, 3, 5], "pos") - assert _sources(res_2.data)[0] == "pos" - assert _sources(res_2.data)[2] == "pos" - assert _sources(res_2.data)[4] == "pos" - - _assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_pos", "sf_default", "positional_mix", "dedup_wrapper"]) + dh._assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_pos", "sf_default", "positional_mix", "dedup_wrapper"]) @pytest.mark.asyncio @@ -332,69 +79,37 @@ async def test_dedup_percentage_slot_ownership_cursor_backend() -> None: """Percentage mixing order must be preserved even with duplicates across sources.""" # A is called first by the percentage merger; its ids will be seen before B. - a_items = [{"id": i, "src": "A"} for i in range(1, 300)] + a_items = dh.make_items("A", 1, 300) # B starts with duplicates 1..3; it must skip them and fetch unique tail items. # Same IDs as A to force cross-source duplicates. - b_items = [{"id": i, "src": "B"} for i in range(1, 300)] - - methods_dict = { - "a": make_offset_paged_method(a_items), - "b": make_offset_paged_method(b_items), - } - - config = { - "merger_id": "dedup_wrapper_pct", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { - "merger_id": "pct_mix", - "type": "merger_percentage", - "shuffle": False, - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, - ], - }, - } - + b_items = dh.make_items("B", 1, 300) + + config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( + items_a=a_items, + items_b=b_items, + merger_id="dedup_wrapper_pct", + child_builder=lambda sf_a, sf_b: dh._percentage_config( + "pct_mix", + items=dh._percentage_items(sf_a, sf_b), + ), + ) merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=FeedResultNextPage(data={}), - ) + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 10) assert len(res_1.data) == 10 - _assert_no_dupes_in_page(res_1.data) + dh._assert_no_dupes_in_page(res_1.data) # Slot ownership: percentage merge alternates when list sizes are equal. 
- sources_1 = _sources(res_1.data) - assert sources_1[0] == "A" - assert sources_1[1] == "B" - assert sources_1[2] == "A" - assert sources_1[3] == "B" - - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=res_1.next_page, - ) + assert dh._sources(res_1.data)[:4] == ["A", "B", "A", "B"] assert len(res_2.data) == 10 - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) - sources_2 = _sources(res_2.data) - assert sources_2[0] == "A" - assert sources_2[1] == "B" + assert dh._sources(res_2.data)[:2] == ["A", "B"] - _assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_a", "sf_b", "pct_mix", "dedup_wrapper_pct"]) + dh._assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_a", "sf_b", "pct_mix", "dedup_wrapper_pct"]) @pytest.mark.asyncio @@ -402,70 +117,35 @@ async def test_dedup_deep_tree_cursor_backend() -> None: """Dedup must work through deep merger trees (wrapping leaf methods).""" # Leaf sources: intentionally overlapping ids across different leaves. - p_items = [{"id": i, "src": "P"} for i in range(1, 30)] - d1_items = [{"id": i, "src": "D1"} for i in range(1, 30)] # overlaps P - d2_items = [{"id": 100 + i, "src": "D2"} for i in range(1, 30)] - - methods_dict = { - "p": make_offset_paged_method(p_items), - "d1": make_offset_paged_method(d1_items), - "d2": make_offset_paged_method(d2_items), - } - - # Deep tree: Dedup -> Positional(default=Percentage(D1,D2), positional=SubFeed(P)) - config = { - "merger_id": "dedup_deep", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { - "merger_id": "pos_deep", - "type": "merger_positional", - # Ensure positional positions exist on both page 1 (1,4) and page 2 (9,12) for limit=8. - "positions": [1, 4, 9, 12], - "positional": {"subfeed_id": "sf_p", "type": "subfeed", "method_name": "p"}, - "default": { - "merger_id": "pct_deep", - "type": "merger_percentage", - "shuffle": False, - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_d1", "type": "subfeed", "method_name": "d1"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_d2", "type": "subfeed", "method_name": "d2"}}, - ], - }, - }, - } + p_items = dh.make_items("P", 1, 30) + d1_items = dh.make_items("D1", 1, 30) # overlaps P + d2_items = dh.make_items("D2", 1, 30, id_offset=100) + + config, methods_dict = dh._build_deep_positional_pct_dedup_merger( + items_p=p_items, + items_d1=d1_items, + items_d2=d2_items, + dedup_merger_id="dedup_deep", + pos_merger_id="pos_deep", + pct_merger_id="pct_deep", + # Ensure positional positions exist on both page 1 (1,4) and page 2 (9,12) for limit=8. + positions=[1, 4, 9, 12], + ) merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=8, - next_page=FeedResultNextPage(data={}), - ) + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 8) assert len(res_1.data) == 8 - _assert_no_dupes_in_page(res_1.data) + dh._assert_no_dupes_in_page(res_1.data) # Positional ownership must hold even with deep defaults. 
- assert _sources(res_1.data)[0] == "P" # position 1 - assert _sources(res_1.data)[3] == "P" # position 4 - - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=8, - next_page=res_1.next_page, - ) + dh._assert_sources_at_positions(res_1.data, [1, 4], "P") assert len(res_2.data) == 8 - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) - assert _sources(res_2.data)[0] == "P" - assert _sources(res_2.data)[3] == "P" + dh._assert_sources_at_positions(res_2.data, [1, 4], "P") @pytest.mark.parametrize( @@ -492,32 +172,19 @@ async def test_dedup_deletion_priority_works_for_deep_trees_all_merger_types(mer # "priority" is unobservable because earlier branches can fill the page). We do that by # making the low branch short and duplicate-heavy. if merger_type in {"merger_append", "merger_distribute"}: - low_items = [ - {"id": 1, "user_id": "u0", "src": "low"}, - {"id": 2, "user_id": "u1", "src": "low"}, - {"id": 3, "user_id": "u2", "src": "low"}, - {"id": 1000, "user_id": "u0", "src": "low"}, - {"id": 1001, "user_id": "u1", "src": "low"}, - ] - high_items = [{"id": i, "user_id": f"u{i%3}", "src": "high"} for i in range(1, 200)] + low_items = _make_items_by_ids("low", [1, 2, 3, 1000, 1001], user_id_mod=3) + high_items = dh.make_items("high", 1, 200, user_id_mod=3) else: - low_items = [{"id": i, "user_id": f"u{i%3}", "src": "low"} for i in range(1, 200)] - high_items = [{"id": i, "user_id": f"u{i%3}", "src": "high"} for i in range(1, 200)] + low_items = dh.make_items("low", 1, 200, user_id_mod=3) + high_items = dh.make_items("high", 1, 200, user_id_mod=3) methods_dict = { - "low": make_offset_paged_method(low_items), - "high": make_offset_paged_method(high_items), + "low": dh.make_offset_paged_method(low_items), + "high": dh.make_offset_paged_method(high_items), } - deep_tree = _build_deep_priority_tree_for_merger_type(merger_type=merger_type) - config = { - "merger_id": f"dedup_priority_deep_{merger_type}", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": deep_tree, - } + deep_tree = dh._build_deep_priority_tree_for_merger_type(merger_type=merger_type) + config = dh._dedup_config(f"dedup_priority_deep_{merger_type}", deep_tree) merger = parse_model(MergerDeduplication, config) res = await merger.get_data( @@ -527,15 +194,20 @@ async def test_dedup_deletion_priority_works_for_deep_trees_all_merger_types(mer next_page=FeedResultNextPage(data={}), ) - _assert_no_dupes_in_page(res.data) + dh._assert_no_dupes_in_page(res.data) + + # For append/distribute, priority is only observable if both branches contribute something. + if merger_type in {"merger_append", "merger_distribute"}: + sources = set(dh._sources(res.data)) + assert "high" in sources + assert "low" in sources # Priority is about which source wins overlapping ids (not about output order). - winning = {item["id"]: item["src"] for item in res.data} - assert all(winning[i] == "high" for i in range(1, 6) if i in winning) + _assert_winning_src_for_ids(res.data, range(1, 6), "high") # Placement invariant for positional: positional slots must still be owned by positional branch. 
if merger_type == "merger_positional": - sources = _sources(res.data) + sources = dh._sources(res.data) assert sources[0] == "high" assert sources[2] == "high" assert sources[4] == "high" @@ -550,52 +222,28 @@ async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cu un-inspected items. In a deep tree, this must hold for all descendant SubFeeds. """ - p_items = [{"id": 1000 + i, "src": "P"} for i in range(1, 200)] - d1_items = [{"id": i, "src": "D1"} for i in range(1, 200)] - d2_items = [{"id": 500 + i, "src": "D2"} for i in range(1, 200)] - - methods_dict = { - "p": make_offset_paged_method(p_items), - "d1": make_offset_paged_method(d1_items), - "d2": make_offset_paged_method(d2_items), - } - - config = { - "merger_id": "dedup_overfetch", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "overfetch_factor": 3, - "data": { - "merger_id": "pos_overfetch", - "type": "merger_positional", - "positions": [1, 4, 9, 12], - "positional": {"subfeed_id": "sf_p", "type": "subfeed", "method_name": "p"}, - "default": { - "merger_id": "pct_overfetch", - "type": "merger_percentage", - "shuffle": False, - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_d1", "type": "subfeed", "method_name": "d1"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_d2", "type": "subfeed", "method_name": "d2"}}, - ], - }, - }, - } + p_items = dh.make_items("P", 1, 200, id_offset=1000) + d1_items = dh.make_items("D1", 1, 200) + d2_items = dh.make_items("D2", 1, 200, id_offset=500) + + config, methods_dict = dh._build_deep_positional_pct_dedup_merger( + items_p=p_items, + items_d1=d1_items, + items_d2=d2_items, + dedup_merger_id="dedup_overfetch", + pos_merger_id="pos_overfetch", + pct_merger_id="pct_overfetch", + positions=[1, 4, 9, 12], + overfetch_factor=3, + ) merger = parse_model(MergerDeduplication, config) - # Page 1 - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=8, - next_page=FeedResultNextPage(data={}), - ) + # Page 1/2 + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 8) assert len(res_1.data) == 8 - _assert_no_dupes_in_page(res_1.data) + dh._assert_no_dupes_in_page(res_1.data) # Dedup merger cursor must exist and advance page. 
assert "dedup_overfetch" in res_1.next_page.data @@ -613,16 +261,8 @@ async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cu assert res_1.next_page.data["sf_d2"].after == 4 # Page 2 (monotonic advancement, still no over-advancement) - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=8, - next_page=res_1.next_page, - ) - assert len(res_2.data) == 8 - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) assert res_2.next_page.data["dedup_overfetch"].page == 3 assert res_2.next_page.data["pos_overfetch"].page == 3 @@ -631,7 +271,7 @@ async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cu assert res_2.next_page.data["sf_d1"].after == 8 assert res_2.next_page.data["sf_d2"].after == 8 - _assert_cursor_monotonic_if_present( + dh._assert_cursor_monotonic_if_present( res_1, res_2, keys=["sf_p", "sf_d1", "sf_d2", "pos_overfetch", "dedup_overfetch"], @@ -640,17 +280,10 @@ async def test_dedup_overfetch_factor_does_not_skip_unseen_items_in_deep_tree_cu @pytest.mark.asyncio async def test_dedup_page_zero_resets_seen_and_descendant_cursors() -> None: - items = [{"id": i, "src": "S"} for i in range(1, 50)] - methods_dict = {"s": make_offset_paged_method(items)} - - config = { - "merger_id": "dedup_reset", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": {"subfeed_id": "sf_stream", "type": "subfeed", "method_name": "s"}, - } + items = dh.make_items("S", 1, 50) + methods_dict = {"s": dh.make_offset_paged_method(items)} + + config = dh._dedup_config("dedup_reset", dh._subfeed("sf_stream", "s")) merger = parse_model(MergerDeduplication, config) @@ -660,7 +293,7 @@ async def test_dedup_page_zero_resets_seen_and_descendant_cursors() -> None: limit=5, next_page=FeedResultNextPage(data={}), ) - assert _ids(res_1.data) == [1, 2, 3, 4, 5] + assert dh._ids(res_1.data) == [1, 2, 3, 4, 5] # Simulate client "full reload": page=0 for the dedup merger. # Also include the stale descendant cursor; dedup should clear it. @@ -671,79 +304,17 @@ async def test_dedup_page_zero_resets_seen_and_descendant_cursors() -> None: next_page=FeedResultNextPage( data={ "dedup_reset": FeedResultNextPageInside(page=0, after=res_1.next_page.data["dedup_reset"].after), - "sf_stream": res_1.next_page.data["sf_stream"], + # Use a deliberately bogus descendant cursor; the dedup wrapper must ignore/reset it. + "sf_stream": FeedResultNextPageInside(page=99, after=999), } ), ) # Must restart from the beginning. - assert _ids(res_2.data) == [1, 2, 3, 4, 5] - - -@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) -@pytest.mark.asyncio -async def test_dedup_redis_backend_cross_page(redis_client) -> None: - items_a = [{"id": i, "src": "A"} for i in range(1, 300)] - # Same IDs as A to force cross-source duplicates. 
- items_b = [{"id": i, "src": "B"} for i in range(1, 300)] - - methods_dict = { - "a": make_offset_paged_method(items_a), - "b": make_offset_paged_method(items_b), - } - - config = { - "merger_id": "dedup_redis", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "data": { - "merger_id": "pct_mix", - "type": "merger_percentage", - "shuffle": False, - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, - ], - }, - } - - merger = parse_model(MergerDeduplication, config) - - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=FeedResultNextPage(data={}), - redis_client=redis_client, - custom_deduplication_key="t1", - ) - - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=res_1.next_page, - redis_client=redis_client, - custom_deduplication_key="t1", - ) - - _assert_no_dupes_in_page(res_1.data) - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) - - # Redis backend should not store seen ids in cursor after. - assert "dedup_redis" in res_2.next_page.data - assert res_2.next_page.data["dedup_redis"].after is None - - # Ensure state is persisted in Redis. - key = "dedup:dedup_redis:u:t1" - members = redis_client.zrange(key, 0, -1) - if inspect.iscoroutine(members): - members = await members - assert len(members) >= len(set(_ids(res_1.data) + _ids(res_2.data))) - + assert dh._ids(res_2.data) == [1, 2, 3, 4, 5] + # And must not propagate the bogus descendant cursor. + assert res_2.next_page.data["sf_stream"].after == 5 + assert res_2.next_page.data["sf_stream"].page == 2 @pytest.mark.asyncio async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf_cursor_exactly() -> None: @@ -754,59 +325,38 @@ async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf """ a_items = [{"id": 1, "src": "A"}, {"id": 2, "src": "A"}] - b_items = [{"id": i, "src": "B"} for i in range(1, 50)] - - methods_dict = { - "a": make_offset_paged_method(a_items), - # Force multiple internal calls. 
- "b": make_offset_paged_method(b_items, max_per_call=1), - } - - config = { - "merger_id": "dedup_append_pages", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "max_refill_loops": 50, - "data": { - "merger_id": "append_mix", - "type": "merger_append", - "shuffle": False, - "items": [ - {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - ], - }, - } - + b_items = dh.make_items("B", 1, 50) + + config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( + items_a=a_items, + items_b=b_items, + merger_id="dedup_append_pages", + child_builder=lambda sf_a, sf_b: dh._append_config("append_mix", [sf_a, sf_b]), + spec_b=dh._two_subfeed_spec(name="b", subfeed_id="sf_b", max_per_call=1), + dedup_kwargs={"max_refill_loops": 50}, + ) merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=5, - next_page=FeedResultNextPage(data={}), - ) + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 5) - assert _ids(res_1.data) == [1, 2, 3, 4, 5] + assert dh._ids(res_1.data) == [1, 2, 3, 4, 5] + assert dh._sources(res_1.data) == ["A", "A", "B", "B", "B"] assert res_1.next_page.data["dedup_append_pages"].page == 2 # In default arbitrate mode, B only needs to scan far enough to fill the remaining # portion of the page after arbitration (here: 3 items: ids 3..5). assert res_1.next_page.data["sf_b"].after == 5 + b_contributed = sum(1 for x in res_1.data if x.get("src") == "B") + assert res_1.next_page.data["sf_b"].after > b_contributed - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=5, - next_page=res_1.next_page, - ) + # A is exhausted after 2 reads; ensure cursor reflects that. + assert res_1.next_page.data["sf_a"].after == 2 assert len(res_2.data) == 5 - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) + # Across two pages, B should have advanced exactly 5 more items. 
+ assert res_2.next_page.data["sf_b"].after == 10 @pytest.mark.asyncio @@ -815,46 +365,34 @@ async def test_dedup_arbitrate_mode_runs_parallel_prefetch_and_arbitrates_winner started_b = asyncio.Event() release = asyncio.Event() - items_a = [{"id": i, "src": "A"} for i in range(1, 200)] - items_b = [{"id": i, "src": "B"} for i in range(1, 200)] - - async def method_a(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument - started_a.set() - await release.wait() - offset = int(next_page.after or 0) - data = items_a[offset : offset + limit] - next_page.after = offset + len(data) - next_page.page += 1 - return FeedResultClient(data=data, next_page=next_page, has_next_page=True) - - async def method_b(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument - started_b.set() - await release.wait() - offset = int(next_page.after or 0) - data = items_b[offset : offset + limit] - next_page.after = offset + len(data) - next_page.page += 1 - return FeedResultClient(data=data, next_page=next_page, has_next_page=True) - - methods_dict = {"a": method_a, "b": method_b} - - config = { - "merger_id": "dedup_arbitrate", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { - "merger_id": "pct", - "type": "merger_percentage", - "shuffle": False, - "items": [ - {"percentage": 50, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, - {"percentage": 50, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, - ], - }, + items_a = dh.make_items("A", 1, 200) + items_b = dh.make_items("B", 1, 200) + + def make_method(*, items, started_event): + async def _method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + started_event.set() + await release.wait() + offset = int(next_page.after or 0) + data = items[offset : offset + limit] + next_page.after = offset + len(data) + next_page.page += 1 + return FeedResultClient(data=data, next_page=next_page, has_next_page=True) + + return _method + + methods_dict = { + "a": make_method(items=items_a, started_event=started_a), + "b": make_method(items=items_b, started_event=started_b), } + config = dh._dedup_config( + "dedup_arbitrate", + dh._percentage_config( + "pct", + items=dh._percentage_items(dh._subfeed("sf_a", "a"), dh._subfeed("sf_b", "b")), + ), + ) + merger = parse_model(MergerDeduplication, config) task = asyncio.create_task( @@ -869,10 +407,9 @@ async def method_b(user_id, limit, next_page, **kwargs): # pylint: disable=unus res = await asyncio.wait_for(task, timeout=2) assert len(res.data) == 10 - _assert_no_dupes_in_page(res.data) + dh._assert_no_dupes_in_page(res.data) # With equal priorities, stable tie-breaker should pick A (first branch) for overlapping keys. - winning = {item["id"]: item["src"] for item in res.data} - assert all(winning[i] == "A" for i in range(1, 6)) + _assert_winning_src_for_ids(res.data, range(1, 6), "A") @pytest.mark.asyncio @@ -883,41 +420,26 @@ async def test_dedup_refill_loops_advance_dict_after_cursor_not_just_page() -> N a_items = [{"id": 1, "src": "A"}, {"id": 2, "src": "A"}] # B produces ids 1.. in round-robin across profiles; cursor is per-profile offsets. 
- b_profiles = { - "p0": [{"id": 1, "src": "B"}, {"id": 3, "src": "B"}, {"id": 5, "src": "B"}, {"id": 7, "src": "B"}], - "p1": [{"id": 2, "src": "B"}, {"id": 4, "src": "B"}, {"id": 6, "src": "B"}, {"id": 8, "src": "B"}], - } + b_profiles = PROFILES_B_1_TO_8 methods_dict = { - "a": make_offset_paged_method(a_items), - "b": make_profile_dict_after_method(b_profiles), + "a": dh.make_offset_paged_method(a_items), + "b": dh.make_profile_dict_after_method(b_profiles), } # Use a percentage merger so B is asked for a small limit (2 items for limit=4). # This forces refill loops when B's first batch is all duplicates. - config = { - "merger_id": "dedup_dict_after", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "max_refill_loops": 50, - "data": { - "merger_id": "pct_mix", - "type": "merger_percentage", - "shuffle": False, - "items": [ - { - "percentage": 50, - "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a", "dedup_priority": 100}, - }, - { - "percentage": 50, - "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b", "dedup_priority": 0}, - }, - ], - }, - } + config = dh._dedup_config( + "dedup_dict_after", + dh._percentage_config( + "pct_mix", + items=dh._percentage_items( + dh._subfeed("sf_a", "a", dedup_priority=100), + dh._subfeed("sf_b", "b", dedup_priority=0), + ), + ), + max_refill_loops=50, + ) merger = parse_model(MergerDeduplication, config) res = await merger.get_data( @@ -928,8 +450,8 @@ async def test_dedup_refill_loops_advance_dict_after_cursor_not_just_page() -> N ) assert len(res.data) == 4 - _assert_no_dupes_in_page(res.data) - assert set(_ids(res.data)) == {1, 2, 3, 4} + dh._assert_no_dupes_in_page(res.data) + assert set(dh._ids(res.data)) == {1, 2, 3, 4} assert "sf_b" in res.next_page.data assert isinstance(res.next_page.data["sf_b"].after, dict) @@ -944,24 +466,17 @@ async def test_dedup_overfetch_does_not_overadvance_non_int_after_cursor() -> No """overfetch_factor must not cause over-advancement for non-rewindable cursors.""" # Single subfeed with dict after cursor; no dedup skips should happen. - profiles = { - "p0": [{"id": 1, "src": "B"}, {"id": 3, "src": "B"}, {"id": 5, "src": "B"}, {"id": 7, "src": "B"}], - "p1": [{"id": 2, "src": "B"}, {"id": 4, "src": "B"}, {"id": 6, "src": "B"}, {"id": 8, "src": "B"}], - } + profiles = PROFILES_B_1_TO_8 methods_dict = { - "b": make_profile_dict_after_method(profiles), + "b": dh.make_profile_dict_after_method(profiles), } - config = { - "merger_id": "dedup_nonint_overfetch", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "overfetch_factor": 5, - "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - } + config = dh._dedup_config( + "dedup_nonint_overfetch", + dh._subfeed("sf_b", "b"), + overfetch_factor=5, + ) merger = parse_model(MergerDeduplication, config) res = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=4, next_page=FeedResultNextPage(data={})) @@ -984,39 +499,21 @@ async def test_dedup_overfetch_rewinds_offset_cursor_when_first_batch_all_duplic so it doesn't skip items. 
""" - items_a = [{"id": i, "src": "A"} for i in range(1, 300)] - items_b = [{"id": i, "src": "B"} for i in range(1, 300)] - - methods_dict = { - "a": make_offset_paged_method(items_a), - "b": make_offset_paged_method(items_b), - } - - config = { - "merger_id": "dedup_overfetch_rewind", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "overfetch_factor": 3, - "max_refill_loops": 20, - "data": { - "merger_id": "pct_mix", - "type": "merger_percentage", - "shuffle": False, - "items": [ - { - "percentage": 50, - "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a", "dedup_priority": 100}, - }, - { - "percentage": 50, - "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b", "dedup_priority": 0}, - }, - ], - }, - } + items_a = dh.make_items("A", 1, 300) + items_b = dh.make_items("B", 1, 300) + config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( + items_a=items_a, + items_b=items_b, + merger_id="dedup_overfetch_rewind", + child_builder=lambda sf_a, sf_b: dh._percentage_config( + "pct_mix", + items=dh._percentage_items(sf_a, sf_b), + ), + spec_a=dh._two_subfeed_spec(dedup_priority=100), + spec_b=dh._two_subfeed_spec(name="b", subfeed_id="sf_b", dedup_priority=0), + dedup_kwargs={"overfetch_factor": 3, "max_refill_loops": 20}, + ) merger = parse_model(MergerDeduplication, config) res = await merger.get_data( methods_dict=methods_dict, @@ -1026,12 +523,11 @@ async def test_dedup_overfetch_rewinds_offset_cursor_when_first_batch_all_duplic ) assert len(res.data) == 10 - _assert_no_dupes_in_page(res.data) + dh._assert_no_dupes_in_page(res.data) # A provides 1..5, B must provide 6..10. - winning = {item["id"]: item["src"] for item in res.data} - assert all(winning[i] == "A" for i in range(1, 6)) - assert all(winning[i] == "B" for i in range(6, 11)) + _assert_winning_src_for_ids(res.data, range(1, 6), "A") + _assert_winning_src_for_ids(res.data, range(6, 11), "B") # Cursor rewind check: # - First loop for B reads 5 duplicates -> after becomes 5 @@ -1039,147 +535,132 @@ async def test_dedup_overfetch_rewinds_offset_cursor_when_first_batch_all_duplic assert res.next_page.data["sf_b"].after == 10 +@pytest.mark.parametrize( + "items_a,items_b,min_b_id", + [ + (dh.make_items("A", 1, 4, user_id_mod=2), dh.make_items("B", 1, 200, user_id_mod=2), 4), + (dh.make_items("A", 1, 200, user_id_mod=3), dh.make_items("B", 1, 200, user_id_mod=3), None), + ], +) @pytest.mark.asyncio -async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_refill() -> None: +async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_refill( + items_a, items_b, min_b_id +) -> None: """Distribute: duplicates skipped per-leaf and page slices don't overlap.""" - # A is short so B must contribute. - items_a = [{"id": i, "user_id": f"u{i%2}", "src": "A"} for i in range(1, 4)] - # B overlaps A by id and continues. 
- items_b = [{"id": i, "user_id": f"u{i%2}", "src": "B"} for i in range(1, 200)] - - methods_dict = { - "a": make_offset_paged_method(items_a), - "b": make_offset_paged_method(items_b), - } - - config = { - "merger_id": "dedup_dist_pages", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { - "merger_id": "dist", - "type": "merger_distribute", - "distribution_key": "user_id", - "items": [ - {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - ], - }, - } - - merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={}) + config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( + items_a=items_a, + items_b=items_b, + merger_id="dedup_dist_pages", + child_builder=lambda sf_a, sf_b: dh._distribute_config("dist", [sf_a, sf_b]), ) - res_2 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=res_1.next_page) + merger = parse_model(MergerDeduplication, config) + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 10) assert len(res_1.data) == 10 assert len(res_2.data) == 10 - _assert_no_dupes_in_page(res_1.data) - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) # Placement/refill: B must skip duplicate ids 1..3 and still fill the page. - b_ids_1 = [x["id"] for x in res_1.data if x.get("src") == "B"] - assert b_ids_1 and min(b_ids_1) >= 4 + if min_b_id is not None: + b_ids_1 = [x["id"] for x in res_1.data if x.get("src") == "B"] + assert b_ids_1 and min(b_ids_1) >= min_b_id @pytest.mark.asyncio async def test_dedup_percentage_gradient_cursor_backend_across_pages() -> None: - a_items = [{"id": i, "src": "A"} for i in range(1, 300)] - b_items = [{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)] + a_items = dh.make_items("A", 1, 300) + b_items = dh.make_items("B", 1, 30) + dh.make_items("B", 1, 300, id_offset=1000) methods_dict = { - "a": make_offset_paged_method(a_items), - "b": make_offset_paged_method(b_items), - } - - config = { - "merger_id": "dedup_grad_pages", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "max_refill_loops": 50, - "data": { - "merger_id": "grad_mix", - "type": "merger_percentage_gradient", - "item_from": {"percentage": 60, "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}}, - "item_to": {"percentage": 40, "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}}, - "step": 20, - "size_to_step": 5, - "shuffle": False, - }, - } + "a": dh.make_offset_paged_method(a_items), + "b": dh.make_offset_paged_method(b_items), + } + + config = dh._dedup_config( + "dedup_grad_pages", + dh._gradient_config( + "grad_mix", + item_from={"percentage": 60, "data": dh._subfeed("sf_a", "a")}, + item_to={"percentage": 40, "data": dh._subfeed("sf_b", "b")}, + step=20, + size_to_step=5, + shuffle=False, + ), + max_refill_loops=50, + ) merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, user_id="u", limit=10, next_page=FeedResultNextPage(data={}) - ) - res_2 = await merger.get_data(methods_dict=methods_dict, user_id="u", limit=10, next_page=res_1.next_page) + res_1, res_2 = await dh._run_two_pages(merger, methods_dict, 
10) - _assert_no_dupes_in_page(res_1.data) - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) + + sources = dh._sources(res_1.data) + assert sources == ["A", "A", "A", "B", "B", "A", "A", "B", "B", "B"] # Gradient merger cursor should exist and advance. assert res_1.next_page.data["grad_mix"].page == 2 assert res_2.next_page.data["grad_mix"].page == 3 +@pytest.mark.parametrize( + "merger_id,custom_deduplication_key,items_a,items_b,child_builder", + [ + ( + "dedup_redis", + "t1", + dh.make_items("A", 1, 300), + dh.make_items("B", 1, 300), # Same IDs as A to force cross-source duplicates. + lambda sf_a, sf_b: dh._percentage_config("pct_mix", items=dh._percentage_items(sf_a, sf_b)), + ), + ( + "dedup_redis_append", + "t2", + dh.make_items("A", 1, 20), + dh.make_items("B", 1, 300), + lambda sf_a, sf_b: dh._append_config("append_mix", [sf_a, sf_b]), + ), + ], +) @pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio -async def test_dedup_redis_backend_cross_page_append(redis_client) -> None: - items_a = [{"id": i, "src": "A"} for i in range(1, 20)] - items_b = [{"id": i, "src": "B"} for i in range(1, 300)] - - methods_dict = { - "a": make_offset_paged_method(items_a), - "b": make_offset_paged_method(items_b), - } - - config = { - "merger_id": "dedup_redis_append", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "redis", - "state_ttl_seconds": 60, - "data": { - "merger_id": "append_mix", - "type": "merger_append", - "shuffle": False, - "items": [ - {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - ], - }, - } - - merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=FeedResultNextPage(data={}), - redis_client=redis_client, - custom_deduplication_key="t2", +async def test_dedup_redis_backend_cross_page( + redis_client, + merger_id, + custom_deduplication_key, + items_a, + items_b, + child_builder, +) -> None: + config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( + items_a=items_a, + items_b=items_b, + merger_id=merger_id, + child_builder=child_builder, + dedup_kwargs={"state_backend": "redis", "state_ttl_seconds": 60}, ) - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=res_1.next_page, + merger = parse_model(MergerDeduplication, config) + + res_1, res_2 = await dh._run_two_pages( + merger, + methods_dict, + 10, redis_client=redis_client, - custom_deduplication_key="t2", + custom_deduplication_key=custom_deduplication_key, ) - _assert_no_dupes_in_page(res_1.data) - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) - assert res_2.next_page.data["dedup_redis_append"].after is None + dh._assert_two_pages_no_dupes(res_1, res_2) + + # Redis backend should not store seen ids in cursor after. + assert merger_id in res_2.next_page.data + assert res_2.next_page.data[merger_id].after is None + + # Ensure state is persisted in Redis. 
+ key = f"dedup:{merger_id}:u:{custom_deduplication_key}" + members = redis_client.zrange(key, 0, -1) + if inspect.iscoroutine(members): + members = await members + assert len(members) >= len(set(dh._ids(res_1.data) + dh._ids(res_2.data))) @pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @@ -1188,116 +669,46 @@ async def test_dedup_wrapper_with_view_session_merger(redis_client) -> None: """Dedup wrapper must work when the child is a view_session merger.""" # Two leaves with overlapping ids; view_session computes a session once. - items_low = [{"id": i, "src": "low"} for i in range(1, 100)] - items_high = [{"id": i, "src": "high"} for i in range(1, 100)] - - methods_dict = { - "low": make_offset_paged_method(items_low), - "high": make_offset_paged_method(items_high), - } + items_low = dh.make_items("low", 1, 100) + items_high = dh.make_items("high", 1, 100) + + methods_dict, subfeed_low, subfeed_high = dh._build_two_subfeed_methods( + items_low, + items_high, + spec_a=dh._two_subfeed_spec(name="low", subfeed_id="sf_low", dedup_priority=0), + spec_b=dh._two_subfeed_spec(name="high", subfeed_id="sf_high", dedup_priority=100), + ) - config = { - "merger_id": "dedup_vs", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { + config = dh._dedup_config( + "dedup_vs", + { "merger_id": "vs", "type": "merger_view_session", "session_size": 30, "session_live_time": 60, "deduplicate": False, "shuffle": False, - "data": { - "merger_id": "pct", - "type": "merger_percentage", - "shuffle": False, - "items": [ - { - "percentage": 50, - "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low", "dedup_priority": 0}, - }, - { - "percentage": 50, - "data": { - "subfeed_id": "sf_high", - "type": "subfeed", - "method_name": "high", - "dedup_priority": 100, - }, - }, - ], - }, + "data": dh._percentage_config( + "pct", + items=dh._percentage_items(subfeed_low, subfeed_high), + ), }, - } + ) merger = parse_model(MergerDeduplication, config) - res_1 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=FeedResultNextPage(data={}), + res_1, res_2 = await dh._run_two_pages( + merger, + methods_dict, + 10, redis_client=redis_client, custom_view_session_key="vs1", ) - res_2 = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=res_1.next_page, - redis_client=redis_client, - custom_view_session_key="vs1", - ) - - _assert_no_dupes_in_page(res_1.data) - _assert_no_dupes_in_page(res_2.data) - _assert_pages_no_overlap(res_1, res_2) + dh._assert_two_pages_no_dupes(res_1, res_2) # Deletion priority: for the overlapping early ids, the winning entity must be from high. 
- winning = {item["id"]: item["src"] for item in (res_1.data + res_2.data)} - assert all(winning[i] == "high" for i in range(1, 11) if i in winning) - - -@pytest.mark.asyncio -async def test_dedup_append_distribute_cursor_backend_no_dupes() -> None: - items_a = [{"id": i, "user_id": f"u{i%3}", "src": "A"} for i in range(1, 200)] - items_b = [{"id": i, "user_id": f"u{i%3}", "src": "B"} for i in range(1, 200)] - - methods_dict = { - "a": make_offset_paged_method(items_a), - "b": make_offset_paged_method(items_b), - } - - config = { - "merger_id": "dedup_dist", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { - "merger_id": "dist", - "type": "merger_distribute", - "distribution_key": "user_id", - "items": [ - {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - ], - }, - } - - merger = parse_model(MergerDeduplication, config) - res = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=30, - next_page=FeedResultNextPage(data={}), - ) - - assert len(res.data) == 30 - _assert_no_dupes_in_page(res.data) + _assert_winning_src_for_ids(res_1.data + res_2.data, range(1, 11), "high") @pytest.mark.asyncio @@ -1308,37 +719,20 @@ async def test_dedup_in_page_deletion_priority_keeps_high_priority_even_if_confi The "high" branch is second in config, but has higher dedup_priority. """ - low_items = [{"id": i, "src": "low"} for i in range(1, 200)] - high_items = [{"id": i, "src": "high"} for i in range(1, 200)] - - methods_dict = { - "low": make_offset_paged_method(low_items), - "high": make_offset_paged_method(high_items), - } - - config = { - "merger_id": "dedup_priority", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "data": { - "merger_id": "pct", - "type": "merger_percentage", - "shuffle": False, - "items": [ - { - "percentage": 50, - "data": {"subfeed_id": "sf_low", "type": "subfeed", "method_name": "low", "dedup_priority": 0}, - }, - { - "percentage": 50, - "data": {"subfeed_id": "sf_high", "type": "subfeed", "method_name": "high", "dedup_priority": 100}, - }, - ], - }, - } + low_items = dh.make_items("low", 1, 200) + high_items = dh.make_items("high", 1, 200) + config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( + items_a=low_items, + items_b=high_items, + merger_id="dedup_priority", + child_builder=lambda sf_a, sf_b: dh._percentage_config( + "pct", + items=dh._percentage_items(sf_a, sf_b), + ), + spec_a=dh._two_subfeed_spec(name="low", subfeed_id="sf_low", dedup_priority=0), + spec_b=dh._two_subfeed_spec(name="high", subfeed_id="sf_high", dedup_priority=100), + ) merger = parse_model(MergerDeduplication, config) res = await merger.get_data( methods_dict=methods_dict, @@ -1347,127 +741,9 @@ async def test_dedup_in_page_deletion_priority_keeps_high_priority_even_if_confi next_page=FeedResultNextPage(data={}), ) - _assert_no_dupes_in_page(res.data) + dh._assert_no_dupes_in_page(res.data) # Priority is about which source "wins" for a given dedup_key, not about output order. # With 50/50 limits, the high-priority branch should supply ids 1..5, while the low-priority # branch will be advanced to avoid duplicates. 
- winning = {item["id"]: item["src"] for item in res.data} - assert all(winning[i] == "high" for i in range(1, 6)) - - -@pytest.mark.asyncio -async def test_dedup_percentage_gradient_slot_ownership_cursor_backend() -> None: - """Dedup must preserve gradient chunking semantics. + _assert_winning_src_for_ids(res.data, range(1, 6), "high") - For limit=10, size_to_step=5, from/to percentages should yield chunks: - - first 5: 3 from A, 2 from B - - next 5: 2 from A, 3 from B - Dedup must refill within each leaf so these chunk sizes remain true. - """ - - a_items = [{"id": i, "src": "A"} for i in range(1, 300)] - # Start with duplicates, then provide unique tail. - b_items = [{"id": i, "src": "B"} for i in range(1, 30)] + [{"id": 1000 + i, "src": "B"} for i in range(1, 300)] - - methods_dict = { - "a": make_offset_paged_method(a_items), - "b": make_offset_paged_method(b_items), - } - - config = { - "merger_id": "dedup_gradient", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "max_refill_loops": 50, - "data": { - "merger_id": "grad_mix", - "type": "merger_percentage_gradient", - "item_from": { - "percentage": 60, - "data": {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, - }, - "item_to": { - "percentage": 40, - "data": {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - }, - "step": 20, - "size_to_step": 5, - "shuffle": False, - }, - } - - merger = parse_model(MergerDeduplication, config) - res = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=10, - next_page=FeedResultNextPage(data={}), - ) - - assert len(res.data) == 10 - _assert_no_dupes_in_page(res.data) - - sources = _sources(res.data) - assert sources[:3] == ["A", "A", "A"] - assert sources[3:5] == ["B", "B"] - assert sources[5:7] == ["A", "A"] - assert sources[7:10] == ["B", "B", "B"] - - -@pytest.mark.asyncio -async def test_dedup_preserves_append_priority_and_advances_cursors_cursor_backend() -> None: - """Append order is the priority signal; dedup must not let later sources win duplicates. - - Also asserts that a leaf cursor advances even when items are skipped as duplicates. - """ - - a_items = [ - {"id": 1, "src": "A"}, - {"id": 2, "src": "A"}, - ] - # B repeats A's ids first, then continues with unique ids. - b_items = [{"id": i, "src": "B"} for i in range(1, 50)] - - methods_dict = { - "a": make_offset_paged_method(a_items), - "b": make_offset_paged_method(b_items), - } - - config = { - "merger_id": "dedup_append", - "type": "merger_deduplication", - "dedup_key": "id", - "state_backend": "cursor", - "cursor_compress": True, - "max_refill_loops": 20, - "data": { - "merger_id": "append_mix", - "type": "merger_append", - "shuffle": False, - "items": [ - {"subfeed_id": "sf_a", "type": "subfeed", "method_name": "a"}, - {"subfeed_id": "sf_b", "type": "subfeed", "method_name": "b"}, - ], - }, - } - - merger = parse_model(MergerDeduplication, config) - res = await merger.get_data( - methods_dict=methods_dict, - user_id="u", - limit=5, - next_page=FeedResultNextPage(data={}), - ) - - assert _ids(res.data) == [1, 2, 3, 4, 5] - assert _sources(res.data)[:2] == ["A", "A"] - assert _sources(res.data)[2:] == ["B", "B", "B"] - - # B had to scan past duplicated ids 1 and 2, so its cursor should advance - # farther than the number of items it contributed to the final page. 
- assert "sf_b" in res.next_page.data - assert isinstance(res.next_page.data["sf_b"].after, int) - b_contributed = sum(1 for x in res.data if x.get("src") == "B") - assert res.next_page.data["sf_b"].after > b_contributed From 70e0006cf4e1ae9f8964d0d27d33ec0d34b50943 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sat, 7 Feb 2026 15:18:27 +0000 Subject: [PATCH 17/33] Minor cleanup. --- smartfeed/examples/example_client.py | 85 ++---------- smartfeed/execution/context.py | 16 ++- smartfeed/execution/executor.py | 75 +++-------- smartfeed/manager.py | 14 +- smartfeed/mergers/append.py | 10 +- smartfeed/mergers/append_distribute.py | 16 +-- smartfeed/mergers/deduplication.py | 69 +++++----- smartfeed/mergers/percentage.py | 16 +-- smartfeed/mergers/percentage_gradient.py | 12 +- smartfeed/mergers/positional.py | 10 +- smartfeed/mergers/view_session.py | 156 ++++------------------- smartfeed/policies/dedup_utils.py | 14 +- smartfeed/policies/seen_store.py | 9 +- smartfeed/pydantic_compat.py | 16 +++ tests/test_merger_deduplication.py | 17 +-- tests/test_merger_view_session.py | 34 ++--- tests/utils.py | 15 +-- 17 files changed, 165 insertions(+), 419 deletions(-) create mode 100644 smartfeed/pydantic_compat.py diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index d11e00a..a24e130 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -4,14 +4,12 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator from smartfeed import jsonlib as json +from smartfeed.pydantic_compat import parse_model from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside -from tests.utils import parse_model class TestClientRequest(BaseModel): - """ - Пример модели клиентского входящего запроса. - """ + """Example client request model.""" profile_id: str = Field(...) limit: int = Field(...) @@ -26,18 +24,13 @@ class TestClientRequest(BaseModel): def validate_next_page(cls, value: Union[str, FeedResultNextPage]) -> Union[str, FeedResultNextPage]: if isinstance(value, str): payload = json.loads(base64.urlsafe_b64decode(value)) - validate = getattr(FeedResultNextPage, "model_validate", None) - if validate is not None: - return validate(payload) - return parse_model(FeedResultNextPage, payload) # type: ignore + return parse_model(FeedResultNextPage, payload) return value class ClientMixerClass: - """ - Пример клиентского класса ClientMixer. - """ + """Example client methods for SmartFeed.""" @staticmethod async def example_method( @@ -46,16 +39,6 @@ async def example_method( next_page: FeedResultNextPageInside, limit_to_return: Optional[int] = None, ) -> FeedResultClient: - """ - Пример клиентского метода. - - :param user_id: ID профиля. - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param limit_to_return: ограничить кол-во результата. - :return: массив букв "profile_id" в количестве "limit" штук. 
- """ - data = [f"{user_id}_{i}" for i in range(1, 1000)] from_index = (data.index(next_page.after) + 1) if next_page.after else 0 @@ -68,9 +51,7 @@ async def example_method( next_page.after = result_data[-1] if result_data else None next_page.page += 1 - - result = FeedResultClient(data=result_data, next_page=next_page, has_next_page=True) - return result + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=True) @staticmethod async def empty_method( @@ -79,21 +60,9 @@ async def empty_method( next_page: FeedResultNextPageInside, limit_to_return: Optional[int] = None, # pylint: disable=W0613 ) -> FeedResultClient: - """ - Пример клиентского метода, возвращающего пустые данные. - - :param user_id: ID профиля. - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param limit_to_return: ограничить кол-во результата. - :return: массив букв "profile_id" в количестве "limit" штук. - """ - next_page.after = None next_page.page += 1 - - result = FeedResultClient(data=[], next_page=next_page, has_next_page=False) - return result + return FeedResultClient(data=[], next_page=next_page, has_next_page=False) @staticmethod async def error_method( @@ -102,21 +71,9 @@ async def error_method( next_page: FeedResultNextPageInside, limit_to_return: Optional[int] = None, # pylint: disable=W0613 ) -> FeedResultClient: - """ - Пример клиентского метода, возвращающего пустые данные. - - :param user_id: ID профиля. - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param limit_to_return: ограничить кол-во результата. - :return: массив букв "profile_id" в количестве "limit" штук. - """ - next_page.after = None next_page.page = int(10 / 0) - - result = FeedResultClient(data=[], next_page=next_page, has_next_page=False) - return result + return FeedResultClient(data=[], next_page=next_page, has_next_page=False) @staticmethod async def doubles_method( @@ -125,23 +82,11 @@ async def doubles_method( next_page: FeedResultNextPageInside, limit_to_return: Optional[int] = None, # pylint: disable=W0613 ) -> FeedResultClient: - """ - Пример клиентского метода, возвращающего данные с дублями. - - :param user_id: ID профиля. - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param limit_to_return: ограничить кол-во результата. - :return: массив целых чисел, равный [i for i in range(1, 11)] после удаления дублей. - """ - data = [1, 2, 3, 4, 3, 2, 5, 6, 4, 4, 7, 8, 9, 10, 9, 9, 9] next_page.after = None next_page.page += 1 - - result = FeedResultClient(data=data, next_page=next_page, has_next_page=False) - return result + return FeedResultClient(data=data, next_page=next_page, has_next_page=False) @staticmethod async def keys_method( @@ -150,16 +95,6 @@ async def keys_method( next_page: FeedResultNextPageInside, limit_to_return: Optional[int] = None, ) -> FeedResultClient: - """ - Пример клиентского метода. - - :param user_id: ID профиля. - :param limit: кол-во элементов. - :param next_page: курсор пагинации. - :param limit_to_return: ограничить кол-во результата. - :return: массив букв "profile_id" в количестве "limit" штук. 
- """ - data = [{"user_id": f"{user_id}_{i%10}", "value": i} for i in range(1, 1000)] from_index = (data.index(next_page.after) + 1) if next_page.after else 0 @@ -172,6 +107,4 @@ async def keys_method( next_page.after = result_data[-1] if result_data else None next_page.page += 1 - - result = FeedResultClient(data=result_data, next_page=next_page, has_next_page=True) - return result + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=True) diff --git a/smartfeed/execution/context.py b/smartfeed/execution/context.py index 4a118ea..134c261 100644 --- a/smartfeed/execution/context.py +++ b/smartfeed/execution/context.py @@ -26,13 +26,21 @@ class ExecutionContext: # Execution settings (optional) refill_settings: Optional["RefillExecutionSettings"] = None - dedup_settings: Optional["DedupExecutionSettings"] = None + dedup_settings: Optional["RefillExecutionSettings"] = None + + def ensure_redis_client(self, redis_client: Optional[Union[redis.Redis, AsyncRedis]]) -> None: + if self.redis_client is None and redis_client is not None: + self.redis_client = redis_client + + def ensure_executor(self) -> Any: + if self.executor is None: + from .executor import Executor + + self.executor = Executor() + return self.executor @dataclass(frozen=True) class RefillExecutionSettings: overfetch_factor: int = 1 max_refill_loops: int = 20 - - -DedupExecutionSettings = RefillExecutionSettings diff --git a/smartfeed/execution/executor.py b/smartfeed/execution/executor.py index 20e3cbc..cd3ba4c 100644 --- a/smartfeed/execution/executor.py +++ b/smartfeed/execution/executor.py @@ -286,19 +286,11 @@ def _compute_slot_deficits( page_underfilled = remaining > 0 - if not quota_schedule and not page_underfilled: - return {} - - deficits: Dict[int, int] = {} - if quota_schedule: return self._compute_quota_deficits(plan=plan, owner_buffers=owner_buffers) - - return self._compute_fill_deficits( - plan=plan, - remaining=remaining, - deficit_slots=deficit_slots, - ) + if not page_underfilled: + return {} + return self._compute_fill_deficits(plan=plan, remaining=remaining, deficit_slots=deficit_slots) def _compute_quota_deficits( self, @@ -337,21 +329,12 @@ def _compute_fill_deficits( remaining: int, deficit_slots: List[int], ) -> Dict[int, int]: - deficits: Dict[int, int] = {} to_fill = int(remaining) if to_fill <= 0: - return deficits - - if deficit_slots: - deficits[deficit_slots[-1]] = deficits.get(deficit_slots[-1], 0) + to_fill - return deficits - - if plan.slots: - last_owner_id = id(plan.slots[-1].owner) - deficits[last_owner_id] = deficits.get(last_owner_id, 0) + to_fill - return deficits + return {} - return deficits + owner_id = deficit_slots[-1] if deficit_slots else (id(plan.slots[-1].owner) if plan.slots else None) + return {owner_id: to_fill} if owner_id is not None else {} async def _refill_deficits( self, @@ -418,10 +401,11 @@ async def _refill_deficits( continue base_np = owner_state["current_next_page"] - request_limit = max(1, int(owner_state["remaining"])) + remaining_before = max(1, int(owner_state["remaining"])) + request_limit = remaining_before can_overfetch = CursorMap.can_overfetch(node=refill_owner, base_next_page=base_np) if can_overfetch and overfetch_factor > 1: - request_limit = max(1, int(owner_state["remaining"]) * overfetch_factor) + request_limit = max(1, remaining_before * overfetch_factor) wave_ops.append((refill_owner, refill_owner_id, base_np, request_limit, can_overfetch)) @@ -443,43 +427,26 @@ async def _refill_deficits( for (owner, owner_id, base_np, 
request_limit, can_overfetch), result in zip(wave_ops, results): owner_state = state[owner_id] - owner_state["last_result"] = result - owner_state["last_request_limit"] = request_limit - owner_state["last_can_overfetch"] = can_overfetch - owner_state["last_base_next_page"] = base_np + remaining_before = int(owner_state["remaining"]) + owner_state["current_next_page"] = result.next_page owner_state["has_next_page"] = bool(result.has_next_page) + cursor.merge_delta(base_next_page=plan.next_page, owner_next_page=result.next_page) - cursor.merge_delta( - base_next_page=plan.next_page, - owner_next_page=result.next_page, - ) - - for refill_owner in deficit_owners: - refill_owner_id = id(refill_owner) - owner_state = state.get(refill_owner_id) - if owner_state is None: - continue - if owner_state["remaining"] <= 0: - continue - last_result = owner_state["last_result"] - if last_result is None: - continue - - refill_prio = int(getattr(refill_owner, "dedup_priority", 0)) + refill_prio = int(getattr(owner, "dedup_priority", 0)) wave_accepted, inspected_count = await dedup_policy.accept_batch( - items=list(last_result.data), + items=list(result.data), priority=refill_prio, - limit=int(owner_state["remaining"]), + limit=max(0, remaining_before), ) - if owner_state["last_can_overfetch"] and owner_state["last_request_limit"] > owner_state["remaining"]: + if can_overfetch and request_limit > remaining_before: CursorMap.rewind_overfetch( - node=refill_owner, - base_next_page=owner_state["last_base_next_page"], - result_next_page=owner_state["current_next_page"], + node=owner, + base_next_page=base_np, + result_next_page=result.next_page, inspected_count=inspected_count, - batch_size=len(last_result.data), + batch_size=len(result.data), ) if wave_accepted: @@ -489,8 +456,6 @@ async def _refill_deficits( if owner_state["remaining"] > 0 and owner_state["has_next_page"]: owner_state["loops"] += 1 - owner_state["last_result"] = None - for refill_owner in deficit_owners: refill_owner_id = id(refill_owner) owner_state = state.get(refill_owner_id) diff --git a/smartfeed/manager.py b/smartfeed/manager.py index 3c0ea05..d6c7a76 100644 --- a/smartfeed/manager.py +++ b/smartfeed/manager.py @@ -4,9 +4,8 @@ from redis.asyncio import Redis as AsyncRedis from .execution.context import ExecutionContext -from .execution.executor import Executor +from .pydantic_compat import parse_model from .schemas import FeedConfig, FeedResult, FeedResultNextPage -from tests.utils import parse_model class FeedManager: @@ -23,11 +22,7 @@ def __init__(self, config: Dict, methods_dict: Dict, redis_client: Optional[Unio :param redis_client: объект клиента Redis (для конфигурации с view_session = True). 
""" - validate = getattr(FeedConfig, "model_validate", None) - if validate is not None: - self.feed_config = validate(config) - else: - self.feed_config = parse_model(FeedConfig, config) # type: ignore + self.feed_config = parse_model(FeedConfig, config) self.methods_dict = methods_dict self.redis_client = redis_client @@ -43,6 +38,5 @@ async def get_data(self, user_id: Any, limit: int, next_page: FeedResultNextPage """ ctx = ExecutionContext(methods_dict=self.methods_dict, user_id=user_id, redis_client=self.redis_client) - ctx.executor = Executor() - result = await ctx.executor.run(self.feed_config.feed, ctx, limit, next_page, **params) - return result + executor = ctx.ensure_executor() + return await executor.run(self.feed_config.feed, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/append.py b/smartfeed/mergers/append.py index e4fc609..a415159 100644 --- a/smartfeed/mergers/append.py +++ b/smartfeed/mergers/append.py @@ -62,10 +62,8 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + else: + ctx.ensure_redis_client(redis_client) - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() - - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/append_distribute.py b/smartfeed/mergers/append_distribute.py index 442ee00..3e0a8e1 100644 --- a/smartfeed/mergers/append_distribute.py +++ b/smartfeed/mergers/append_distribute.py @@ -26,7 +26,7 @@ class MergerAppendDistribute(BaseFeedConfigModel): sorting_desc: bool = False @no_type_check - async def _uniform_distribute(self, data: list) -> list: + def _uniform_distribute(self, data: list) -> list: if self.sorting_key: data = sorted(data, key=lambda x: x[self.sorting_key], reverse=self.sorting_desc) @@ -58,11 +58,11 @@ def build_plan( ) -> SlotsPlan: slots = [SlotSpec(owner=item, max_count=limit) for item in self.items] - async def _assemble( + def _assemble( output: List[Any], merged_next_page: FeedResultNextPage, owner_results: Dict[int, FeedResult] ) -> FeedResult: has_next_page = any(r.has_next_page for r in owner_results.values()) - distributed = await self._uniform_distribute(output) + distributed = self._uniform_distribute(output) return FeedResult(data=distributed, next_page=merged_next_page, has_next_page=has_next_page) return SlotsPlan( @@ -86,10 +86,8 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + else: + ctx.ensure_redis_client(redis_client) - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() - - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/deduplication.py b/smartfeed/mergers/deduplication.py index 94c9841..2686ac9 100644 --- a/smartfeed/mergers/deduplication.py +++ b/smartfeed/mergers/deduplication.py @@ -54,37 +54,34 @@ def validate_merger_deduplication(self) -> "MergerDeduplication": def _collect_descendant_cursor_keys(self, feed: BaseFeedConfigModel) -> set[str]: keys: set[str] = set() - - subfeed_id = getattr(feed, "subfeed_id", None) - if isinstance(subfeed_id, str) and subfeed_id: - keys.add(subfeed_id) - - merger_id = 
getattr(feed, "merger_id", None) - if isinstance(merger_id, str) and merger_id: - keys.add(merger_id) - - child: Any - for attr_name in ("data", "positional", "default"): - child = getattr(feed, attr_name, None) - if isinstance(child, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(child)) - - for attr_name in ("item_from", "item_to"): - child = getattr(feed, attr_name, None) - inner = getattr(child, "data", None) - if isinstance(inner, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(inner)) - - items = getattr(feed, "items", None) - if isinstance(items, list): - for item in items: - if isinstance(item, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(item)) - continue - - inner = getattr(item, "data", None) + stack = [feed] + while stack: + node = stack.pop() + + for attr in ("subfeed_id", "merger_id"): + value = getattr(node, attr, None) + if isinstance(value, str) and value: + keys.add(value) + + for child in ( + getattr(node, "data", None), + getattr(node, "positional", None), + getattr(node, "default", None), + ): + if isinstance(child, BaseFeedConfigModel): + stack.append(child) + + for wrapper in (getattr(node, "item_from", None), getattr(node, "item_to", None)): + inner = getattr(wrapper, "data", None) if isinstance(inner, BaseFeedConfigModel): - keys.update(self._collect_descendant_cursor_keys(inner)) + stack.append(inner) + + items = getattr(node, "items", None) + if isinstance(items, list): + for item in items: + inner = item if isinstance(item, BaseFeedConfigModel) else getattr(item, "data", None) + if isinstance(inner, BaseFeedConfigModel): + stack.append(inner) return keys @@ -117,15 +114,11 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - elif ctx.redis_client is None and redis_client is not None: - ctx.redis_client = redis_client - - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() + else: + ctx.ensure_redis_client(redis_client) - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) def build_plan( self, diff --git a/smartfeed/mergers/percentage.py b/smartfeed/mergers/percentage.py index 1da51ea..1b034b4 100644 --- a/smartfeed/mergers/percentage.py +++ b/smartfeed/mergers/percentage.py @@ -31,7 +31,7 @@ class MergerPercentage(BaseFeedConfigModel): shuffle: bool = False @staticmethod - async def _merge_items_data(items_data: List[List]) -> List: + def _merge_items_data(items_data: List[List]) -> List: result: List = [] cursor: List[Dict] = [] @@ -68,13 +68,11 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + else: + ctx.ensure_redis_client(redis_client) - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() - - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) def build_plan( self, @@ -91,7 +89,7 @@ def build_plan( child_limit = limit * int(item.percentage) // 100 slots.append(SlotSpec(owner=owner, max_count=max(0, child_limit))) - async def _assemble( + def _assemble( output: List[Any], merged_next_page: FeedResultNextPage, owner_results: Dict[int, FeedResult], @@ -107,7 
+105,7 @@ async def _assemble( items_data.append(list(child_res.data)) has_next_page = has_next_page or bool(child_res.has_next_page) - data = await self._merge_items_data(items_data=items_data) + data = self._merge_items_data(items_data=items_data) if self.shuffle: shuffle(data) diff --git a/smartfeed/mergers/percentage_gradient.py b/smartfeed/mergers/percentage_gradient.py index 460e9f0..b1ff586 100644 --- a/smartfeed/mergers/percentage_gradient.py +++ b/smartfeed/mergers/percentage_gradient.py @@ -81,13 +81,11 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + else: + ctx.ensure_redis_client(redis_client) - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() - - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) def build_plan( self, @@ -117,7 +115,7 @@ def build_plan( SlotSpec(owner=owner_to, max_count=int(limits_and_percents["limit_to"])), ] - async def _assemble( + def _assemble( output: List[Any], merged_next_page: FeedResultNextPage, owner_results: Dict[int, FeedResult], diff --git a/smartfeed/mergers/positional.py b/smartfeed/mergers/positional.py index bc3a095..3ac9b32 100644 --- a/smartfeed/mergers/positional.py +++ b/smartfeed/mergers/positional.py @@ -50,13 +50,11 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + else: + ctx.ensure_redis_client(redis_client) - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() - - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) def build_plan( self, diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index b32fdf3..8adf814 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -6,7 +6,6 @@ import redis from redis.asyncio import Redis as AsyncRedis -from redis.asyncio import RedisCluster as AsyncRedisCluster from .. 
import jsonlib as json from ..execution.context import ExecutionContext @@ -47,83 +46,41 @@ def _dedup_data(self, data: List[Any]) -> List[Any]: async def _set_cache( self, - methods_dict: Dict[str, Callable], - user_id: Any, redis_client: Union[redis.Redis, AsyncRedis], cache_key: str, - ctx: Optional[ExecutionContext] = None, + ctx: ExecutionContext, **params: Any, ) -> List[Any]: - if ctx is not None and ctx.executor is not None: - result = await ctx.executor.run(self.data, ctx, self.session_size, FeedResultNextPage(data={}), **params) - else: - result = await self.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=self.session_size, - next_page=FeedResultNextPage(data={}), - redis_client=ctx.redis_client if ctx is not None else None, - ctx=ctx, - **params, - ) + if ctx.executor is None: + raise ValueError("Executor must be initialized for MergerViewSession") - data = result.data - if self.deduplicate: - data = self._dedup_data(data) - await _redis_call(redis_client, "set", cache_key, json.dumps(data), ex=self.session_live_time) - return data - - async def _set_cache_async( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - redis_client: AsyncRedis, - cache_key: str, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> List[Any]: - if ctx is not None and ctx.executor is not None: - result = await ctx.executor.run(self.data, ctx, self.session_size, FeedResultNextPage(data={}), **params) - else: - result = await self.data.get_data( - methods_dict=methods_dict, - user_id=user_id, - limit=self.session_size, - next_page=FeedResultNextPage(data={}), - redis_client=ctx.redis_client if ctx is not None else None, - ctx=ctx, - **params, - ) + result = await ctx.executor.run(self.data, ctx, self.session_size, FeedResultNextPage(data={}), **params) data = result.data if self.deduplicate: data = self._dedup_data(data) - await redis_client.set(cache_key, json.dumps(data)) - await redis_client.expire(cache_key, self.session_live_time) + await _redis_call(redis_client, "set", cache_key, json.dumps(data), ex=self.session_live_time) return data async def _get_cache( self, - methods_dict: Dict[str, Callable], - user_id: Any, limit: int, next_page: FeedResultNextPage, redis_client: Union[redis.Redis, AsyncRedis], - ctx: Optional[ExecutionContext] = None, + ctx: ExecutionContext, **params: Any, ) -> FeedResult: - if session_cache_key := params.get("custom_view_session_key", None): - cache_key = f"{self.merger_id}_{user_id}_{session_cache_key}" - else: - cache_key = f"{self.merger_id}_{user_id}" + cache_key = ( + f"{self.merger_id}_{ctx.user_id}_{session_cache_key}" + if (session_cache_key := params.get("custom_view_session_key")) + else f"{self.merger_id}_{ctx.user_id}" + ) logging.info("MergerViewSession cache request for %s", cache_key) cache_exists = bool(await _redis_call(redis_client, "exists", cache_key)) if not cache_exists or self.merger_id not in next_page.data: logging.info("Cache miss or new session - generating fresh data for %s", cache_key) session_data = await self._set_cache( - methods_dict=methods_dict, - user_id=user_id, redis_client=redis_client, cache_key=cache_key, ctx=ctx, @@ -137,57 +94,6 @@ async def _get_cache( "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key ) session_data = await self._set_cache( - methods_dict=methods_dict, - user_id=user_id, - redis_client=redis_client, - cache_key=cache_key, - ctx=ctx, - **params, - ) - else: - logging.info("Successfully read cached data for %s", 
cache_key) - session_data = json.loads(cached_data) - - page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 - return FeedResult( - data=session_data[(page - 1) * limit :][:limit], - next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), - has_next_page=bool(len(session_data) > limit * page), - ) - - async def _get_cache_async( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: AsyncRedis, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if session_cache_key := params.get("custom_view_session_key", None): - cache_key = f"{self.merger_id}_{user_id}_{session_cache_key}" - else: - cache_key = f"{self.merger_id}_{user_id}" - - if not await redis_client.exists(cache_key) or self.merger_id not in next_page.data: - session_data = await self._set_cache_async( - methods_dict=methods_dict, - user_id=user_id, - redis_client=redis_client, - cache_key=cache_key, - ctx=ctx, - **params, - ) - else: - cached_data = await redis_client.get(cache_key) - if cached_data is None: - logging.info( - "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key - ) - session_data = await self._set_cache_async( - methods_dict=methods_dict, - user_id=user_id, redis_client=redis_client, cache_key=cache_key, ctx=ctx, @@ -216,15 +122,11 @@ async def get_data( ) -> FeedResult: if ctx is None: ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - elif ctx.redis_client is None and redis_client is not None: - ctx.redis_client = redis_client - - if ctx.executor is None: - from ..execution.executor import Executor - - ctx.executor = Executor() + else: + ctx.ensure_redis_client(redis_client) - return await ctx.executor.run(self, ctx, limit, next_page, **params) + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) def build_plan( self, @@ -241,27 +143,13 @@ async def _run(executor: Any) -> FeedResult: if ctx.executor is None: ctx.executor = executor - redis_client = ctx.redis_client - if isinstance(redis_client, (AsyncRedis, AsyncRedisCluster)): - result = await self._get_cache_async( - methods_dict=ctx.methods_dict, - user_id=ctx.user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - ctx=ctx, - **params, - ) - else: - result = await self._get_cache( - methods_dict=ctx.methods_dict, - user_id=ctx.user_id, - limit=limit, - next_page=next_page, - redis_client=redis_client, - ctx=ctx, - **params, - ) + result = await self._get_cache( + limit=limit, + next_page=next_page, + redis_client=ctx.redis_client, + ctx=ctx, + **params, + ) if self.shuffle: shuffle(result.data) diff --git a/smartfeed/policies/dedup_utils.py b/smartfeed/policies/dedup_utils.py index e12aed1..2559e0f 100644 --- a/smartfeed/policies/dedup_utils.py +++ b/smartfeed/policies/dedup_utils.py @@ -2,9 +2,8 @@ import asyncio import base64 -import inspect import zlib -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Awaitable, Dict, List, Optional, Tuple, Union, cast import redis from redis.asyncio import Redis as AsyncRedis @@ -72,11 +71,8 @@ async def redis_zmscore( if not members: return [] - zmscore_fn = getattr(redis_client, "zmscore", None) - if zmscore_fn is not None: - res = zmscore_fn(key, members) - if inspect.iscoroutine(res): - res = await res + if getattr(redis_client, "zmscore", None) is 
not None: + res = await _redis_call(redis_client, "zmscore", key, members) return [None if v is None else float(v) for v in list(res)] if not _is_async_redis_client(redis_client): @@ -93,9 +89,7 @@ def _sync_pipeline_execute() -> Any: pipe = redis_client.pipeline() for m in members: pipe.zscore(key, m) - res = pipe.execute() - if inspect.iscoroutine(res): - res = await res + res = await cast(Awaitable[Any], pipe.execute()) return [None if v is None else float(v) for v in list(res)] diff --git a/smartfeed/policies/seen_store.py b/smartfeed/policies/seen_store.py index a867346..d0a9258 100644 --- a/smartfeed/policies/seen_store.py +++ b/smartfeed/policies/seen_store.py @@ -1,8 +1,7 @@ from __future__ import annotations -import inspect from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Protocol, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Protocol, Tuple, Union import redis from redis.asyncio import Redis as AsyncRedis @@ -123,11 +122,7 @@ async def prefetch(self, keys: List[str]) -> None: if not unique: return - scores_result = redis_zmscore(self.redis_client, self.redis_key, unique) - if inspect.iscoroutine(scores_result): - scores = await cast(Any, scores_result) - else: - scores = scores_result + scores = await redis_zmscore(self.redis_client, self.redis_key, unique) for k, s in zip(unique, scores): self.redis_seen_cache[k] = None if s is None else int(s) diff --git a/smartfeed/pydantic_compat.py b/smartfeed/pydantic_compat.py new file mode 100644 index 0000000..185e004 --- /dev/null +++ b/smartfeed/pydantic_compat.py @@ -0,0 +1,16 @@ +from __future__ import annotations + +from typing import Any, Mapping, Type, TypeVar + +T = TypeVar("T") + + +def parse_model(model_cls: Type[T], obj: Mapping[str, Any]) -> T: + """Parse a mapping into a Pydantic model. + + Uses Pydantic v2 `model_validate` when available, otherwise falls back to v1 `parse_obj`. 
+ """ + + if hasattr(model_cls, "model_validate"): + return model_cls.model_validate(obj) # type: ignore[attr-defined] + return model_cls.parse_obj(obj) # type: ignore[attr-defined] diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index ca83885..ab9838f 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -1,14 +1,13 @@ import asyncio -import inspect import pytest +from smartfeed.feed_models import _redis_call from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside, MergerDeduplication from tests.fixtures import dedup_helpers as dh from tests.fixtures.redis import redis_client # noqa: F401 from tests.utils import parse_model - PROFILES_B_1_TO_8 = { "p0": [{"id": 1, "src": "B"}, {"id": 3, "src": "B"}, {"id": 5, "src": "B"}, {"id": 7, "src": "B"}], "p1": [{"id": 2, "src": "B"}, {"id": 4, "src": "B"}, {"id": 6, "src": "B"}, {"id": 8, "src": "B"}], @@ -71,7 +70,9 @@ async def test_dedup_positional_slot_ownership_cursor_backend() -> None: dh._assert_two_pages_no_dupes(res_1, res_2) dh._assert_sources_at_positions(res_2.data, [1, 3, 5], "pos") - dh._assert_cursor_monotonic_if_present(res_1, res_2, keys=["sf_pos", "sf_default", "positional_mix", "dedup_wrapper"]) + dh._assert_cursor_monotonic_if_present( + res_1, res_2, keys=["sf_pos", "sf_default", "positional_mix", "dedup_wrapper"] + ) @pytest.mark.asyncio @@ -316,6 +317,7 @@ async def test_dedup_page_zero_resets_seen_and_descendant_cursors() -> None: assert res_2.next_page.data["sf_stream"].after == 5 assert res_2.next_page.data["sf_stream"].page == 2 + @pytest.mark.asyncio async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf_cursor_exactly() -> None: """Append: across pages there is no overlap; refill advances cursors correctly. @@ -543,9 +545,7 @@ async def test_dedup_overfetch_rewinds_offset_cursor_when_first_batch_all_duplic ], ) @pytest.mark.asyncio -async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_refill( - items_a, items_b, min_b_id -) -> None: +async def test_dedup_distribute_cursor_backend_across_pages_preserves_source_refill(items_a, items_b, min_b_id) -> None: """Distribute: duplicates skipped per-leaf and page slices don't overlap.""" config, methods_dict, _, _ = dh._build_two_subfeed_dedup_merger( @@ -657,9 +657,7 @@ async def test_dedup_redis_backend_cross_page( # Ensure state is persisted in Redis. key = f"dedup:{merger_id}:u:{custom_deduplication_key}" - members = redis_client.zrange(key, 0, -1) - if inspect.iscoroutine(members): - members = await members + members = await _redis_call(redis_client, "zrange", key, 0, -1) assert len(members) >= len(set(dh._ids(res_1.data) + dh._ids(res_2.data))) @@ -746,4 +744,3 @@ async def test_dedup_in_page_deletion_priority_keeps_high_priority_even_if_confi # With 50/50 limits, the high-priority branch should supply ids 1..5, while the low-priority # branch will be advanced to avoid duplicates. 
_assert_winning_src_for_ids(res.data, range(1, 6), "high") - diff --git a/tests/test_merger_view_session.py b/tests/test_merger_view_session.py index bd02bfe..f62a096 100644 --- a/tests/test_merger_view_session.py +++ b/tests/test_merger_view_session.py @@ -1,8 +1,8 @@ -import inspect import json import pytest +from smartfeed.feed_models import _redis_call from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerViewSession from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_VIEW_SESSION_CONFIG, MERGER_VIEW_SESSION_DUPS_CONFIG @@ -10,6 +10,10 @@ from tests.utils import parse_model +async def _get_cache_json(redis_client, key: str): + return json.loads(await _redis_call(redis_client, "get", key)) + + @pytest.mark.asyncio async def test_merger_view_session_no_redis() -> None: """ @@ -41,12 +45,7 @@ async def test_merger_view_session(redis_client) -> None: user_id="x", redis_client=redis_client, ) - merger_vs_cache = redis_client.get(name="merger_view_session_example_x") - # Для использования синхронной и асинхронной фикстуры в одном тесте проверяем метод get - if inspect.iscoroutine(merger_vs_cache): - merger_vs_cache = json.loads(await merger_vs_cache) - else: - merger_vs_cache = json.loads(merger_vs_cache) + merger_vs_cache = await _get_cache_json(redis_client, "merger_view_session_example_x") assert merger_vs_res.data == ["x_1", "x_2", "x_3", "x_4", "x_5", "x_6", "x_7", "x_8", "x_9", "x_10"] assert len(merger_vs_cache) == merger_vs.session_size @@ -70,12 +69,7 @@ async def test_merger_view_session_custom_key(redis_client) -> None: redis_client=redis_client, custom_view_session_key="foo", ) - merger_vs_cache = redis_client.get(name="merger_view_session_example_x_foo") - # Для использования синхронной и асинхронной фикстуры в одном тесте проверяем метод get - if inspect.iscoroutine(merger_vs_cache): - merger_vs_cache = json.loads(await merger_vs_cache) - else: - merger_vs_cache = json.loads(merger_vs_cache) + merger_vs_cache = await _get_cache_json(redis_client, "merger_view_session_example_x_foo") assert merger_vs_res.data == ["x_1", "x_2", "x_3", "x_4", "x_5", "x_6", "x_7", "x_8", "x_9", "x_10"] assert len(merger_vs_cache) == merger_vs.session_size @@ -99,12 +93,7 @@ async def test_merger_view_session_next_page(redis_client) -> None: user_id="x", redis_client=redis_client, ) - merger_vs_cache = redis_client.get(name="merger_view_session_example_x") - # Для использования синхронной и асинхронной фикстуры в одном тесте проверяем метод get - if inspect.iscoroutine(merger_vs_cache): - merger_vs_cache = json.loads(await merger_vs_cache) - else: - merger_vs_cache = json.loads(merger_vs_cache) + merger_vs_cache = await _get_cache_json(redis_client, "merger_view_session_example_x") assert merger_vs_res.data == ["x_11", "x_12", "x_13", "x_14", "x_15", "x_16", "x_17", "x_18", "x_19", "x_20"] assert len(merger_vs_cache) == merger_vs.session_size @@ -122,12 +111,7 @@ async def test_merger_view_session_deduplication(redis_client) -> None: user_id="x", redis_client=redis_client, ) - merger_vs_cache = redis_client.get(name="merger_view_session_example_x") - # Для использования синхронной и асинхронной фикстуры в одном тесте проверяем метод get - if inspect.iscoroutine(merger_vs_cache): - merger_vs_cache = json.loads(await merger_vs_cache) - else: - merger_vs_cache = json.loads(merger_vs_cache) + merger_vs_cache = await _get_cache_json(redis_client, "merger_view_session_example_x") assert merger_vs_res.data == [i for i in range(1, 11)] assert 
len(merger_vs_cache) == merger_vs.session_size diff --git a/tests/utils.py b/tests/utils.py index 331f1e7..af29848 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,16 +1,5 @@ from __future__ import annotations -from typing import Any, Dict, Type, TypeVar +from smartfeed.pydantic_compat import parse_model -T = TypeVar("T") - - -def parse_model(model_cls: Type[T], obj: Dict[str, Any]) -> T: - """Parse a dict into a Pydantic model. - - Uses Pydantic v2 `model_validate` when available, otherwise falls back to v1 `parse_obj`. - """ - - if hasattr(model_cls, "model_validate"): - return model_cls.model_validate(obj) # type: ignore[attr-defined] - return model_cls.parse_obj(obj) # type: ignore[attr-defined] +__all__ = ["parse_model"] From 8032ce3982bfa3c091e63be20613266f15f1beda Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sat, 7 Feb 2026 15:23:01 +0000 Subject: [PATCH 18/33] If subfeed is sync - throw it into thread. --- smartfeed/feed_models.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/smartfeed/feed_models.py b/smartfeed/feed_models.py index 50abcbe..6d29d67 100644 --- a/smartfeed/feed_models.py +++ b/smartfeed/feed_models.py @@ -138,14 +138,27 @@ async def get_data( if arg in params: method_params[arg] = params[arg] + method = method_spec.method + is_async = inspect.iscoroutinefunction(method) or inspect.iscoroutinefunction(getattr(method, "__call__", None)) + try: - method_result = await method_spec.method( - user_id=user_id, - limit=limit, - next_page=subfeed_next_page, - **method_params, - **self.subfeed_params, - ) + if is_async: + method_result = await method( + user_id=user_id, + limit=limit, + next_page=subfeed_next_page, + **method_params, + **self.subfeed_params, + ) + else: + method_result = await asyncio.to_thread( + method, + user_id=user_id, + limit=limit, + next_page=subfeed_next_page, + **method_params, + **self.subfeed_params, + ) except Exception: if self.raise_error: raise From efece1c94d4a537a9b64e93f26719665e026e368 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 00:54:33 +0000 Subject: [PATCH 19/33] Dedup runtime separated. --- smartfeed/execution/dedup_runtime.py | 354 ++++++++++++++++++++++++++ smartfeed/execution/executor.py | 364 ++------------------------- 2 files changed, 374 insertions(+), 344 deletions(-) create mode 100644 smartfeed/execution/dedup_runtime.py diff --git a/smartfeed/execution/dedup_runtime.py b/smartfeed/execution/dedup_runtime.py new file mode 100644 index 0000000..15773a1 --- /dev/null +++ b/smartfeed/execution/dedup_runtime.py @@ -0,0 +1,354 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage +from .context import ExecutionContext +from .cursors import CursorMap +from .plans import SlotsPlan + +if TYPE_CHECKING: + from .executor import Executor + + +class DedupRuntime: + """Dedup/refill orchestration. + + This owns the control flow (refill loops, slot deficit refills, etc.) + while `DeduplicationPolicy` stays focused on acceptance/arbitration decisions. 
+ """ + + def __init__(self, executor: "Executor") -> None: + self._executor = executor + + def _get_refill_settings(self, ctx: ExecutionContext) -> Any: + return getattr(ctx, "refill_settings", None) or getattr(ctx, "dedup_settings", None) + + async def run_node_with_dedup_refill( + self, + *, + node: BaseFeedConfigModel, + ctx: ExecutionContext, + limit: int, + next_page: FeedResultNextPage, + params: Dict[str, Any], + initial_result: FeedResult, + ) -> FeedResult: + dedup = getattr(ctx, "dedup", None) + if dedup is None: + return initial_result + + settings = self._get_refill_settings(ctx) + overfetch_factor = max(1, int(getattr(settings, "overfetch_factor", 1))) + max_refill_loops = max(1, int(getattr(settings, "max_refill_loops", 20))) + priority = int(getattr(node, "dedup_priority", 0)) + + collected: List[Any] = [] + remaining = int(limit) + loops = 0 + + current_result = initial_result + current_next_page = current_result.next_page + current_request_limit = max(1, remaining) + has_next_page = bool(current_result.has_next_page) + base_next_page = next_page + + # NOTE: Refill loops are inherently sequential for a single node because + # each subsequent request depends on the previous cursor. + while remaining > 0: + can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) + + accepted, inspected_count = await dedup.accept_batch( + items=list(current_result.data), + priority=priority, + limit=remaining, + ) + + if can_overfetch and current_request_limit > remaining: + CursorMap.rewind_overfetch( + node=node, + base_next_page=base_next_page, + result_next_page=current_next_page, + inspected_count=inspected_count, + batch_size=len(current_result.data), + ) + + if accepted: + collected.extend(accepted) + remaining = limit - len(collected) + + if remaining <= 0 or not has_next_page or loops >= max_refill_loops: + break + loops += 1 + + base_next_page = current_next_page + next_request_limit = max(1, remaining) + can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) + if can_overfetch and overfetch_factor > 1: + next_request_limit = max(1, remaining * overfetch_factor) + + current_result, _plan = await self._executor._run_node_raw( + node, + ctx, + next_request_limit, + base_next_page, + params, + ) + current_next_page = current_result.next_page + current_request_limit = next_request_limit + has_next_page = bool(current_result.has_next_page) + + return FeedResult( + data=collected, + next_page=current_next_page, + has_next_page=has_next_page, + ) + + async def apply_slots_plan_dedup( + self, + *, + plan: SlotsPlan, + owners: List[Any], + owner_index: Dict[int, int], + owner_buffers: Dict[int, List[Any]], + owner_results: Dict[int, FeedResult], + dedup_policy: Any, + refill_settings: Any, + cursor: CursorMap, + ) -> Tuple[Dict[int, List[Any]], Dict[int, FeedResult]]: + owner_buffers = await dedup_policy.arbitrate_owner_buffers( + owners=owners, + owner_buffers=owner_buffers, + owner_rank=owner_index, + ) + + for owner in owners: + owner_id = id(owner) + if owner_id not in owner_results: + continue + old = owner_results[owner_id] + owner_results[owner_id] = FeedResult( + data=list(owner_buffers.get(owner_id, [])), + next_page=old.next_page, + has_next_page=old.has_next_page, + ) + + deficits = self._compute_slot_deficits(plan=plan, owner_buffers=owner_buffers) + if deficits: + await self._refill_deficits( + plan=plan, + deficits=deficits, + owners=owners, + owner_index=owner_index, + owner_buffers=owner_buffers, + 
owner_results=owner_results, + dedup_policy=dedup_policy, + refill_settings=refill_settings, + cursor=cursor, + ) + + return owner_buffers, owner_results + + def _compute_slot_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> Dict[int, int]: + total_max = sum(int(s.max_count) for s in plan.slots) + quota_schedule = total_max <= int(plan.limit) + + consumed: Dict[int, int] = {} + remaining = int(plan.limit) + deficit_slots: List[int] = [] + + for slot in plan.slots: + if remaining <= 0: + break + + owner_id = id(slot.owner) + want = min(int(slot.max_count), remaining) + if want <= 0: + continue + + have_total = len(owner_buffers.get(owner_id, [])) + already = int(consumed.get(owner_id, 0)) + available = max(0, have_total - already) + take = min(want, available) + if take < want: + deficit_slots.append(owner_id) + consumed[owner_id] = already + take + remaining -= take + + page_underfilled = remaining > 0 + + if quota_schedule: + return self._compute_quota_deficits(plan=plan, owner_buffers=owner_buffers) + if not page_underfilled: + return {} + return self._compute_fill_deficits(plan=plan, remaining=remaining, deficit_slots=deficit_slots) + + def _compute_quota_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> Dict[int, int]: + deficits: Dict[int, int] = {} + remaining = int(plan.limit) + consumed: Dict[int, int] = {} + for slot in plan.slots: + if remaining <= 0: + break + + owner_id = id(slot.owner) + want = min(int(slot.max_count), remaining) + if want <= 0: + continue + + have_total = len(owner_buffers.get(owner_id, [])) + already = int(consumed.get(owner_id, 0)) + available = max(0, have_total - already) + take = min(want, available) + missing = max(0, want - take) + if missing: + deficits[owner_id] = deficits.get(owner_id, 0) + missing + consumed[owner_id] = already + take + remaining -= take + + return deficits + + def _compute_fill_deficits(self, *, plan: SlotsPlan, remaining: int, deficit_slots: List[int]) -> Dict[int, int]: + to_fill = int(remaining) + if to_fill <= 0: + return {} + + owner_id = deficit_slots[-1] if deficit_slots else (id(plan.slots[-1].owner) if plan.slots else None) + return {owner_id: to_fill} if owner_id is not None else {} + + async def _refill_deficits( + self, + *, + plan: SlotsPlan, + deficits: Dict[int, int], + owners: List[Any], + owner_index: Dict[int, int], + owner_buffers: Dict[int, List[Any]], + owner_results: Dict[int, FeedResult], + dedup_policy: Any, + refill_settings: Any, + cursor: CursorMap, + ) -> None: + overfetch_factor = max(1, int(getattr(refill_settings, "overfetch_factor", 1))) + max_refill_loops = max(1, int(getattr(refill_settings, "max_refill_loops", 20))) + + deficit_owners: List[Any] = [o for o in owners if id(o) in deficits] + deficit_owners = sorted( + deficit_owners, + key=lambda o: ( + int(getattr(o, "dedup_priority", 0)), + owner_index.get(id(o), 0), + ), + ) + + state: Dict[int, Dict[str, Any]] = {} + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + missing_total = int(deficits.get(refill_owner_id, 0)) + if missing_total <= 0: + continue + + base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page + state[refill_owner_id] = { + "owner": refill_owner, + "missing_total": missing_total, + "remaining": int(missing_total), + "accepted": [], + "loops": 0, + "current_next_page": base_np, + "has_next_page": True, + "last_result": None, + "last_request_limit": 0, + "last_can_overfetch": False, + 
"last_base_next_page": base_np, + } + + if not state: + return + + while True: + wave_ops: List[Tuple[Any, int, FeedResultNextPage, int, bool]] = [] + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + if owner_state["remaining"] <= 0: + continue + if not owner_state["has_next_page"]: + continue + if owner_state["loops"] >= max_refill_loops: + continue + + base_np = owner_state["current_next_page"] + remaining_before = max(1, int(owner_state["remaining"])) + request_limit = remaining_before + can_overfetch = CursorMap.can_overfetch(node=refill_owner, base_next_page=base_np) + if can_overfetch and overfetch_factor > 1: + request_limit = max(1, remaining_before * overfetch_factor) + + wave_ops.append((refill_owner, refill_owner_id, base_np, request_limit, can_overfetch)) + + if not wave_ops: + break + + results = await self._executor.gather( + *[ + self._executor._run_owner( + plan=plan, + owner=owner, + demand=request_limit, + base_next_page=base_np, + dedup_active=True, + ) + for owner, _owner_id, base_np, request_limit, _can_overfetch in wave_ops + ] + ) + + for (owner, owner_id, base_np, request_limit, can_overfetch), result in zip(wave_ops, results): + owner_state = state[owner_id] + remaining_before = int(owner_state["remaining"]) + + owner_state["current_next_page"] = result.next_page + owner_state["has_next_page"] = bool(result.has_next_page) + cursor.merge_delta(base_next_page=plan.next_page, owner_next_page=result.next_page) + + refill_prio = int(getattr(owner, "dedup_priority", 0)) + wave_accepted, inspected_count = await dedup_policy.accept_batch( + items=list(result.data), + priority=refill_prio, + limit=max(0, remaining_before), + ) + + if can_overfetch and request_limit > remaining_before: + CursorMap.rewind_overfetch( + node=owner, + base_next_page=base_np, + result_next_page=result.next_page, + inspected_count=inspected_count, + batch_size=len(result.data), + ) + + if wave_accepted: + owner_state["accepted"].extend(wave_accepted) + owner_state["remaining"] = int(owner_state["missing_total"]) - len(owner_state["accepted"]) + + if owner_state["remaining"] > 0 and owner_state["has_next_page"]: + owner_state["loops"] += 1 + + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + + accepted = owner_state["accepted"] + if accepted: + owner_buffers.setdefault(refill_owner_id, []) + owner_buffers[refill_owner_id].extend(accepted) + + owner_results[refill_owner_id] = FeedResult( + data=list(owner_buffers.get(refill_owner_id, [])), + next_page=owner_state["current_next_page"], + has_next_page=owner_state["has_next_page"], + ) diff --git a/smartfeed/execution/executor.py b/smartfeed/execution/executor.py index cd3ba4c..e4fb3ca 100644 --- a/smartfeed/execution/executor.py +++ b/smartfeed/execution/executor.py @@ -7,6 +7,7 @@ from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, _pydantic_deep_copy from .context import ExecutionContext from .cursors import CursorMap +from .dedup_runtime import DedupRuntime from .plans import CallablePlan, Plan, SlotSpec, SlotsPlan @@ -33,7 +34,21 @@ async def run( if isinstance(plan, SlotsPlan): return result - return await self._run_node_with_dedup_refill(node, ctx, limit, next_page, params, result) + return await self._dedup_runtime().run_node_with_dedup_refill( + node=node, + ctx=ctx, + limit=limit, + next_page=next_page, + 
params=params, + initial_result=result, + ) + + def _dedup_runtime(self) -> DedupRuntime: + runtime = getattr(self, "_dedup_runtime_instance", None) + if runtime is None: + runtime = DedupRuntime(self) + setattr(self, "_dedup_runtime_instance", runtime) + return runtime async def execute_plan(self, plan: Plan) -> FeedResult: """Interpret and execute a declarative plan. @@ -107,31 +122,17 @@ async def _execute_slots_plan(self, plan: SlotsPlan) -> FeedResult: ) if dedup_policy is not None: - owner_buffers, owner_results = await self._arbitrate_owner_buffers( + owner_buffers, owner_results = await self._dedup_runtime().apply_slots_plan_dedup( + plan=plan, owners=owners, owner_index=owner_index, owner_buffers=owner_buffers, owner_results=owner_results, dedup_policy=dedup_policy, + refill_settings=refill_settings, + cursor=cursor, ) - deficits = self._compute_slot_deficits( - plan=plan, - owner_buffers=owner_buffers, - ) - if deficits: - await self._refill_deficits( - plan=plan, - deficits=deficits, - owners=owners, - owner_index=owner_index, - owner_buffers=owner_buffers, - owner_results=owner_results, - dedup_policy=dedup_policy, - refill_settings=refill_settings, - cursor=cursor, - ) - output = self._consume_slots(plan=plan, owner_buffers=owner_buffers) assembled = await self._maybe_await(plan.assemble(output, cursor.next_page, owner_results)) return assembled @@ -225,254 +226,6 @@ async def _run_plan_owners( return owner_buffers, owner_results - async def _arbitrate_owner_buffers( - self, - *, - owners: List[Any], - owner_index: Dict[int, int], - owner_buffers: Dict[int, List[Any]], - owner_results: Dict[int, FeedResult], - dedup_policy: Any, - ) -> tuple[Dict[int, List[Any]], Dict[int, FeedResult]]: - owner_buffers = await dedup_policy.arbitrate_owner_buffers( - owners=owners, - owner_buffers=owner_buffers, - owner_rank=owner_index, - ) - - for owner in owners: - owner_id = id(owner) - if owner_id not in owner_results: - continue - old = owner_results[owner_id] - owner_results[owner_id] = FeedResult( - data=list(owner_buffers.get(owner_id, [])), - next_page=old.next_page, - has_next_page=old.has_next_page, - ) - - return owner_buffers, owner_results - - def _compute_slot_deficits( - self, - *, - plan: SlotsPlan, - owner_buffers: Dict[int, List[Any]], - ) -> Dict[int, int]: - total_max = sum(int(s.max_count) for s in plan.slots) - quota_schedule = total_max <= int(plan.limit) - - consumed: Dict[int, int] = {} - remaining = int(plan.limit) - deficit_slots: List[int] = [] - - for slot in plan.slots: - if remaining <= 0: - break - - owner_id = id(slot.owner) - want = min(int(slot.max_count), remaining) - if want <= 0: - continue - - have_total = len(owner_buffers.get(owner_id, [])) - already = int(consumed.get(owner_id, 0)) - available = max(0, have_total - already) - take = min(want, available) - if take < want: - deficit_slots.append(owner_id) - consumed[owner_id] = already + take - remaining -= take - - page_underfilled = remaining > 0 - - if quota_schedule: - return self._compute_quota_deficits(plan=plan, owner_buffers=owner_buffers) - if not page_underfilled: - return {} - return self._compute_fill_deficits(plan=plan, remaining=remaining, deficit_slots=deficit_slots) - - def _compute_quota_deficits( - self, - *, - plan: SlotsPlan, - owner_buffers: Dict[int, List[Any]], - ) -> Dict[int, int]: - deficits: Dict[int, int] = {} - remaining = int(plan.limit) - consumed: Dict[int, int] = {} - for slot in plan.slots: - if remaining <= 0: - break - - owner_id = id(slot.owner) - want = 
min(int(slot.max_count), remaining) - if want <= 0: - continue - - have_total = len(owner_buffers.get(owner_id, [])) - already = int(consumed.get(owner_id, 0)) - available = max(0, have_total - already) - take = min(want, available) - missing = max(0, want - take) - if missing: - deficits[owner_id] = deficits.get(owner_id, 0) + missing - consumed[owner_id] = already + take - remaining -= take - - return deficits - - def _compute_fill_deficits( - self, - *, - plan: SlotsPlan, - remaining: int, - deficit_slots: List[int], - ) -> Dict[int, int]: - to_fill = int(remaining) - if to_fill <= 0: - return {} - - owner_id = deficit_slots[-1] if deficit_slots else (id(plan.slots[-1].owner) if plan.slots else None) - return {owner_id: to_fill} if owner_id is not None else {} - - async def _refill_deficits( - self, - *, - plan: SlotsPlan, - deficits: Dict[int, int], - owners: List[Any], - owner_index: Dict[int, int], - owner_buffers: Dict[int, List[Any]], - owner_results: Dict[int, FeedResult], - dedup_policy: Any, - refill_settings: Any, - cursor: CursorMap, - ) -> None: - overfetch_factor = max(1, int(getattr(refill_settings, "overfetch_factor", 1))) - max_refill_loops = max(1, int(getattr(refill_settings, "max_refill_loops", 20))) - - deficit_owners: List[Any] = [o for o in owners if id(o) in deficits] - deficit_owners = sorted( - deficit_owners, - key=lambda o: ( - int(getattr(o, "dedup_priority", 0)), - owner_index.get(id(o), 0), - ), - ) - - state: Dict[int, Dict[str, Any]] = {} - for refill_owner in deficit_owners: - refill_owner_id = id(refill_owner) - missing_total = int(deficits.get(refill_owner_id, 0)) - if missing_total <= 0: - continue - - base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page - state[refill_owner_id] = { - "owner": refill_owner, - "missing_total": missing_total, - "remaining": int(missing_total), - "accepted": [], - "loops": 0, - "current_next_page": base_np, - "has_next_page": True, - "last_result": None, - "last_request_limit": 0, - "last_can_overfetch": False, - "last_base_next_page": base_np, - } - - if not state: - return - - while True: - wave_ops: List[Tuple[Any, int, FeedResultNextPage, int, bool]] = [] - for refill_owner in deficit_owners: - refill_owner_id = id(refill_owner) - owner_state = state.get(refill_owner_id) - if owner_state is None: - continue - if owner_state["remaining"] <= 0: - continue - if not owner_state["has_next_page"]: - continue - if owner_state["loops"] >= max_refill_loops: - continue - - base_np = owner_state["current_next_page"] - remaining_before = max(1, int(owner_state["remaining"])) - request_limit = remaining_before - can_overfetch = CursorMap.can_overfetch(node=refill_owner, base_next_page=base_np) - if can_overfetch and overfetch_factor > 1: - request_limit = max(1, remaining_before * overfetch_factor) - - wave_ops.append((refill_owner, refill_owner_id, base_np, request_limit, can_overfetch)) - - if not wave_ops: - break - - results = await self.gather( - *[ - self._run_owner( - plan=plan, - owner=owner, - demand=request_limit, - base_next_page=base_np, - dedup_active=True, - ) - for owner, _owner_id, base_np, request_limit, _can_overfetch in wave_ops - ] - ) - - for (owner, owner_id, base_np, request_limit, can_overfetch), result in zip(wave_ops, results): - owner_state = state[owner_id] - remaining_before = int(owner_state["remaining"]) - - owner_state["current_next_page"] = result.next_page - owner_state["has_next_page"] = bool(result.has_next_page) - 
cursor.merge_delta(base_next_page=plan.next_page, owner_next_page=result.next_page) - - refill_prio = int(getattr(owner, "dedup_priority", 0)) - wave_accepted, inspected_count = await dedup_policy.accept_batch( - items=list(result.data), - priority=refill_prio, - limit=max(0, remaining_before), - ) - - if can_overfetch and request_limit > remaining_before: - CursorMap.rewind_overfetch( - node=owner, - base_next_page=base_np, - result_next_page=result.next_page, - inspected_count=inspected_count, - batch_size=len(result.data), - ) - - if wave_accepted: - owner_state["accepted"].extend(wave_accepted) - owner_state["remaining"] = int(owner_state["missing_total"]) - len(owner_state["accepted"]) - - if owner_state["remaining"] > 0 and owner_state["has_next_page"]: - owner_state["loops"] += 1 - - for refill_owner in deficit_owners: - refill_owner_id = id(refill_owner) - owner_state = state.get(refill_owner_id) - if owner_state is None: - continue - - accepted = owner_state["accepted"] - if accepted: - owner_buffers.setdefault(refill_owner_id, []) - owner_buffers[refill_owner_id].extend(accepted) - - owner_results[refill_owner_id] = FeedResult( - data=list(owner_buffers.get(refill_owner_id, [])), - next_page=owner_state["current_next_page"], - has_next_page=owner_state["has_next_page"], - ) - def _consume_slots(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> List[Any]: output: List[Any] = [] for slot in plan.slots: @@ -494,83 +247,6 @@ def _consume_slots(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]] return output - async def _run_node_with_dedup_refill( - self, - node: BaseFeedConfigModel, - ctx: ExecutionContext, - limit: int, - next_page: FeedResultNextPage, - params: Dict[str, Any], - initial_result: FeedResult, - ) -> FeedResult: - dedup = getattr(ctx, "dedup", None) - if dedup is None: - return initial_result - - settings = getattr(ctx, "refill_settings", None) or getattr(ctx, "dedup_settings", None) - overfetch_factor = max(1, int(getattr(settings, "overfetch_factor", 1))) - max_refill_loops = max(1, int(getattr(settings, "max_refill_loops", 20))) - priority = int(getattr(node, "dedup_priority", 0)) - - collected: List[Any] = [] - remaining = int(limit) - loops = 0 - - current_result = initial_result - current_next_page = current_result.next_page - current_request_limit = max(1, remaining) - has_next_page = bool(current_result.has_next_page) - base_next_page = next_page - - while remaining > 0: - can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) - - accepted, inspected_count = await dedup.accept_batch( - items=list(current_result.data), - priority=priority, - limit=remaining, - ) - - if can_overfetch and current_request_limit > remaining: - CursorMap.rewind_overfetch( - node=node, - base_next_page=base_next_page, - result_next_page=current_next_page, - inspected_count=inspected_count, - batch_size=len(current_result.data), - ) - - if accepted: - collected.extend(accepted) - remaining = limit - len(collected) - - if remaining <= 0 or not has_next_page or loops >= max_refill_loops: - break - loops += 1 - - base_next_page = current_next_page - next_request_limit = max(1, remaining) - can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) - if can_overfetch and overfetch_factor > 1: - next_request_limit = max(1, remaining * overfetch_factor) - - current_result, _plan = await self._run_node_raw( - node, - ctx, - next_request_limit, - base_next_page, - params, - ) - current_next_page = 
current_result.next_page - current_request_limit = next_request_limit - has_next_page = bool(current_result.has_next_page) - - return FeedResult( - data=collected, - next_page=current_next_page, - has_next_page=has_next_page, - ) - __all__ = [ "Executor", From fd2201b487004f103fc56c0b6363c0cb1dd8613b Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 01:20:19 +0000 Subject: [PATCH 20/33] More test coverage. --- smartfeed/policies/dedup_utils.py | 43 +++++++++++------ tests/test_dedup_policy_unit.py | 74 +++++++++++++++++++++++++++++ tests/test_dedup_utils.py | 78 +++++++++++++++++++++++++++++++ tests/test_seen_store_unit.py | 52 +++++++++++++++++++++ tests/test_view_session_unit.py | 54 +++++++++++++++++++++ 5 files changed, 287 insertions(+), 14 deletions(-) create mode 100644 tests/test_dedup_policy_unit.py create mode 100644 tests/test_dedup_utils.py create mode 100644 tests/test_seen_store_unit.py create mode 100644 tests/test_view_session_unit.py diff --git a/smartfeed/policies/dedup_utils.py b/smartfeed/policies/dedup_utils.py index 2559e0f..4483619 100644 --- a/smartfeed/policies/dedup_utils.py +++ b/smartfeed/policies/dedup_utils.py @@ -12,32 +12,47 @@ from ..feed_models import _is_async_redis_client, _redis_call +def _seen_entries_to_map(entries: Any) -> Dict[str, int]: + """Coerce a legacy cursor "seen" list into a {key: priority} map. + + Supports: + - ["k1", "k2", ...] (implies priority 0) + - [["k1", 10], ["k2", 3], ...] (explicit priorities) + """ + + seen_map: Dict[str, int] = {} + if not isinstance(entries, list): + return seen_map + + for entry_item in entries: + if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: + seen_map[str(entry_item[0])] = int(entry_item[1]) + else: + seen_map[str(entry_item)] = 0 + return seen_map + + def decode_seen_from_cursor(after: Any) -> Dict[str, int]: if after is None: return {} if isinstance(after, dict) and "z" in after: + if after.get("v") != 2: + return {} + if after.get("c") != "zlib+base64": + return {} payload = base64.urlsafe_b64decode(str(after["z"]).encode()) raw = zlib.decompress(payload).decode() decoded = json.loads(raw) - if isinstance(decoded, dict): - return {str(k): int(v) for k, v in decoded.items()} if isinstance(decoded, list): - seen_map: Dict[str, int] = {} - for entry_item in decoded: - if isinstance(entry_item, (list, tuple)) and len(entry_item) == 2: - seen_map[str(entry_item[0])] = int(entry_item[1]) - else: - seen_map[str(entry_item)] = 0 - return seen_map + return _seen_entries_to_map(decoded) return {} if isinstance(after, dict) and "seen" in after: - return {str(k): 0 for k in list(after["seen"])} - if isinstance(after, list): - return {str(k): 0 for k in list(after)} - if isinstance(after, dict): - return {str(k): int(v) for k, v in after.items() if k not in {"v", "c", "n"}} + if after.get("v") != 2: + return {} + return _seen_entries_to_map(list(after["seen"])) + return {} diff --git a/tests/test_dedup_policy_unit.py b/tests/test_dedup_policy_unit.py new file mode 100644 index 0000000..b846f6b --- /dev/null +++ b/tests/test_dedup_policy_unit.py @@ -0,0 +1,74 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +import pytest + +from smartfeed.policies.dedup import DeduplicationPolicy +from smartfeed.policies.seen_store import CursorSeenStore + + +def _policy(*, dedup_key: Optional[str], missing_key_policy: str, after: Any = None) -> DeduplicationPolicy: + store = CursorSeenStore.from_after(after=after, cursor_compress=False, 
cursor_max_keys=None) + return DeduplicationPolicy( + dedup_key=dedup_key, + missing_key_policy=missing_key_policy, # type: ignore[arg-type] + store=store, + seen_request_set=set(), + ) + + +@pytest.mark.asyncio +async def test_accept_batch_dedups_within_request_and_respects_existing_priority() -> None: + # Existing seen key with high priority should block lower/equal priority + existing_after = {"v": 2, "seen": [["1", 10]]} + policy = _policy(dedup_key="id", missing_key_policy="keep", after=existing_after) + + items = [{"id": 1}, {"id": 1}, {"id": 2}] + accepted, inspected = await policy.accept_batch(items=items, priority=5) + + assert inspected == 3 + assert accepted == [{"id": 2}] + + +@pytest.mark.asyncio +async def test_accept_batch_missing_key_policies() -> None: + items = [{"id": 1}, {"nope": 2}] + + policy_drop = _policy(dedup_key="id", missing_key_policy="drop") + accepted_drop, _ = await policy_drop.accept_batch(items=items, priority=0) + assert accepted_drop == [{"id": 1}] + + policy_error = _policy(dedup_key="id", missing_key_policy="error") + with pytest.raises(AssertionError): + await policy_error.accept_batch(items=items, priority=0) + + +@dataclass +class _Owner: + dedup_priority: int + + +@pytest.mark.asyncio +async def test_arbitrate_owner_buffers_prefers_higher_priority_owner() -> None: + policy = _policy(dedup_key="id", missing_key_policy="keep") + + owner_low = _Owner(dedup_priority=1) + owner_high = _Owner(dedup_priority=2) + + owners = [owner_low, owner_high] + owner_rank: Dict[int, int] = {id(owner_low): 0, id(owner_high): 1} + + shared = {"id": "same"} + owner_buffers: Dict[int, List[Any]] = { + id(owner_low): [shared, {"id": "low_only"}], + id(owner_high): [shared, {"id": "high_only"}], + } + + per_owner = await policy.arbitrate_owner_buffers( + owners=owners, + owner_buffers=owner_buffers, + owner_rank=owner_rank, + ) + + assert per_owner[id(owner_high)] == [shared, {"id": "high_only"}] + assert per_owner[id(owner_low)] == [{"id": "low_only"}] diff --git a/tests/test_dedup_utils.py b/tests/test_dedup_utils.py new file mode 100644 index 0000000..9556af3 --- /dev/null +++ b/tests/test_dedup_utils.py @@ -0,0 +1,78 @@ +from typing import Any, List + +import pytest + +from smartfeed.feed_models import _redis_call +from smartfeed.policies.dedup_utils import ( + decode_seen_from_cursor, + encode_seen_for_cursor, + redis_zmscore, +) + +from tests.fixtures.redis import redis_client + + +class _RedisNoZmscore: + """Wrapper around a real sync redis client to force the pipeline fallback. + + This keeps the backend "real Redis" while exercising the no-zmscore branch. 
+ """ + + zmscore = None # type: ignore[assignment] + + def __init__(self, client: Any) -> None: + self._client = client + + def pipeline(self) -> Any: + return self._client.pipeline() + + +def test_encode_decode_seen_cursor_compressed_roundtrip_and_truncation() -> None: + seen_updates = [("a", 1), ("b", 2), ("c", 3)] + + encoded = encode_seen_for_cursor(seen_updates, cursor_compress=True, cursor_max_keys=None) + decoded = decode_seen_from_cursor(encoded) + assert decoded == {"a": 1, "b": 2, "c": 3} + + encoded_trunc = encode_seen_for_cursor(seen_updates, cursor_compress=True, cursor_max_keys=2) + decoded_trunc = decode_seen_from_cursor(encoded_trunc) + assert decoded_trunc == {"b": 2, "c": 3} + + +def test_decode_seen_cursor_v2_only() -> None: + assert decode_seen_from_cursor(None) == {} + + # Supported v2 uncompressed format + assert decode_seen_from_cursor({"v": 2, "seen": [["a", 9], ["b", 1]]}) == {"a": 9, "b": 1} + + # Legacy/unknown shapes are intentionally rejected + assert decode_seen_from_cursor(["x", "y"]) == {} + assert decode_seen_from_cursor({"a": 1}) == {} + assert decode_seen_from_cursor({"v": 1, "seen": [["a", 1]]}) == {} + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_redis_zmscore_native(redis_client) -> None: + key = "test_zmscore_native" + await _redis_call(redis_client, "delete", key) + await _redis_call(redis_client, "zadd", key, mapping={"a": 1.0, "b": 2.0}) + + res = await redis_zmscore(redis_client, key, ["a", "missing", "b"]) + assert res == [1.0, None, 2.0] + + await _redis_call(redis_client, "delete", key) + + +@pytest.mark.parametrize("redis_client", ["sync"], indirect=True) +@pytest.mark.asyncio +async def test_redis_zmscore_pipeline_fallback_for_sync_client_without_zmscore(redis_client) -> None: + key = "test_zmscore_fallback" + await _redis_call(redis_client, "delete", key) + await _redis_call(redis_client, "zadd", key, mapping={"a": 1.0, "b": 2.0}) + + wrapped = _RedisNoZmscore(redis_client) + res = await redis_zmscore(wrapped, key, ["a", "missing", "b"]) + assert res == [1.0, None, 2.0] + + await _redis_call(redis_client, "delete", key) diff --git a/tests/test_seen_store_unit.py b/tests/test_seen_store_unit.py new file mode 100644 index 0000000..50dab93 --- /dev/null +++ b/tests/test_seen_store_unit.py @@ -0,0 +1,52 @@ +from typing import Any + +import pytest + +from smartfeed.feed_models import _redis_call +from smartfeed.policies.dedup_utils import decode_seen_from_cursor +from smartfeed.policies.seen_store import CursorSeenStore, RedisSeenStore + +from tests.fixtures.redis import redis_client + + +@pytest.mark.asyncio +async def test_cursor_seen_store_set_max_and_commit_roundtrip() -> None: + store = CursorSeenStore.from_after(after=None, cursor_compress=True, cursor_max_keys=None) + store.set_max("a", 1) + store.set_max("a", 1) # no-op + store.set_max("a", 0) # no-op (lower) + store.set_max("b", 2) + + after = await store.commit() + decoded = decode_seen_from_cursor(after) + assert decoded == {"a": 1, "b": 2} + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_redis_seen_store_prefetch_set_max_commit_and_reset(redis_client) -> None: + key = "test_seen_store" + await _redis_call(redis_client, "delete", key) + # Pre-seed zset state + await _redis_call(redis_client, "zadd", key, mapping={"a": 5.0}) + + store = RedisSeenStore.create(redis_client=redis_client, redis_key=key, ttl_seconds=60) + + await store.prefetch(["a", 
"a", "b"]) # duplicates + assert store.get("a") == 5 + assert store.get("b") is None + + store.set_max("a", 3) # should not reduce existing + store.set_max("b", 2) + + await store.commit() + + # New state should be present in redis + scores = list(await _redis_call(redis_client, "zmscore", key, ["a", "b"])) + assert scores == [5.0, 2.0] + + await store.reset() + scores_after_reset = list(await _redis_call(redis_client, "zmscore", key, ["a", "b"])) + assert scores_after_reset == [None, None] + + await _redis_call(redis_client, "delete", key) diff --git a/tests/test_view_session_unit.py b/tests/test_view_session_unit.py new file mode 100644 index 0000000..a847eea --- /dev/null +++ b/tests/test_view_session_unit.py @@ -0,0 +1,54 @@ +from dataclasses import dataclass +from typing import Any + +import pytest + +from smartfeed.feed_models import _redis_call +from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerViewSession +from tests.fixtures.configs import METHODS_DICT +from tests.fixtures.mergers import MERGER_VIEW_SESSION_CONFIG +from tests.fixtures.redis import redis_client +from tests.utils import parse_model + + +@dataclass +class _ItemWithAttr: + id: str + + +def test_get_dedup_key_supports_dict_and_attr_and_raises_on_missing() -> None: + cfg = dict(MERGER_VIEW_SESSION_CONFIG) + cfg.update({"deduplicate": True, "dedup_key": "id"}) + merger = parse_model(MergerViewSession, cfg) + + assert merger._get_dedup_key_or_attr({"id": "x"}) == "x" + assert merger._get_dedup_key_or_attr(_ItemWithAttr(id="y")) == "y" + + with pytest.raises(AssertionError): + merger._get_dedup_key_or_attr({"nope": 1}) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_view_session_shuffle_applies_to_result(redis_client, monkeypatch) -> None: + import smartfeed.mergers.view_session as vs_mod + + cfg = dict(MERGER_VIEW_SESSION_CONFIG) + cfg.update({"shuffle": True}) + merger = parse_model(MergerViewSession, cfg) + cache_key = f"{merger.merger_id}_x" + await _redis_call(redis_client, "delete", cache_key) + + # Make shuffle deterministic: reverse in-place + monkeypatch.setattr(vs_mod, "shuffle", lambda data: data.reverse()) + + res = await merger.get_data( + methods_dict=METHODS_DICT, + limit=5, + next_page=FeedResultNextPage(data={}), + user_id="x", + redis_client=redis_client, + ) + + assert res.data == ["x_5", "x_4", "x_3", "x_2", "x_1"] + await _redis_call(redis_client, "delete", cache_key) From 65601b947cbe9f1ab5debb0e12b27f433f3a0919 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 01:26:41 +0000 Subject: [PATCH 21/33] Even more test coverage. 
--- tests/test_cursor_and_refill_edges.py | 92 ++++++++ tests/test_executor_slots_plan_invariants.py | 208 +++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 tests/test_cursor_and_refill_edges.py create mode 100644 tests/test_executor_slots_plan_invariants.py diff --git a/tests/test_cursor_and_refill_edges.py b/tests/test_cursor_and_refill_edges.py new file mode 100644 index 0000000..27b5b72 --- /dev/null +++ b/tests/test_cursor_and_refill_edges.py @@ -0,0 +1,92 @@ +import base64 +import zlib + +import pytest + +from smartfeed.execution.context import ExecutionContext, RefillExecutionSettings +from smartfeed.execution.executor import Executor +from smartfeed.feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage +from smartfeed.policies.dedup import DeduplicationPolicy +from smartfeed.policies.dedup_utils import decode_seen_from_cursor +from smartfeed.policies.seen_store import CursorSeenStore + + +class _DuplicateOnlyNode(BaseFeedConfigModel): + """A node that always returns the same duplicate item and never ends.""" + + type: str = "test_node" # satisfies pydantic + + def __init__(self, **data): + super().__init__(**data) + object.__setattr__(self, "calls", 0) + + async def get_data( # type: ignore[override] + self, + methods_dict, + user_id, + limit, + next_page, + redis_client=None, + ctx=None, + **params, + ) -> FeedResult: + object.__setattr__(self, "calls", int(getattr(self, "calls", 0)) + 1) + return FeedResult( + data=[{"id": "dup"} for _ in range(int(limit) or 1)], + next_page=FeedResultNextPage(data={}), + has_next_page=True, + ) + + +def _policy_with_seen_dup(*, max_refill_loops: int) -> tuple[DeduplicationPolicy, RefillExecutionSettings]: + store = CursorSeenStore.from_after( + after={"v": 2, "seen": [["dup", 10]]}, + cursor_compress=False, + cursor_max_keys=None, + ) + policy = DeduplicationPolicy( + dedup_key="id", + missing_key_policy="keep", + store=store, + seen_request_set=set(), + ) + settings = RefillExecutionSettings(overfetch_factor=1, max_refill_loops=max_refill_loops) + return policy, settings + + +@pytest.mark.asyncio +async def test_dedup_refill_stops_at_max_loops_when_only_duplicates() -> None: + node = _DuplicateOnlyNode() + executor = Executor() + + dedup_policy, settings = _policy_with_seen_dup(max_refill_loops=2) + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + ctx.dedup = dedup_policy + ctx.refill_settings = settings + + res = await executor.run(node, ctx, limit=3, next_page=FeedResultNextPage(data={})) + + assert res.data == [] + # initial call + 2 refill loops + assert getattr(node, "calls") == 3 + + +def test_decode_seen_from_cursor_raises_on_corrupt_compressed_payload() -> None: + # invalid base64 + with pytest.raises(Exception): + decode_seen_from_cursor({"v": 2, "c": "zlib+base64", "z": "not-base64"}) + + # base64 ok, zlib invalid + bad_zlib = base64.urlsafe_b64encode(b"not-a-zlib-stream").decode("ascii") + with pytest.raises(Exception): + decode_seen_from_cursor({"v": 2, "c": "zlib+base64", "z": bad_zlib}) + + # zlib ok, json invalid + bad_json = base64.urlsafe_b64encode(zlib.compress(b"not json")).decode("ascii") + with pytest.raises(Exception): + decode_seen_from_cursor({"v": 2, "c": "zlib+base64", "z": bad_json}) + + +def test_decode_seen_from_cursor_rejects_wrong_version_or_codec() -> None: + assert decode_seen_from_cursor({"v": 1, "c": "zlib+base64", "z": ""}) == {} + assert decode_seen_from_cursor({"v": 2, "c": "other", "z": ""}) == {} diff --git 
a/tests/test_executor_slots_plan_invariants.py b/tests/test_executor_slots_plan_invariants.py new file mode 100644 index 0000000..8eb5175 --- /dev/null +++ b/tests/test_executor_slots_plan_invariants.py @@ -0,0 +1,208 @@ +import pytest + +from smartfeed.execution.context import ExecutionContext +from smartfeed.execution.executor import Executor +from smartfeed.execution.plans import SlotSpec, SlotsPlan +from smartfeed.feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside +from smartfeed.policies.dedup import DeduplicationPolicy +from smartfeed.policies.seen_store import CursorSeenStore + + +class _Owner(BaseFeedConfigModel): + type: str = "test_owner" + + def __init__(self, *, name: str, **data): + super().__init__(**data) + object.__setattr__(self, "name", name) + object.__setattr__(self, "last_limit", None) + object.__setattr__(self, "calls", 0) + + async def get_data( # type: ignore[override] + self, + methods_dict, + user_id, + limit, + next_page, + redis_client=None, + ctx=None, + **params, + ) -> FeedResult: + object.__setattr__(self, "calls", int(getattr(self, "calls", 0)) + 1) + object.__setattr__(self, "last_limit", int(limit)) + return FeedResult(data=[self.name] * int(limit), next_page=FeedResultNextPage(data={}), has_next_page=False) + + +class _PagedOwner(BaseFeedConfigModel): + type: str = "test_paged_owner" + subfeed_id: str + total: int = 10 + + def __init__(self, *, subfeed_id: str, total: int = 10, **data): + super().__init__(subfeed_id=subfeed_id, total=total, **data) + object.__setattr__(self, "calls", 0) + object.__setattr__(self, "limits", []) + + async def get_data( # type: ignore[override] + self, + methods_dict, + user_id, + limit, + next_page, + redis_client=None, + ctx=None, + **params, + ) -> FeedResult: + object.__setattr__(self, "calls", int(getattr(self, "calls", 0)) + 1) + limits = list(getattr(self, "limits", [])) + limits.append(int(limit)) + object.__setattr__(self, "limits", limits) + + entry = next_page.data.get(self.subfeed_id) + offset = int(entry.after) if (entry is not None and isinstance(entry.after, int)) else 0 + + take = max(0, min(int(limit), int(self.total) - offset)) + data = [{"id": f"{self.subfeed_id}_{i}"} for i in range(offset + 1, offset + take + 1)] + new_after = offset + take + + next_page.data[self.subfeed_id] = FeedResultNextPageInside( + page=(entry.page + 1 if entry is not None else 2), + after=new_after, + ) + return FeedResult( + data=data, + next_page=next_page, + has_next_page=bool(new_after < int(self.total)), + ) + + +def _dedup_policy() -> DeduplicationPolicy: + store = CursorSeenStore.from_after(after=None, cursor_compress=False, cursor_max_keys=None) + return DeduplicationPolicy( + dedup_key="id", + missing_key_policy="keep", # type: ignore[arg-type] + store=store, + seen_request_set=set(), + ) + + +@pytest.mark.asyncio +async def test_slots_plan_limit_le_zero_calls_assemble_only() -> None: + executor = Executor() + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + + owner = _Owner(name="x") + + called = {"assemble": 0} + + def assemble(output, next_page, owner_results): + called["assemble"] += 1 + assert output == [] + assert owner_results == {} + return FeedResult(data=output, next_page=next_page, has_next_page=False) + + plan = SlotsPlan( + ctx=ctx, + limit=0, + next_page=FeedResultNextPage(data={}), + params={}, + slots=[SlotSpec(owner=owner, max_count=10)], + assemble=assemble, + ) + + res = await executor.execute_plan(plan) + assert res.data == [] + 
assert called["assemble"] == 1 + assert getattr(owner, "calls") == 0 + + +@pytest.mark.asyncio +async def test_slots_plan_owner_fetch_limits_overrides_demand() -> None: + executor = Executor() + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + + owner = _Owner(name="x") + + def assemble(output, next_page, owner_results): + return FeedResult(data=output, next_page=next_page, has_next_page=False) + + plan = SlotsPlan( + ctx=ctx, + limit=5, + next_page=FeedResultNextPage(data={}), + params={}, + slots=[SlotSpec(owner=owner, max_count=5)], + assemble=assemble, + owner_fetch_limits={id(owner): 1}, + ) + + res = await executor.execute_plan(plan) + assert getattr(owner, "calls") == 1 + assert getattr(owner, "last_limit") == 1 + assert res.data == ["x"] + + +@pytest.mark.asyncio +async def test_slots_plan_no_ops_path_assemble_still_runs() -> None: + executor = Executor() + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + + owner = _Owner(name="x") + + called = {"assemble": 0} + + def assemble(output, next_page, owner_results): + called["assemble"] += 1 + assert output == [] + assert owner_results == {} + return FeedResult(data=[], next_page=next_page, has_next_page=False) + + plan = SlotsPlan( + ctx=ctx, + limit=5, + next_page=FeedResultNextPage(data={}), + params={}, + slots=[SlotSpec(owner=owner, max_count=0)], + assemble=assemble, + ) + + res = await executor.execute_plan(plan) + assert res.data == [] + assert called["assemble"] == 1 + assert getattr(owner, "calls") == 0 + + +@pytest.mark.asyncio +async def test_slots_plan_quota_deficit_triggers_refill_wave() -> None: + executor = Executor() + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + ctx.dedup = _dedup_policy() + + a = _PagedOwner(subfeed_id="a", total=10) + b = _PagedOwner(subfeed_id="b", total=10) + + def assemble(output, next_page, owner_results): + return FeedResult(data=output, next_page=next_page, has_next_page=False) + + plan = SlotsPlan( + ctx=ctx, + limit=6, + next_page=FeedResultNextPage(data={ + "a": FeedResultNextPageInside(page=1, after=0), + "b": FeedResultNextPageInside(page=1, after=0), + }), + params={}, + slots=[ + SlotSpec(owner=a, max_count=3), + SlotSpec(owner=b, max_count=3), + ], + assemble=assemble, + # Force an initial under-fetch for owner a (quota deficit). + owner_fetch_limits={id(a): 1}, + ) + + res = await executor.execute_plan(plan) + + # a should be refilled from 1 -> 3 items + assert getattr(a, "calls") >= 2 + assert res.data[:3] == [{"id": "a_1"}, {"id": "a_2"}, {"id": "a_3"}] + assert res.data[3:] == [{"id": "b_1"}, {"id": "b_2"}, {"id": "b_3"}] From 86be2a457ac5ff55ca1d5de909fa1f70891b9232 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 15:07:11 +0000 Subject: [PATCH 22/33] Tests for when one subfeed is empty. 
--- tests/test_executor_slots_plan_invariants.py | 48 ++++++++++++++++++++ tests/test_merger_append.py | 22 +++++++++ tests/test_merger_percentage.py | 26 +++++++++++ 3 files changed, 96 insertions(+) diff --git a/tests/test_executor_slots_plan_invariants.py b/tests/test_executor_slots_plan_invariants.py index 8eb5175..74f8540 100644 --- a/tests/test_executor_slots_plan_invariants.py +++ b/tests/test_executor_slots_plan_invariants.py @@ -206,3 +206,51 @@ def assemble(output, next_page, owner_results): assert getattr(a, "calls") >= 2 assert res.data[:3] == [{"id": "a_1"}, {"id": "a_2"}, {"id": "a_3"}] assert res.data[3:] == [{"id": "b_1"}, {"id": "b_2"}, {"id": "b_3"}] + + +@pytest.mark.asyncio +async def test_slots_plan_quota_deficit_stops_refill_when_owner_exhausts() -> None: + executor = Executor() + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + ctx.dedup = _dedup_policy() + + # Owner a can never satisfy its full slot quota. + a = _PagedOwner(subfeed_id="a", total=2) + b = _PagedOwner(subfeed_id="b", total=10) + + def assemble(output, next_page, owner_results): + return FeedResult(data=output, next_page=next_page, has_next_page=False) + + plan = SlotsPlan( + ctx=ctx, + limit=6, + next_page=FeedResultNextPage( + data={ + "a": FeedResultNextPageInside(page=1, after=0), + "b": FeedResultNextPageInside(page=1, after=0), + } + ), + params={}, + slots=[ + SlotSpec(owner=a, max_count=3), + SlotSpec(owner=b, max_count=3), + ], + assemble=assemble, + # Force an initial under-fetch to create a quota deficit for a. + owner_fetch_limits={id(a): 1}, + ) + + res = await executor.execute_plan(plan) + + # a is exhausted after returning 2 total items; refill should stop. + assert getattr(a, "calls") == 2 + assert getattr(a, "limits") == [1, 2] + assert getattr(b, "calls") == 1 + + assert res.data == [ + {"id": "a_1"}, + {"id": "a_2"}, + {"id": "b_1"}, + {"id": "b_2"}, + {"id": "b_3"}, + ] diff --git a/tests/test_merger_append.py b/tests/test_merger_append.py index 309ea82..290dd04 100644 --- a/tests/test_merger_append.py +++ b/tests/test_merger_append.py @@ -1,3 +1,5 @@ +import copy + import pytest from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerAppend @@ -42,3 +44,23 @@ async def test_merger_append_with_item_1_page_2() -> None: assert merger_append_res.data == ["x_6", "x_7", "x_8", "x_9", "x_10", "x_1", "x_2", "x_3", "x_4", "x_5", "x_6"] assert merger_append_res.next_page.data["subfeed_merger_append_example"].page == 3 assert merger_append_res.next_page.data["subfeed_merger_append_example"].after == "x_10" + + +@pytest.mark.asyncio +async def test_merger_append_when_one_leaf_is_empty() -> None: + config = copy.deepcopy(MERGER_APPEND_CONFIG) + # Make the second leaf return no data + has_next_page=False. + config["items"][1]["method_name"] = "empty" + + merger_append = parse_model(MergerAppend, config) + res = await merger_append.get_data( + methods_dict=METHODS_DICT, + limit=11, + next_page=FeedResultNextPage(data={}), + user_id="x", + ) + + # Only the first subfeed contributes (it is capped to 5 by config). + assert res.data == ["x_1", "x_2", "x_3", "x_4", "x_5"] + # First subfeed's example method still reports more pages. 
+ assert res.has_next_page is True diff --git a/tests/test_merger_percentage.py b/tests/test_merger_percentage.py index 89e225e..ae57f00 100644 --- a/tests/test_merger_percentage.py +++ b/tests/test_merger_percentage.py @@ -1,3 +1,5 @@ +import copy + import pytest from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerPercentage @@ -26,3 +28,27 @@ async def test_merger_percentage() -> None: ) assert merger_percentage_res.data == ["x_4", "x_21", "x_22", "x_5", "x_23", "x_24", "x_6", "x_25", "x_26", "x_7"] + + +@pytest.mark.asyncio +async def test_merger_percentage_when_one_leaf_is_empty() -> None: + config = copy.deepcopy(MERGER_PERCENTAGE_CONFIG) + # Make the second leaf return no data + has_next_page=False. + config["items"][1]["data"]["method_name"] = "empty" + + merger_percentage = parse_model(MergerPercentage, config) + res = await merger_percentage.get_data( + methods_dict=METHODS_DICT, + limit=10, + next_page=FeedResultNextPage( + data={ + "subfeed_merger_percentage_example": FeedResultNextPageInside(page=2, after="x_3"), + "subfeed_2_merger_percentage_example": FeedResultNextPageInside(page=3, after="x_20"), + } + ), + user_id="x", + ) + + assert res.data == ["x_4", "x_5", "x_6", "x_7"] + # The non-empty leaf still reports more pages. + assert res.has_next_page is True From 218e927a5c555eb4f9095bcd2d9c1b34642c8a4c Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 21:50:23 +0000 Subject: [PATCH 23/33] Minor cleanup --- Makefile | 14 + smartfeed/execution/dedup_runtime.py | 89 ++--- smartfeed/execution/executor.py | 25 +- tests/test_async_loop_blocks_trace.py | 453 ++++++++++++++++++++++++++ 4 files changed, 498 insertions(+), 83 deletions(-) create mode 100644 tests/test_async_loop_blocks_trace.py diff --git a/Makefile b/Makefile index 7e9caf0..682148e 100644 --- a/Makefile +++ b/Makefile @@ -11,3 +11,17 @@ test: test_cache: pytest -s -vv -k "test_merger_view_session" + +.PHONY: test_async_chart charting + +# Runs only the async loop block + Chrome trace test. +# Writes trace.json next to this Makefile (project root). +test_async_chart: + rm -f ./trace.json + SMARTFEED_CHROME_TRACE=./trace.json pytest -q tests/test_async_loop_blocks_trace.py + @echo "\nWrote trace: $(CURDIR)/trace.json" + @echo "Open Chrome -> chrome://tracing -> Load -> select trace.json" + +# Convenience target: generate the trace + try to open chrome://tracing. +charting: test_async_chart + -@open -a "Google Chrome" "chrome://tracing" 2>/dev/null || true diff --git a/smartfeed/execution/dedup_runtime.py b/smartfeed/execution/dedup_runtime.py index 15773a1..3181150 100644 --- a/smartfeed/execution/dedup_runtime.py +++ b/smartfeed/execution/dedup_runtime.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Tuple from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage from .context import ExecutionContext @@ -47,11 +47,9 @@ async def run_node_with_dedup_refill( remaining = int(limit) loops = 0 - current_result = initial_result - current_next_page = current_result.next_page - current_request_limit = max(1, remaining) - has_next_page = bool(current_result.has_next_page) base_next_page = next_page + current_result = initial_result + request_limit = max(1, remaining) # NOTE: Refill loops are inherently sequential for a single node because # each subsequent request depends on the previous cursor. 
@@ -64,11 +62,11 @@ async def run_node_with_dedup_refill( limit=remaining, ) - if can_overfetch and current_request_limit > remaining: + if can_overfetch and request_limit > remaining: CursorMap.rewind_overfetch( node=node, base_next_page=base_next_page, - result_next_page=current_next_page, + result_next_page=current_result.next_page, inspected_count=inspected_count, batch_size=len(current_result.data), ) @@ -77,31 +75,27 @@ async def run_node_with_dedup_refill( collected.extend(accepted) remaining = limit - len(collected) - if remaining <= 0 or not has_next_page or loops >= max_refill_loops: + if remaining <= 0 or not current_result.has_next_page or loops >= max_refill_loops: break loops += 1 - base_next_page = current_next_page - next_request_limit = max(1, remaining) - can_overfetch = CursorMap.can_overfetch(node=node, base_next_page=base_next_page) - if can_overfetch and overfetch_factor > 1: - next_request_limit = max(1, remaining * overfetch_factor) + base_next_page = current_result.next_page + request_limit = max(1, remaining) + if CursorMap.can_overfetch(node=node, base_next_page=base_next_page) and overfetch_factor > 1: + request_limit = max(1, remaining * overfetch_factor) current_result, _plan = await self._executor._run_node_raw( node, ctx, - next_request_limit, + request_limit, base_next_page, params, ) - current_next_page = current_result.next_page - current_request_limit = next_request_limit - has_next_page = bool(current_result.has_next_page) return FeedResult( data=collected, - next_page=current_next_page, - has_next_page=has_next_page, + next_page=current_result.next_page, + has_next_page=bool(current_result.has_next_page), ) async def apply_slots_plan_dedup( @@ -150,43 +144,12 @@ async def apply_slots_plan_dedup( return owner_buffers, owner_results def _compute_slot_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> Dict[int, int]: - total_max = sum(int(s.max_count) for s in plan.slots) - quota_schedule = total_max <= int(plan.limit) - + quota_schedule = sum(int(s.max_count) for s in plan.slots) <= int(plan.limit) + deficits: Dict[int, int] = {} consumed: Dict[int, int] = {} remaining = int(plan.limit) deficit_slots: List[int] = [] - for slot in plan.slots: - if remaining <= 0: - break - - owner_id = id(slot.owner) - want = min(int(slot.max_count), remaining) - if want <= 0: - continue - - have_total = len(owner_buffers.get(owner_id, [])) - already = int(consumed.get(owner_id, 0)) - available = max(0, have_total - already) - take = min(want, available) - if take < want: - deficit_slots.append(owner_id) - consumed[owner_id] = already + take - remaining -= take - - page_underfilled = remaining > 0 - - if quota_schedule: - return self._compute_quota_deficits(plan=plan, owner_buffers=owner_buffers) - if not page_underfilled: - return {} - return self._compute_fill_deficits(plan=plan, remaining=remaining, deficit_slots=deficit_slots) - - def _compute_quota_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> Dict[int, int]: - deficits: Dict[int, int] = {} - remaining = int(plan.limit) - consumed: Dict[int, int] = {} for slot in plan.slots: if remaining <= 0: break @@ -202,19 +165,18 @@ def _compute_quota_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, L take = min(want, available) missing = max(0, want - take) if missing: - deficits[owner_id] = deficits.get(owner_id, 0) + missing + deficit_slots.append(owner_id) + if quota_schedule: + deficits[owner_id] = deficits.get(owner_id, 0) + missing consumed[owner_id] = 
already + take remaining -= take - return deficits - - def _compute_fill_deficits(self, *, plan: SlotsPlan, remaining: int, deficit_slots: List[int]) -> Dict[int, int]: - to_fill = int(remaining) - if to_fill <= 0: + if quota_schedule: + return deficits + if remaining <= 0: return {} - owner_id = deficit_slots[-1] if deficit_slots else (id(plan.slots[-1].owner) if plan.slots else None) - return {owner_id: to_fill} if owner_id is not None else {} + return {owner_id: remaining} if owner_id is not None else {} async def _refill_deficits( self, @@ -250,17 +212,12 @@ async def _refill_deficits( base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page state[refill_owner_id] = { - "owner": refill_owner, "missing_total": missing_total, - "remaining": int(missing_total), + "remaining": missing_total, "accepted": [], "loops": 0, "current_next_page": base_np, "has_next_page": True, - "last_result": None, - "last_request_limit": 0, - "last_can_overfetch": False, - "last_base_next_page": base_np, } if not state: diff --git a/smartfeed/execution/executor.py b/smartfeed/execution/executor.py index e4fb3ca..22b6a8c 100644 --- a/smartfeed/execution/executor.py +++ b/smartfeed/execution/executor.py @@ -107,12 +107,11 @@ async def _execute_slots_plan(self, plan: SlotsPlan) -> FeedResult: working_next_page = _pydantic_deep_copy(plan.next_page) cursor = CursorMap(working_next_page) - owners, owner_index = self._collect_plan_owners(plan) + owners, owner_index, owner_max_demand = self._collect_plan_owners(plan) dedup_policy = getattr(plan.ctx, "dedup", None) refill_settings = getattr(plan.ctx, "refill_settings", None) or getattr(plan.ctx, "dedup_settings", None) dedup_active = dedup_policy is not None - owner_max_demand = self._owner_slot_demand(plan) owner_buffers, owner_results = await self._run_plan_owners( plan=plan, owners=owners, @@ -137,25 +136,17 @@ async def _execute_slots_plan(self, plan: SlotsPlan) -> FeedResult: assembled = await self._maybe_await(plan.assemble(output, cursor.next_page, owner_results)) return assembled - def _owner_slot_demand(self, plan: SlotsPlan) -> Dict[int, int]: - """Compute a per-owner maximum demand based on the slot schedule.""" - - demand: Dict[int, int] = {} - for slot in plan.slots: - owner_id = id(slot.owner) - demand[owner_id] = demand.get(owner_id, 0) + int(slot.max_count) - return demand - - def _collect_plan_owners(self, plan: SlotsPlan) -> tuple[List[Any], Dict[int, int]]: + def _collect_plan_owners(self, plan: SlotsPlan) -> tuple[List[Any], Dict[int, int], Dict[int, int]]: owners: List[Any] = [] owner_index: Dict[int, int] = {} + owner_demand: Dict[int, int] = {} for slot in plan.slots: owner_id = id(slot.owner) - if owner_id in owner_index: - continue - owner_index[owner_id] = len(owners) - owners.append(slot.owner) - return owners, owner_index + if owner_id not in owner_index: + owner_index[owner_id] = len(owners) + owners.append(slot.owner) + owner_demand[owner_id] = owner_demand.get(owner_id, 0) + int(slot.max_count) + return owners, owner_index, owner_demand async def _run_owner( self, diff --git a/tests/test_async_loop_blocks_trace.py b/tests/test_async_loop_blocks_trace.py new file mode 100644 index 0000000..2dc8def --- /dev/null +++ b/tests/test_async_loop_blocks_trace.py @@ -0,0 +1,453 @@ +import asyncio +import json +import os +import time +from dataclasses import dataclass, field +from typing import Any, Awaitable, Callable, Dict, List, Optional + +import pytest + +from smartfeed.schemas import 
FeedResultNextPage, MergerDeduplication +from tests.fixtures import dedup_helpers as dh +from tests.fixtures.redis import redis_client # noqa: F401 +from tests.utils import parse_model + + +def _now_us() -> int: + return time.perf_counter_ns() // 1000 + + +@dataclass +class ChromeTraceRecorder: + """Writes Chrome Trace Events JSON for chrome://tracing. + + This is intentionally tiny and test-only: no production dependencies. + """ + + pid: int = 1 + events: List[Dict[str, Any]] = field(default_factory=list) + + def _emit(self, event: Dict[str, Any]) -> None: + self.events.append(event) + + def begin(self, name: str, *, tid: int, ts_us: Optional[int] = None, args: Optional[Dict[str, Any]] = None) -> None: + self._emit( + { + "name": name, + "ph": "B", + "ts": int(_now_us() if ts_us is None else ts_us), + "pid": int(self.pid), + "tid": int(tid), + "args": args or {}, + } + ) + + def end(self, name: str, *, tid: int, ts_us: Optional[int] = None, args: Optional[Dict[str, Any]] = None) -> None: + self._emit( + { + "name": name, + "ph": "E", + "ts": int(_now_us() if ts_us is None else ts_us), + "pid": int(self.pid), + "tid": int(tid), + "args": args or {}, + } + ) + + def instant(self, name: str, *, tid: int, ts_us: Optional[int] = None, args: Optional[Dict[str, Any]] = None) -> None: + self._emit( + { + "name": name, + "ph": "i", + "s": "t", + "ts": int(_now_us() if ts_us is None else ts_us), + "pid": int(self.pid), + "tid": int(tid), + "args": args or {}, + } + ) + + def write(self, path: str) -> None: + payload = {"traceEvents": self.events} + with open(path, "w", encoding="utf-8") as f: + json.dump(payload, f) + + +class LoopBlockMonitor: + """Detects event-loop blocking by measuring scheduling lag. + + If the event loop is blocked by long sync work, a periodic sleeper will wake + up late; we track the maximum observed lag. 
+ """ + + def __init__(self, *, sample_interval_s: float = 0.01, block_threshold_s: float = 0.25) -> None: + self.sample_interval_s = float(sample_interval_s) + self.block_threshold_s = float(block_threshold_s) + self.max_lag_s: float = 0.0 + self.block_events: List[float] = [] + self._task: Optional[asyncio.Task[None]] = None + self._stop = asyncio.Event() + + async def __aenter__(self) -> "LoopBlockMonitor": + self._stop.clear() + self._task = asyncio.create_task(self._run()) + return self + + async def __aexit__(self, exc_type, exc, tb) -> None: # type: ignore[override] + self._stop.set() + if self._task is not None: + await self._task + + async def _run(self) -> None: + loop = asyncio.get_running_loop() + expected = loop.time() + self.sample_interval_s + while not self._stop.is_set(): + await asyncio.sleep(self.sample_interval_s) + now = loop.time() + lag = max(0.0, now - expected) + expected = now + self.sample_interval_s + self.max_lag_s = max(self.max_lag_s, lag) + if lag >= self.block_threshold_s: + self.block_events.append(lag) + + +@dataclass +class LeafConcurrencyTracker: + """Tracks how many leaf calls are in-flight concurrently.""" + + current: int = 0 + peak: int = 0 + _lock: asyncio.Lock = field(default_factory=asyncio.Lock) + + async def enter(self) -> int: + async with self._lock: + self.current += 1 + if self.current > self.peak: + self.peak = self.current + return self.current + + async def exit(self) -> int: + async with self._lock: + self.current = max(0, self.current - 1) + return self.current + + +def _trace_wrap_awaitable(rec: ChromeTraceRecorder, name: str, awaitable: Awaitable[Any], *, args: Dict[str, Any]) -> Awaitable[Any]: + async def _wrapped() -> Any: + task = asyncio.current_task() + tid = id(task) if task is not None else 0 + rec.begin(name, tid=tid, args=args) + try: + return await awaitable + finally: + rec.end(name, tid=tid) + + return _wrapped() + + +def _wrap_method_latency(method: Callable[..., Awaitable[Any]], *, latency_s: float) -> Callable[..., Awaitable[Any]]: + async def _wrapped(*args: Any, **kwargs: Any) -> Any: + await asyncio.sleep(latency_s) + return await method(*args, **kwargs) + + return _wrapped + + +def _wrap_leaf_method_traced( + *, + rec: ChromeTraceRecorder, + key: str, + method: Callable[..., Awaitable[Any]], + latency_s: float, + concurrency: LeafConcurrencyTracker, +) -> Callable[..., Awaitable[Any]]: + async def _wrapped(user_id: Any, limit: int, next_page: Any, **kwargs: Any) -> Any: + task = asyncio.current_task() + tid = id(task) if task is not None else 0 + + page = getattr(next_page, "page", None) + after = getattr(next_page, "after", None) + after_type = type(after).__name__ + + if after is None: + after_preview = None + else: + after_preview = str(after) + if len(after_preview) > 120: + after_preview = after_preview[:117] + "..." 
+ + span = f"leaf.{key}" + + in_flight = await concurrency.enter() + rec.begin( + span, + tid=tid, + args={ + "key": key, + "limit": int(limit), + "page": page, + "after_type": after_type, + "after_preview": after_preview, + "in_flight": int(in_flight), + }, + ) + try: + if latency_s > 0: + await asyncio.sleep(float(latency_s)) + return await method(user_id, limit, next_page, **kwargs) + finally: + rec.end(span, tid=tid) + await concurrency.exit() + + return _wrapped + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_async_loop_blocks_and_trace_for_deep_tree_all_mergers(redis_client, monkeypatch, tmp_path) -> None: + """A smoke-test for detecting async loop blocks + visualizing concurrency. + + - Builds one deep tree that includes ALL merger types. + - Simulates 2 sequential requests (fresh + next page). + - Forces refills by creating lots of cross-branch duplicates. + - Records loop scheduling lag (blocks/hangs) and optionally exports a Chrome trace. + + Set `SMARTFEED_CHROME_TRACE=/path/to/trace.json` to write a trace. + Open it in Chrome via chrome://tracing. + """ + + # Keep IDs disjoint across sources so "no dupes" is stable. + # Refill waves are forced via max_per_call limits (under-fetch), not via dedup collisions. + items_a = dh.make_items("A", 1, 400, user_id_mod=5, id_offset=1_000) + items_b = dh.make_items("B", 1, 400, user_id_mod=5, id_offset=10_000) + + # Distribute branch: needs distribution_key present (user_id). + items_posted_1 = dh.make_items("posted_1", 1, 80, user_id_mod=3, id_offset=20_000) + items_posted_2 = dh.make_items("posted_2", 1, 120, user_id_mod=3, id_offset=21_000) + + # Gradient branch: overlapping ids again. + items_g1 = dh.make_items("G1", 1, 250, user_id_mod=7, id_offset=30_000) + items_g2 = dh.make_items("G2", 1, 250, user_id_mod=7, id_offset=40_000) + + # View-session leaf. + items_vs = dh.make_items("VS", 1, 160, user_id_mod=11, id_offset=50_000) + + # Positional leaf that intentionally under-fetches to force refill waves. + items_pos_leaf = dh.make_items("POS", 1, 500, user_id_mod=13, id_offset=60_000) + + # --- tracing (test-only monkeypatch) --- + rec = ChromeTraceRecorder() + leaf_concurrency = LeafConcurrencyTracker() + + # Leaf method tracing: wrap the *actual* subfeed method calls. + # These spans are what you want to inspect for "are leaf calls parallel?". 
+ leaf_latency_s = 0.02 + methods_dict = { + "a": _wrap_leaf_method_traced( + rec=rec, + key="a", + method=dh.make_offset_paged_method(items_a), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + "b": _wrap_leaf_method_traced( + rec=rec, + key="b", + method=dh.make_offset_paged_method(items_b), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + "posted_1": _wrap_leaf_method_traced( + rec=rec, + key="posted_1", + method=dh.make_offset_paged_method(items_posted_1), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + "posted_2": _wrap_leaf_method_traced( + rec=rec, + key="posted_2", + method=dh.make_offset_paged_method(items_posted_2), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + "g1": _wrap_leaf_method_traced( + rec=rec, + key="g1", + method=dh.make_offset_paged_method(items_g1), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + "g2": _wrap_leaf_method_traced( + rec=rec, + key="g2", + method=dh.make_offset_paged_method(items_g2), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + "vs": _wrap_leaf_method_traced( + rec=rec, + key="vs", + method=dh.make_offset_paged_method(items_vs), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + # Fetch only 1 item per call even if demand is higher -> triggers refill loops. + "pos_leaf": _wrap_leaf_method_traced( + rec=rec, + key="pos_leaf", + method=dh.make_offset_paged_method(items_pos_leaf, max_per_call=1), + latency_s=leaf_latency_s, + concurrency=leaf_concurrency, + ), + } + + view_session_cfg = { + "merger_id": "vs_all", + "type": "merger_view_session", + "session_size": 100, + "session_live_time": 60, + "deduplicate": True, + "dedup_key": "id", + "data": dh._subfeed("sf_vs", "vs"), + } + + pct_cfg = dh._percentage_config( + "pct_all", + items=dh._percentage_items(dh._subfeed("sf_a", "a"), dh._subfeed("sf_b", "b"), first_pct=50, second_pct=50), + ) + + pos_cfg = dh._positional_config( + "pos_all", + # Ensure positional inserts appear across pages for limit~12. + # Use even positions so the schedule starts with the default branch; + # this keeps ordering deterministic. + positions=[2, 4, 6, 8, 10, 12, 14, 16, 18], + positional=dh._subfeed("sf_pos_leaf", "pos_leaf"), + default=pct_cfg, + ) + + dist_cfg = dh._distribute_config( + "dist_all", + items=[dh._subfeed("sf_posted_1", "posted_1"), dh._subfeed("sf_posted_2", "posted_2")], + distribution_key="user_id", + ) + + grad_cfg = dh._gradient_config( + "grad_all", + item_from={"percentage": 70, "data": dh._subfeed("sf_g1", "g1")}, + item_to={"percentage": 30, "data": dh._subfeed("sf_g2", "g2")}, + step=10, + size_to_step=5, + shuffle=False, + ) + + # Include all merger types as siblings so they are executed (and visible in trace), + # while keeping the main output driven by the first branch. + deep_tree = dh._append_config("append_all", [pos_cfg, view_session_cfg, dist_cfg, grad_cfg]) + config = dh._dedup_config( + "dedup_all", + deep_tree, + dedup_key="id", + state_backend="cursor", + overfetch_factor=3, + max_refill_loops=50, + ) + merger = parse_model(MergerDeduplication, config) + + # Patch Executor.gather to wrap each awaitable for Chrome trace. 
+ from smartfeed.execution.executor import Executor # local import for monkeypatch + + original_gather = Executor.gather + + async def _gather_traced(self: Any, *coros: Any) -> List[Any]: + wrapped = [ + _trace_wrap_awaitable(rec, "executor.gather.op", c, args={"idx": i, "total": len(coros)}) + for i, c in enumerate(coros) + ] + task = asyncio.current_task() + tid = id(task) if task is not None else 0 + rec.begin("executor.gather", tid=tid, args={"n": len(coros)}) + try: + return await original_gather(self, *wrapped) + finally: + rec.end("executor.gather", tid=tid) + + monkeypatch.setattr(Executor, "gather", _gather_traced) + + # Patch Executor.run to show sequential refill loops vs plan execution. + original_run = Executor.run + + async def _run_traced( + self: Any, + node: Any, + ctx: Any, + limit: int, + next_page: Any, + **params: Any, + ) -> Any: + task = asyncio.current_task() + tid = id(task) if task is not None else 0 + node_type = getattr(node, "type", node.__class__.__name__) + node_id = getattr(node, "merger_id", getattr(node, "subfeed_id", None)) + rec.begin( + "executor.run_node", + tid=tid, + args={"node_type": node_type, "node_id": node_id, "limit": int(limit)}, + ) + try: + return await original_run(self, node, ctx, limit, next_page, **params) + finally: + rec.end("executor.run_node", tid=tid) + + monkeypatch.setattr(Executor, "run", _run_traced) + + # --- run: fresh request + next_page --- + limit = 12 + np0 = FeedResultNextPage(data={}) + + async with LoopBlockMonitor(sample_interval_s=0.01, block_threshold_s=0.05) as monitor: + res1 = await asyncio.wait_for( + merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=limit, + next_page=np0, + redis_client=redis_client, + ), + timeout=15, + ) + res2 = await asyncio.wait_for( + merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=limit, + next_page=res1.next_page, + redis_client=redis_client, + ), + timeout=15, + ) + + # Sanity: we should fill the page and maintain dedup invariants. + assert len(res1.data) == limit + assert len({x["id"] for x in res1.data}) == limit + assert len(res2.data) == limit + assert len({x["id"] for x in res2.data}) == limit + + # Hard assertion: leaf calls must overlap (async concurrency), not serialize. + assert leaf_concurrency.peak > 1 + + # Primary signal: event-loop should remain responsive under load. + assert monitor.max_lag_s < 0.1 + + out = os.environ.get("SMARTFEED_CHROME_TRACE") + if out: + # Allow writing to an explicit file path, or to a directory. + out_path = out + if os.path.isdir(out_path): + out_path = os.path.join(out_path, "smartfeed_trace.json") + rec.instant("loop.max_lag", tid=0, args={"max_lag_s": monitor.max_lag_s, "blocks": len(monitor.block_events)}) + rec.write(out_path) + + # Keep references so this test remains useful in local debugging. + _ = tmp_path From 4e6bfefa61daf20e0046b34ec875f552415466c6 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 22:32:30 +0000 Subject: [PATCH 24/33] Remove some boilerplate from mergers. 
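
Move the common get_data wiring (create an ExecutionContext when none is
passed, attach the redis client, delegate to the executor) into
BaseFeedConfigModel.get_data. Concrete mergers now only implement
build_plan; get_data raises NotImplementedError for nodes that provide
neither.

As a rough illustration of the resulting contract, a hypothetical
pass-through merger could look roughly like this. The keyword names of
build_plan are assumed from the SlotsPlan fields used elsewhere in this
series, not a confirmed signature:

    from typing import Any, Dict, List, Literal

    from smartfeed.execution.context import ExecutionContext
    from smartfeed.execution.executor import SlotSpec, SlotsPlan
    from smartfeed.feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage


    class MergerPassthrough(BaseFeedConfigModel):
        """Hypothetical example only: one child, no get_data override."""

        merger_id: str
        type: Literal["merger_passthrough"] = "merger_passthrough"
        data: Any

        def build_plan(
            self,
            *,
            ctx: ExecutionContext,
            limit: int,
            next_page: FeedResultNextPage,
            params: Dict[str, Any],
        ) -> SlotsPlan:
            def _assemble(output: List[Any], np: FeedResultNextPage, owner_results) -> FeedResult:
                has_next = any(r.has_next_page for r in owner_results.values())
                return FeedResult(data=output[:limit], next_page=np, has_next_page=has_next)

            # A single slot that forwards the whole page budget to the child.
            return SlotsPlan(
                ctx=ctx,
                limit=limit,
                next_page=next_page,
                params=params,
                slots=[SlotSpec(owner=self.data, max_count=limit)],
                assemble=_assemble,
            )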
--- smartfeed/execution/context.py | 1 - smartfeed/execution/dedup_runtime.py | 10 +++-- smartfeed/execution/executor.py | 3 +- smartfeed/feed_models.py | 21 ++++++++-- smartfeed/mergers/append.py | 23 +---------- smartfeed/mergers/append_distribute.py | 23 +---------- smartfeed/mergers/deduplication.py | 23 +---------- smartfeed/mergers/percentage.py | 22 +---------- smartfeed/mergers/percentage_gradient.py | 22 +---------- smartfeed/mergers/positional.py | 22 +---------- smartfeed/mergers/view_session.py | 50 +++++++++--------------- smartfeed/policies/seen_store.py | 16 +++++--- tests/test_merger_deduplication.py | 44 +++++++++++++++++++++ tests/test_seen_store_unit.py | 14 +++++++ 14 files changed, 117 insertions(+), 177 deletions(-) diff --git a/smartfeed/execution/context.py b/smartfeed/execution/context.py index 134c261..18fc769 100644 --- a/smartfeed/execution/context.py +++ b/smartfeed/execution/context.py @@ -26,7 +26,6 @@ class ExecutionContext: # Execution settings (optional) refill_settings: Optional["RefillExecutionSettings"] = None - dedup_settings: Optional["RefillExecutionSettings"] = None def ensure_redis_client(self, redis_client: Optional[Union[redis.Redis, AsyncRedis]]) -> None: if self.redis_client is None and redis_client is not None: diff --git a/smartfeed/execution/dedup_runtime.py b/smartfeed/execution/dedup_runtime.py index 3181150..90dadc7 100644 --- a/smartfeed/execution/dedup_runtime.py +++ b/smartfeed/execution/dedup_runtime.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Dict, List, Tuple +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage from .context import ExecutionContext @@ -22,7 +22,7 @@ def __init__(self, executor: "Executor") -> None: self._executor = executor def _get_refill_settings(self, ctx: ExecutionContext) -> Any: - return getattr(ctx, "refill_settings", None) or getattr(ctx, "dedup_settings", None) + return getattr(ctx, "refill_settings", None) async def run_node_with_dedup_refill( self, @@ -175,8 +175,10 @@ def _compute_slot_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, Li return deficits if remaining <= 0: return {} - owner_id = deficit_slots[-1] if deficit_slots else (id(plan.slots[-1].owner) if plan.slots else None) - return {owner_id: remaining} if owner_id is not None else {} + fallback_owner_id: Optional[int] = ( + deficit_slots[-1] if deficit_slots else (id(plan.slots[-1].owner) if plan.slots else None) + ) + return {fallback_owner_id: remaining} if fallback_owner_id is not None else {} async def _refill_deficits( self, diff --git a/smartfeed/execution/executor.py b/smartfeed/execution/executor.py index 22b6a8c..11eb551 100644 --- a/smartfeed/execution/executor.py +++ b/smartfeed/execution/executor.py @@ -109,7 +109,7 @@ async def _execute_slots_plan(self, plan: SlotsPlan) -> FeedResult: cursor = CursorMap(working_next_page) owners, owner_index, owner_max_demand = self._collect_plan_owners(plan) dedup_policy = getattr(plan.ctx, "dedup", None) - refill_settings = getattr(plan.ctx, "refill_settings", None) or getattr(plan.ctx, "dedup_settings", None) + refill_settings = getattr(plan.ctx, "refill_settings", None) dedup_active = dedup_policy is not None owner_buffers, owner_results = await self._run_plan_owners( @@ -167,7 +167,6 @@ async def _run_owner( executor=plan.ctx.executor, dedup=None, refill_settings=None, - dedup_settings=None, ) return await self.run(owner, owner_ctx, demand, 
isolated_next_page, **plan.params) diff --git a/smartfeed/feed_models.py b/smartfeed/feed_models.py index 6d29d67..38d26d6 100644 --- a/smartfeed/feed_models.py +++ b/smartfeed/feed_models.py @@ -1,6 +1,5 @@ import asyncio import inspect -from abc import ABC, abstractmethod from dataclasses import dataclass from random import shuffle from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast @@ -68,14 +67,13 @@ class FeedResultClient(BaseModel): has_next_page: bool -class BaseFeedConfigModel(ABC, BaseModel): +class BaseFeedConfigModel(BaseModel): """Base class for merger/subfeed config models.""" # Higher value means the item should "win" deduplication when duplicates exist. # This is primarily used by MergerDeduplication and by mergers when a dedup wrapper is active. dedup_priority: int = 0 - @abstractmethod async def get_data( self, methods_dict: Dict[str, Callable], @@ -86,7 +84,22 @@ async def get_data( ctx: Optional["ExecutionContext"] = None, **params: Any, ) -> FeedResult: - """Fetch data according to this node config.""" + """Default merger execution path via the shared executor.""" + + if not callable(getattr(self, "build_plan", None)): + raise NotImplementedError( + f"{self.__class__.__name__} must implement build_plan(...) or override get_data(...)." + ) + + if ctx is None: + from .execution.context import ExecutionContext as _ExecutionContext + + ctx = _ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) + else: + ctx.ensure_redis_client(redis_client) + + executor = ctx.ensure_executor() + return await executor.run(self, ctx, limit, next_page, **params) @dataclass diff --git a/smartfeed/mergers/append.py b/smartfeed/mergers/append.py index a415159..9c5c5c6 100644 --- a/smartfeed/mergers/append.py +++ b/smartfeed/mergers/append.py @@ -1,10 +1,7 @@ from __future__ import annotations from random import shuffle -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union, cast - -import redis -from redis.asyncio import Redis as AsyncRedis +from typing import TYPE_CHECKING, Any, Dict, List, Literal, cast from ..execution.context import ExecutionContext from ..execution.executor import SlotSpec, SlotsPlan @@ -49,21 +46,3 @@ def _assemble( slots=slots, assemble=_assemble, ) - - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/append_distribute.py b/smartfeed/mergers/append_distribute.py index 3e0a8e1..3ef891d 100644 --- a/smartfeed/mergers/append_distribute.py +++ b/smartfeed/mergers/append_distribute.py @@ -1,10 +1,7 @@ from __future__ import annotations from collections import defaultdict, deque -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union - -import redis -from redis.asyncio import Redis as AsyncRedis +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional from typing_extensions import no_type_check from ..execution.context import ExecutionContext @@ -73,21 +70,3 @@ def _assemble( slots=slots, assemble=_assemble, ) 
- - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) diff --git a/smartfeed/mergers/deduplication.py b/smartfeed/mergers/deduplication.py index 2686ac9..1b35a7b 100644 --- a/smartfeed/mergers/deduplication.py +++ b/smartfeed/mergers/deduplication.py @@ -1,10 +1,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Dict, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Literal, Optional -import redis from pydantic import PrivateAttr, model_validator -from redis.asyncio import Redis as AsyncRedis from ..execution.context import ExecutionContext, RefillExecutionSettings from ..execution.cursors import CursorMap @@ -102,24 +100,6 @@ def _build_redis_state_key(self, user_id: Any, params: Dict[str, Any]) -> str: return f"dedup:{self.merger_id}:{user_id}:{suffix}" return f"dedup:{self.merger_id}:{user_id}" - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) - def build_plan( self, *, @@ -185,7 +165,6 @@ async def _run(executor: Any) -> FeedResult: executor=ctx.executor, dedup=policy, refill_settings=refill_settings, - dedup_settings=refill_settings, ) child = _pydantic_deep_copy(self.data) diff --git a/smartfeed/mergers/percentage.py b/smartfeed/mergers/percentage.py index 1b034b4..a21fbd3 100644 --- a/smartfeed/mergers/percentage.py +++ b/smartfeed/mergers/percentage.py @@ -1,11 +1,9 @@ from __future__ import annotations from random import shuffle -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Dict, List, Literal, cast -import redis from pydantic import BaseModel -from redis.asyncio import Redis as AsyncRedis from ..execution.context import ExecutionContext from ..execution.executor import SlotSpec, SlotsPlan @@ -56,24 +54,6 @@ def _merge_items_data(items_data: List[List]) -> List: return result - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) - def build_plan( self, *, diff --git a/smartfeed/mergers/percentage_gradient.py b/smartfeed/mergers/percentage_gradient.py index b1ff586..1aea051 100644 --- 
a/smartfeed/mergers/percentage_gradient.py +++ b/smartfeed/mergers/percentage_gradient.py @@ -1,9 +1,7 @@ from random import shuffle -from typing import Any, Callable, Dict, List, Literal, Optional, Union, cast +from typing import Any, Dict, List, Literal, cast -import redis from pydantic import model_validator -from redis.asyncio import Redis as AsyncRedis from ..execution.context import ExecutionContext from ..execution.executor import SlotSpec, SlotsPlan @@ -69,24 +67,6 @@ def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: return result - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) - def build_plan( self, *, diff --git a/smartfeed/mergers/positional.py b/smartfeed/mergers/positional.py index 3ac9b32..6023aae 100644 --- a/smartfeed/mergers/positional.py +++ b/smartfeed/mergers/positional.py @@ -1,10 +1,8 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional -import redis from pydantic import model_validator -from redis.asyncio import Redis as AsyncRedis from ..execution.context import ExecutionContext from ..execution.executor import SlotSpec, SlotsPlan @@ -38,24 +36,6 @@ def validate_merger_positional(self) -> "MergerPositional": raise ValueError('"end" must be bigger than "start"') return self - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) - def build_plan( self, *, diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index 8adf814..dad3314 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -2,7 +2,7 @@ import logging from random import shuffle -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union import redis from redis.asyncio import Redis as AsyncRedis @@ -11,6 +11,7 @@ from ..execution.context import ExecutionContext from ..execution.executor import CallablePlan from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside, _redis_call +from ..policies.dedup import entity_key if TYPE_CHECKING: from ..schemas import FeedTypes @@ -25,24 +26,27 @@ class MergerViewSession(BaseFeedConfigModel): session_live_time: int data: "FeedTypes" deduplicate: bool = False - dedup_key: str = None # type: ignore + dedup_key: Optional[str] = None + missing_key_policy: Literal["error", "keep", "drop"] = "error" shuffle: bool = False def 
_get_dedup_key_or_attr(self, item: Any) -> str: - if not self.dedup_key: - return item - - try: - dedup_value = item.get(self.dedup_key) - except AttributeError: - dedup_value = getattr(item, self.dedup_key, None) - - assert dedup_value is not None, f"Deduplication failed: entity {item} has no key or attr {self.dedup_key}" - return dedup_value + key = entity_key(item, self.dedup_key, self.missing_key_policy) + assert key is not None, "Deduplication key is missing and item was dropped by missing_key_policy='drop'" + return key def _dedup_data(self, data: List[Any]) -> List[Any]: - deduplicated_data = {self._get_dedup_key_or_attr(item): item for item in data} - return list(deduplicated_data.values()) + deduplicated: List[Any] = [] + seen: set[str] = set() + for item in data: + key = entity_key(item, self.dedup_key, self.missing_key_policy) + if key is None: + continue + if key in seen: + continue + seen.add(key) + deduplicated.append(item) + return deduplicated async def _set_cache( self, @@ -110,24 +114,6 @@ async def _get_cache( has_next_page=bool(len(session_data) > limit * page), ) - async def get_data( - self, - methods_dict: Dict[str, Callable], - user_id: Any, - limit: int, - next_page: FeedResultNextPage, - redis_client: Optional[Union[redis.Redis, AsyncRedis]] = None, - ctx: Optional[ExecutionContext] = None, - **params: Any, - ) -> FeedResult: - if ctx is None: - ctx = ExecutionContext(methods_dict=methods_dict, user_id=user_id, redis_client=redis_client) - else: - ctx.ensure_redis_client(redis_client) - - executor = ctx.ensure_executor() - return await executor.run(self, ctx, limit, next_page, **params) - def build_plan( self, *, diff --git a/smartfeed/policies/seen_store.py b/smartfeed/policies/seen_store.py index d0a9258..c4712c3 100644 --- a/smartfeed/policies/seen_store.py +++ b/smartfeed/policies/seen_store.py @@ -35,7 +35,7 @@ class CursorSeenStore: cursor_max_keys: Optional[int] seen_priority_map: Dict[str, int] - seen_updates_in_order: List[Tuple[str, int]] + seen_order: List[str] @classmethod def from_after( @@ -50,7 +50,7 @@ def from_after( cursor_compress=cursor_compress, cursor_max_keys=cursor_max_keys, seen_priority_map=seen_priority_map, - seen_updates_in_order=[], + seen_order=list(seen_priority_map.keys()), ) async def prefetch(self, keys: List[str]) -> None: @@ -64,15 +64,21 @@ def set_max(self, key: str, priority: int) -> None: if existing is not None and priority <= existing: return self.seen_priority_map[key] = priority - self.seen_updates_in_order.append((key, priority)) + if key in self.seen_order: + self.seen_order.remove(key) + self.seen_order.append(key) async def reset(self) -> None: self.seen_priority_map.clear() - self.seen_updates_in_order.clear() + self.seen_order.clear() async def commit(self) -> Any: + # Persist the full snapshot so dedup state survives beyond 2 pages. 
+ seen_snapshot_in_order: List[Tuple[str, int]] = [ + (key, self.seen_priority_map[key]) for key in self.seen_order if key in self.seen_priority_map + ] return encode_seen_for_cursor( - self.seen_updates_in_order, + seen_snapshot_in_order, cursor_compress=self.cursor_compress, cursor_max_keys=self.cursor_max_keys, ) diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index ab9838f..35fb3d1 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -318,6 +318,50 @@ async def test_dedup_page_zero_resets_seen_and_descendant_cursors() -> None: assert res_2.next_page.data["sf_stream"].page == 2 +@pytest.mark.asyncio +async def test_dedup_cursor_backend_persists_seen_state_beyond_two_pages() -> None: + # First 2 pages are unique, then page 3 starts with duplicates from page 1. + items = dh.make_items("S", 1, 11) + dh.make_items("S", 1, 4) + dh.make_items("S", 11, 31) + methods_dict = {"s": dh.make_offset_paged_method(items)} + + config = dh._dedup_config( + "dedup_cursor_3p", + dh._subfeed("sf_stream", "s"), + state_backend="cursor", + ) + merger = parse_model(MergerDeduplication, config) + + res_1 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=FeedResultNextPage(data={}), + ) + res_2 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=res_1.next_page, + ) + res_3 = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=5, + next_page=res_2.next_page, + ) + + ids_1 = dh._ids(res_1.data) + ids_2 = dh._ids(res_2.data) + ids_3 = dh._ids(res_3.data) + + assert ids_1 == [1, 2, 3, 4, 5] + assert ids_2 == [6, 7, 8, 9, 10] + assert ids_3 == [11, 12, 13, 14, 15] + assert not (set(ids_1) & set(ids_2)) + assert not (set(ids_1) & set(ids_3)) + assert not (set(ids_2) & set(ids_3)) + + @pytest.mark.asyncio async def test_dedup_append_cursor_backend_across_pages_and_refill_advances_leaf_cursor_exactly() -> None: """Append: across pages there is no overlap; refill advances cursors correctly. diff --git a/tests/test_seen_store_unit.py b/tests/test_seen_store_unit.py index 50dab93..f70661f 100644 --- a/tests/test_seen_store_unit.py +++ b/tests/test_seen_store_unit.py @@ -22,6 +22,20 @@ async def test_cursor_seen_store_set_max_and_commit_roundtrip() -> None: assert decoded == {"a": 1, "b": 2} +@pytest.mark.asyncio +async def test_cursor_seen_store_commit_keeps_previous_cursor_state() -> None: + store = CursorSeenStore.from_after( + after={"v": 2, "seen": [["a", 1], ["b", 2]]}, + cursor_compress=False, + cursor_max_keys=None, + ) + store.set_max("c", 3) + + after = await store.commit() + decoded = decode_seen_from_cursor(after) + assert decoded == {"a": 1, "b": 2, "c": 3} + + @pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) @pytest.mark.asyncio async def test_redis_seen_store_prefetch_set_max_commit_and_reset(redis_client) -> None: From e3310f06e3291286ad1fdca8897c2b2b58c227f4 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Sun, 8 Feb 2026 23:38:07 +0000 Subject: [PATCH 25/33] Patch for positional leak when underfetched. 
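
MergerDeduplication runs its child owners with dedup disabled in the
owner-level ExecutionContext. Previously that context also dropped
refill_settings, so a nested SlotsPlan (for example a positional merger
whose positional leaf under-fetches) could never compensate its own slot
deficits; the parent append then backfilled the page from a sibling and the
positional slots leaked away. This patch keeps refill_settings in the owner
context and adds apply_slots_plan_refill, a refill pass that tops up
under-filled owners without any dedup arbitration.

A deliberately simplified sketch of that dedup-free refill idea (the real
pass batches all deficit owners per wave via the executor's gather and
merges cursor deltas; fetch() here is only a stand-in for re-running one
owner):

    from typing import Any, Awaitable, Callable, Dict, List, Tuple

    async def refill_without_dedup(
        fetch: Callable[[int, int], Awaitable[Tuple[List[Any], bool]]],
        deficits: Dict[int, int],
        max_refill_loops: int = 20,
    ) -> Dict[int, List[Any]]:
        # fetch(owner_id, want) -> (items, has_next_page)
        accepted: Dict[int, List[Any]] = {owner_id: [] for owner_id in deficits}
        for owner_id, missing in deficits.items():
            loops = 0
            has_next = True
            while missing > 0 and has_next and loops < max_refill_loops:
                items, has_next = await fetch(owner_id, missing)
                take = items[:missing]
                accepted[owner_id].extend(take)
                missing -= len(take)
                loops += 1
        return accepted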
--- smartfeed/execution/dedup_runtime.py | 135 +++++++++++++++++++++++++++ smartfeed/execution/executor.py | 14 ++- tests/test_merger_deduplication.py | 62 ++++++++++++ 3 files changed, 210 insertions(+), 1 deletion(-) diff --git a/smartfeed/execution/dedup_runtime.py b/smartfeed/execution/dedup_runtime.py index 90dadc7..1b9b26c 100644 --- a/smartfeed/execution/dedup_runtime.py +++ b/smartfeed/execution/dedup_runtime.py @@ -143,6 +143,32 @@ async def apply_slots_plan_dedup( return owner_buffers, owner_results + async def apply_slots_plan_refill( + self, + *, + plan: SlotsPlan, + owners: List[Any], + owner_index: Dict[int, int], + owner_buffers: Dict[int, List[Any]], + owner_results: Dict[int, FeedResult], + refill_settings: Any, + cursor: CursorMap, + ) -> Tuple[Dict[int, List[Any]], Dict[int, FeedResult]]: + deficits = self._compute_slot_deficits(plan=plan, owner_buffers=owner_buffers) + if deficits: + await self._refill_deficits_without_dedup( + plan=plan, + deficits=deficits, + owners=owners, + owner_index=owner_index, + owner_buffers=owner_buffers, + owner_results=owner_results, + refill_settings=refill_settings, + cursor=cursor, + ) + + return owner_buffers, owner_results + def _compute_slot_deficits(self, *, plan: SlotsPlan, owner_buffers: Dict[int, List[Any]]) -> Dict[int, int]: quota_schedule = sum(int(s.max_count) for s in plan.slots) <= int(plan.limit) deficits: Dict[int, int] = {} @@ -311,3 +337,112 @@ async def _refill_deficits( next_page=owner_state["current_next_page"], has_next_page=owner_state["has_next_page"], ) + + async def _refill_deficits_without_dedup( + self, + *, + plan: SlotsPlan, + deficits: Dict[int, int], + owners: List[Any], + owner_index: Dict[int, int], + owner_buffers: Dict[int, List[Any]], + owner_results: Dict[int, FeedResult], + refill_settings: Any, + cursor: CursorMap, + ) -> None: + max_refill_loops = max(1, int(getattr(refill_settings, "max_refill_loops", 20))) + + deficit_owners: List[Any] = [o for o in owners if id(o) in deficits] + deficit_owners = sorted( + deficit_owners, + key=lambda o: ( + int(getattr(o, "dedup_priority", 0)), + owner_index.get(id(o), 0), + ), + ) + + state: Dict[int, Dict[str, Any]] = {} + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + missing_total = int(deficits.get(refill_owner_id, 0)) + if missing_total <= 0: + continue + + base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page + state[refill_owner_id] = { + "missing_total": missing_total, + "remaining": missing_total, + "accepted": [], + "loops": 0, + "current_next_page": base_np, + "has_next_page": True, + } + + if not state: + return + + while True: + wave_ops: List[Tuple[Any, int, FeedResultNextPage, int]] = [] + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + if owner_state["remaining"] <= 0: + continue + if not owner_state["has_next_page"]: + continue + if owner_state["loops"] >= max_refill_loops: + continue + + base_np = owner_state["current_next_page"] + request_limit = max(1, int(owner_state["remaining"])) + wave_ops.append((refill_owner, refill_owner_id, base_np, request_limit)) + + if not wave_ops: + break + + results = await self._executor.gather( + *[ + self._executor._run_owner( + plan=plan, + owner=owner, + demand=request_limit, + base_next_page=base_np, + dedup_active=False, + ) + for owner, _owner_id, base_np, request_limit in wave_ops + ] + ) + + for (_owner, owner_id, 
_base_np, _request_limit), result in zip(wave_ops, results): + owner_state = state[owner_id] + remaining_before = int(owner_state["remaining"]) + + owner_state["current_next_page"] = result.next_page + owner_state["has_next_page"] = bool(result.has_next_page) + cursor.merge_delta(base_next_page=plan.next_page, owner_next_page=result.next_page) + + if remaining_before > 0: + owner_state["accepted"].extend(list(result.data)[:remaining_before]) + owner_state["remaining"] = int(owner_state["missing_total"]) - len(owner_state["accepted"]) + + if owner_state["remaining"] > 0 and owner_state["has_next_page"]: + owner_state["loops"] += 1 + + for refill_owner in deficit_owners: + refill_owner_id = id(refill_owner) + owner_state = state.get(refill_owner_id) + if owner_state is None: + continue + + accepted = owner_state["accepted"] + if accepted: + owner_buffers.setdefault(refill_owner_id, []) + owner_buffers[refill_owner_id].extend(accepted) + + owner_results[refill_owner_id] = FeedResult( + data=list(owner_buffers.get(refill_owner_id, [])), + next_page=owner_state["current_next_page"], + has_next_page=owner_state["has_next_page"], + ) diff --git a/smartfeed/execution/executor.py b/smartfeed/execution/executor.py index 11eb551..d6a92e8 100644 --- a/smartfeed/execution/executor.py +++ b/smartfeed/execution/executor.py @@ -131,6 +131,16 @@ async def _execute_slots_plan(self, plan: SlotsPlan) -> FeedResult: refill_settings=refill_settings, cursor=cursor, ) + elif refill_settings is not None: + owner_buffers, owner_results = await self._dedup_runtime().apply_slots_plan_refill( + plan=plan, + owners=owners, + owner_index=owner_index, + owner_buffers=owner_buffers, + owner_results=owner_results, + refill_settings=refill_settings, + cursor=cursor, + ) output = self._consume_slots(plan=plan, owner_buffers=owner_buffers) assembled = await self._maybe_await(plan.assemble(output, cursor.next_page, owner_results)) @@ -166,7 +176,9 @@ async def _run_owner( redis_client=plan.ctx.redis_client, executor=plan.ctx.executor, dedup=None, - refill_settings=None, + # Keep refill settings so nested slots plans can still compensate + # owner deficits while top-level dedup arbitration remains centralized. + refill_settings=plan.ctx.refill_settings, ) return await self.run(owner, owner_ctx, demand, isolated_next_page, **plan.params) diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 35fb3d1..bbeb011 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -149,6 +149,68 @@ async def test_dedup_deep_tree_cursor_backend() -> None: dh._assert_sources_at_positions(res_2.data, [1, 4], "P") +@pytest.mark.asyncio +async def test_dedup_nested_positional_refill_not_masked_by_parent_append() -> None: + """Nested positional refills must run even when parent append can fill the page. 
+ + Regression: + - parent dedup wrapper executes append owners with dedup disabled in owner ctx + - positional child under-fetches (`max_per_call=1`) and needs internal slot refills + - if those refills are skipped, append sibling backfills the page and positional slots are lost + """ + + items_default = dh.make_items("D", 1, 400, id_offset=1_000) + items_pos = dh.make_items("P", 1, 400, id_offset=10_000) + items_fill = dh.make_items("F", 1, 400, id_offset=20_000) + + pos_calls = {"count": 0} + pos_base = dh.make_offset_paged_method(items_pos, max_per_call=1) + + async def _pos_method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + pos_calls["count"] += 1 + return await pos_base(user_id, limit, next_page, **kwargs) + + methods_dict = { + "default": dh.make_offset_paged_method(items_default), + "pos": _pos_method, + "fill": dh.make_offset_paged_method(items_fill), + } + + config = dh._dedup_config( + "dedup_nested_refill", + dh._append_config( + "append_nested_refill", + [ + dh._positional_config( + "pos_nested_refill", + positions=[2, 4, 6, 8, 10, 12], + positional=dh._subfeed("sf_pos_nested", "pos"), + default=dh._subfeed("sf_default_nested", "default"), + ), + dh._subfeed("sf_fill_nested", "fill"), + ], + ), + dedup_key="id", + state_backend="cursor", + overfetch_factor=3, + max_refill_loops=50, + ) + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=12, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res.data) == 12 + dh._assert_no_dupes_in_page(res.data) + dh._assert_sources_at_positions(res.data, [2, 4, 6, 8, 10, 12], "P") + assert "F" not in set(dh._sources(res.data)) + assert pos_calls["count"] > 1 + + @pytest.mark.parametrize( "merger_type", [ From 051373a3b4213b59c4aba8a789b3ad67919672fc Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Mon, 9 Feb 2026 00:56:27 +0000 Subject: [PATCH 26/33] Minor bugfixes and formatting. 
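
Two rounding fixes in the percentage-based mergers:

- MergerPercentage used to floor every slot (limit * percentage // 100),
  which could under-fill the page (e.g. limit=10 with 33%/67% yielded
  3 + 6 = 9 items). When the percentages sum to 100, build_plan now hands
  the missing items to the slots with the largest remainders.
- MergerPercentageGradient now gives each step's flooring remainder to the
  "to" branch (to_take = limit - from_take) and records from_take/to_take,
  so assembly consumes exactly the counts that were fetched instead of
  flooring both sides again and dropping items.

A standalone sketch of the largest-remainder allocation (illustrative only;
the merger does this inline in build_plan):

    from typing import List

    def allocate(limit: int, percentages: List[int]) -> List[int]:
        raws = [limit * p for p in percentages]
        limits = [r // 100 for r in raws]
        if sum(percentages) == 100:
            missing = limit - sum(limits)
            # Hand the leftover items to the largest fractional remainders,
            # breaking ties by config order.
            order = sorted(range(len(raws)), key=lambda i: (-(raws[i] % 100), i))
            for i in order[:missing]:
                limits[i] += 1
        return limits

    assert allocate(10, [33, 67]) == [3, 7]
    assert allocate(10, [50, 50]) == [5, 5]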
--- smartfeed/mergers/append_distribute.py | 1 + smartfeed/mergers/percentage.py | 27 +++++++-- smartfeed/mergers/percentage_gradient.py | 22 ++++++-- tests/test_async_loop_blocks_trace.py | 22 ++++++-- tests/test_dedup_utils.py | 7 +-- tests/test_executor_slots_plan_invariants.py | 51 +++++++++++++++-- tests/test_merger_deduplication.py | 59 ++++++++++++++++++++ tests/test_merger_percentage.py | 18 ++++++ tests/test_merger_percentage_gradient.py | 18 ++++++ tests/test_seen_store_unit.py | 1 - 10 files changed, 201 insertions(+), 25 deletions(-) diff --git a/smartfeed/mergers/append_distribute.py b/smartfeed/mergers/append_distribute.py index 3ef891d..220e3e3 100644 --- a/smartfeed/mergers/append_distribute.py +++ b/smartfeed/mergers/append_distribute.py @@ -2,6 +2,7 @@ from collections import defaultdict, deque from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional + from typing_extensions import no_type_check from ..execution.context import ExecutionContext diff --git a/smartfeed/mergers/percentage.py b/smartfeed/mergers/percentage.py index a21fbd3..9ea2096 100644 --- a/smartfeed/mergers/percentage.py +++ b/smartfeed/mergers/percentage.py @@ -64,10 +64,29 @@ def build_plan( ) -> SlotsPlan: owners: List[BaseFeedConfigModel] = [cast(BaseFeedConfigModel, item.data) for item in self.items] - slots: List[SlotSpec] = [] - for item, owner in zip(self.items, owners): - child_limit = limit * int(item.percentage) // 100 - slots.append(SlotSpec(owner=owner, max_count=max(0, child_limit))) + slot_limits: List[int] = [] + remainders: List[tuple[int, int]] = [] + total_percentage = sum(int(item.percentage) for item in self.items) + + for idx, item in enumerate(self.items): + raw = int(limit) * int(item.percentage) + child_limit = raw // 100 + slot_limits.append(max(0, child_limit)) + remainders.append((raw % 100, idx)) + + # avoid underfilling for the common "percentages sum to 100" case + if total_percentage == 100: + missing = max(0, int(limit) - sum(slot_limits)) + if missing > 0: + for _rem, idx in sorted(remainders, key=lambda x: (-x[0], x[1])): + if missing <= 0: + break + slot_limits[idx] += 1 + missing -= 1 + + slots: List[SlotSpec] = [ + SlotSpec(owner=owner, max_count=max(0, int(slot_limits[idx]))) for idx, owner in enumerate(owners) + ] def _assemble( output: List[Any], diff --git a/smartfeed/mergers/percentage_gradient.py b/smartfeed/mergers/percentage_gradient.py index 1aea051..fb70891 100644 --- a/smartfeed/mergers/percentage_gradient.py +++ b/smartfeed/mergers/percentage_gradient.py @@ -56,10 +56,19 @@ def _calculate_limits_and_percents(self, page: int, limit: int) -> Dict: if result["percentages"] and result["percentages"][-1]["to"] >= 100: result["limit_to"] += iter_limit result["percentages"][-1]["limit"] += iter_limit + result["percentages"][-1]["to_take"] += iter_limit else: - result["limit_from"] += iter_limit * percentage_from // 100 - result["limit_to"] += iter_limit * percentage_to // 100 - iter_result = {"limit": iter_limit, "from": percentage_from, "to": percentage_to} + from_take = iter_limit * percentage_from // 100 + to_take = iter_limit - from_take + result["limit_from"] += from_take + result["limit_to"] += to_take + iter_result = { + "limit": iter_limit, + "from": percentage_from, + "to": percentage_to, + "from_take": from_take, + "to_take": to_take, + } result["percentages"].append(iter_result) if first_iter: @@ -110,8 +119,11 @@ def _assemble( from_start_index = 0 to_start_index = 0 for lp_data in limits_and_percents["percentages"]: - from_end_index = 
(lp_data["limit"] * lp_data["from"] // 100) + from_start_index - to_end_index = (lp_data["limit"] * lp_data["to"] // 100) + to_start_index + from_take = int(lp_data.get("from_take", lp_data["limit"] * lp_data["from"] // 100)) + to_take = int(lp_data.get("to_take", lp_data["limit"] - from_take)) + + from_end_index = from_start_index + from_take + to_end_index = to_start_index + to_take data.extend(from_data[from_start_index:from_end_index]) data.extend(to_data[to_start_index:to_end_index]) diff --git a/tests/test_async_loop_blocks_trace.py b/tests/test_async_loop_blocks_trace.py index 2dc8def..a1bafd9 100644 --- a/tests/test_async_loop_blocks_trace.py +++ b/tests/test_async_loop_blocks_trace.py @@ -54,7 +54,9 @@ def end(self, name: str, *, tid: int, ts_us: Optional[int] = None, args: Optiona } ) - def instant(self, name: str, *, tid: int, ts_us: Optional[int] = None, args: Optional[Dict[str, Any]] = None) -> None: + def instant( + self, name: str, *, tid: int, ts_us: Optional[int] = None, args: Optional[Dict[str, Any]] = None + ) -> None: self._emit( { "name": name, @@ -132,7 +134,9 @@ async def exit(self) -> int: return self.current -def _trace_wrap_awaitable(rec: ChromeTraceRecorder, name: str, awaitable: Awaitable[Any], *, args: Dict[str, Any]) -> Awaitable[Any]: +def _trace_wrap_awaitable( + rec: ChromeTraceRecorder, name: str, awaitable: Awaitable[Any], *, args: Dict[str, Any] +) -> Awaitable[Any]: async def _wrapped() -> Any: task = asyncio.current_task() tid = id(task) if task is not None else 0 @@ -209,7 +213,7 @@ async def test_async_loop_blocks_and_trace_for_deep_tree_all_mergers(redis_clien - Builds one deep tree that includes ALL merger types. - Simulates 2 sequential requests (fresh + next page). - - Forces refills by creating lots of cross-branch duplicates. + - Forces refills via positional under-fetch (`max_per_call=1`). - Records loop scheduling lag (blocks/hangs) and optionally exports a Chrome trace. Set `SMARTFEED_CHROME_TRACE=/path/to/trace.json` to write a trace. @@ -238,6 +242,13 @@ async def test_async_loop_blocks_and_trace_for_deep_tree_all_mergers(redis_clien # --- tracing (test-only monkeypatch) --- rec = ChromeTraceRecorder() leaf_concurrency = LeafConcurrencyTracker() + pos_leaf_calls = {"count": 0} + + pos_leaf_base = dh.make_offset_paged_method(items_pos_leaf, max_per_call=1) + + async def _pos_leaf_counted(user_id: Any, limit: int, next_page: Any, **kwargs: Any) -> Any: + pos_leaf_calls["count"] += 1 + return await pos_leaf_base(user_id, limit, next_page, **kwargs) # Leaf method tracing: wrap the *actual* subfeed method calls. # These spans are what you want to inspect for "are leaf calls parallel?". @@ -296,7 +307,7 @@ async def test_async_loop_blocks_and_trace_for_deep_tree_all_mergers(redis_clien "pos_leaf": _wrap_leaf_method_traced( rec=rec, key="pos_leaf", - method=dh.make_offset_paged_method(items_pos_leaf, max_per_call=1), + method=_pos_leaf_counted, latency_s=leaf_latency_s, concurrency=leaf_concurrency, ), @@ -436,6 +447,9 @@ async def _run_traced( # Hard assertion: leaf calls must overlap (async concurrency), not serialize. assert leaf_concurrency.peak > 1 + # Refill signal: with max_per_call=1, two page requests should trigger + # multiple extra positional calls to satisfy positional slots. + assert pos_leaf_calls["count"] > 2 # Primary signal: event-loop should remain responsive under load. 
assert monitor.max_lag_s < 0.1 diff --git a/tests/test_dedup_utils.py b/tests/test_dedup_utils.py index 9556af3..06150ad 100644 --- a/tests/test_dedup_utils.py +++ b/tests/test_dedup_utils.py @@ -3,12 +3,7 @@ import pytest from smartfeed.feed_models import _redis_call -from smartfeed.policies.dedup_utils import ( - decode_seen_from_cursor, - encode_seen_for_cursor, - redis_zmscore, -) - +from smartfeed.policies.dedup_utils import decode_seen_from_cursor, encode_seen_for_cursor, redis_zmscore from tests.fixtures.redis import redis_client diff --git a/tests/test_executor_slots_plan_invariants.py b/tests/test_executor_slots_plan_invariants.py index 74f8540..4b774b4 100644 --- a/tests/test_executor_slots_plan_invariants.py +++ b/tests/test_executor_slots_plan_invariants.py @@ -1,6 +1,6 @@ import pytest -from smartfeed.execution.context import ExecutionContext +from smartfeed.execution.context import ExecutionContext, RefillExecutionSettings from smartfeed.execution.executor import Executor from smartfeed.execution.plans import SlotSpec, SlotsPlan from smartfeed.feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage, FeedResultNextPageInside @@ -186,10 +186,12 @@ def assemble(output, next_page, owner_results): plan = SlotsPlan( ctx=ctx, limit=6, - next_page=FeedResultNextPage(data={ - "a": FeedResultNextPageInside(page=1, after=0), - "b": FeedResultNextPageInside(page=1, after=0), - }), + next_page=FeedResultNextPage( + data={ + "a": FeedResultNextPageInside(page=1, after=0), + "b": FeedResultNextPageInside(page=1, after=0), + } + ), params={}, slots=[ SlotSpec(owner=a, max_count=3), @@ -254,3 +256,42 @@ def assemble(output, next_page, owner_results): {"id": "b_2"}, {"id": "b_3"}, ] + + +@pytest.mark.asyncio +async def test_slots_plan_quota_deficit_refills_without_dedup_when_refill_settings_present() -> None: + executor = Executor() + ctx = ExecutionContext(methods_dict={}, user_id="u", executor=executor) + ctx.refill_settings = RefillExecutionSettings(overfetch_factor=3, max_refill_loops=10) + + a = _PagedOwner(subfeed_id="a", total=10) + b = _PagedOwner(subfeed_id="b", total=10) + + def assemble(output, next_page, owner_results): + return FeedResult(data=output, next_page=next_page, has_next_page=False) + + plan = SlotsPlan( + ctx=ctx, + limit=6, + next_page=FeedResultNextPage( + data={ + "a": FeedResultNextPageInside(page=1, after=0), + "b": FeedResultNextPageInside(page=1, after=0), + } + ), + params={}, + slots=[ + SlotSpec(owner=a, max_count=3), + SlotSpec(owner=b, max_count=3), + ], + # force an initial under-fetch for owner a. + owner_fetch_limits={id(a): 1}, + assemble=assemble, + ) + + res = await executor.execute_plan(plan) + + # refill must still happen even when dedup policy is absent. 
+ assert getattr(a, "calls") >= 2 + assert res.data[:3] == [{"id": "a_1"}, {"id": "a_2"}, {"id": "a_3"}] + assert res.data[3:] == [{"id": "b_1"}, {"id": "b_2"}, {"id": "b_3"}] diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index bbeb011..101eaac 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -211,6 +211,65 @@ async def _pos_method(user_id, limit, next_page, **kwargs): # pylint: disable=u assert pos_calls["count"] > 1 +@pytest.mark.asyncio +async def test_dedup_nested_percentage_refill_not_masked_by_parent_append() -> None: + """Nested percentage refills must run even when parent append can fill.""" + + items_a = dh.make_items("A", 1, 400, id_offset=1_000) + items_b = dh.make_items("B", 1, 400, id_offset=10_000) + items_fill = dh.make_items("F", 1, 400, id_offset=20_000) + + b_calls = {"count": 0} + b_base = dh.make_offset_paged_method(items_b, max_per_call=1) + + async def _b_method(user_id, limit, next_page, **kwargs): # pylint: disable=unused-argument + b_calls["count"] += 1 + return await b_base(user_id, limit, next_page, **kwargs) + + methods_dict = { + "a": dh.make_offset_paged_method(items_a), + "b": _b_method, + "fill": dh.make_offset_paged_method(items_fill), + } + + percentage_cfg = dh._percentage_config( + "pct_nested_refill", + items=dh._percentage_items( + dh._subfeed("sf_a_nested", "a"), + dh._subfeed("sf_b_nested", "b"), + first_pct=50, + second_pct=50, + ), + ) + + config = dh._dedup_config( + "dedup_nested_pct_refill", + dh._append_config( + "append_nested_pct_refill", + [percentage_cfg, dh._subfeed("sf_fill_nested_pct", "fill")], + ), + dedup_key="id", + state_backend="cursor", + overfetch_factor=3, + max_refill_loops=50, + ) + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=12, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res.data) == 12 + dh._assert_no_dupes_in_page(res.data) + assert "F" not in set(dh._sources(res.data)) + assert dh._sources(res.data).count("A") == 6 + assert dh._sources(res.data).count("B") == 6 + assert b_calls["count"] > 1 + + @pytest.mark.parametrize( "merger_type", [ diff --git a/tests/test_merger_percentage.py b/tests/test_merger_percentage.py index ae57f00..328dc39 100644 --- a/tests/test_merger_percentage.py +++ b/tests/test_merger_percentage.py @@ -52,3 +52,21 @@ async def test_merger_percentage_when_one_leaf_is_empty() -> None: assert res.data == ["x_4", "x_5", "x_6", "x_7"] # The non-empty leaf still reports more pages. 
assert res.has_next_page is True + + +@pytest.mark.asyncio +async def test_merger_percentage_odd_limit_fills_page_when_sources_have_data() -> None: + merger_percentage = parse_model(MergerPercentage, MERGER_PERCENTAGE_CONFIG) + res = await merger_percentage.get_data( + methods_dict=METHODS_DICT, + limit=11, + next_page=FeedResultNextPage( + data={ + "subfeed_merger_percentage_example": FeedResultNextPageInside(page=2, after="x_3"), + "subfeed_2_merger_percentage_example": FeedResultNextPageInside(page=3, after="x_20"), + } + ), + user_id="x", + ) + + assert len(res.data) == 11 diff --git a/tests/test_merger_percentage_gradient.py b/tests/test_merger_percentage_gradient.py index 73bc769..e2c6607 100644 --- a/tests/test_merger_percentage_gradient.py +++ b/tests/test_merger_percentage_gradient.py @@ -71,3 +71,21 @@ async def test_merger_percentage_gradient_next_page() -> None: "x_22", "x_23", ] + + +@pytest.mark.asyncio +async def test_merger_percentage_gradient_odd_limit_fills_page_when_sources_have_data() -> None: + merger_percentage_gradient = parse_model(MergerPercentageGradient, MERGER_PERCENTAGE_GRADIENT_CONFIG) + res = await merger_percentage_gradient.get_data( + methods_dict=METHODS_DICT, + limit=11, + next_page=FeedResultNextPage( + data={ + "subfeed_from_merger_percentage_gradient_example": FeedResultNextPageInside(page=2, after="x_3"), + "subfeed_to_merger_percentage_gradient_example": FeedResultNextPageInside(page=3, after="x_20"), + } + ), + user_id="x", + ) + + assert len(res.data) == 11 diff --git a/tests/test_seen_store_unit.py b/tests/test_seen_store_unit.py index f70661f..fc7f6e7 100644 --- a/tests/test_seen_store_unit.py +++ b/tests/test_seen_store_unit.py @@ -5,7 +5,6 @@ from smartfeed.feed_models import _redis_call from smartfeed.policies.dedup_utils import decode_seen_from_cursor from smartfeed.policies.seen_store import CursorSeenStore, RedisSeenStore - from tests.fixtures.redis import redis_client From ec63967f46713bf8c3602e9122d586e1228f9737 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Mon, 9 Feb 2026 11:11:26 +0000 Subject: [PATCH 27/33] Readme added. --- ARCHITECTURE.md | 238 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 ARCHITECTURE.md diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..a9ba69c --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,238 @@ +# SmartFeed Architecture (medium-brief) + +## 1) What SmartFeed does + +SmartFeed builds one paginated feed from multiple client-provided sources (“subfeeds”) using a declarative tree config: + +- **Leaf**: `SubFeed` (calls one client method) +- **Mergers**: compose children (`append`, `distribute`, `positional`, `percentage`, `percentage_gradient`, `view_session`) +- **Wrapper**: `MergerDeduplication` (changes execution semantics around one child) + +Core runtime: + +- parse config -> create request `ExecutionContext` -> run tree via shared `Executor` -> return `FeedResult` + `next_page`. + + +## 2) Public surfaces and core data types + +### Public entrypoint + +- `FeedManager(config, methods_dict, redis_client=None)` + - `get_data(user_id, limit, next_page, **params) -> FeedResult` + +`methods_dict` maps config `method_name` strings to host-app callables. 
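+
+A minimal end-to-end sketch (illustrative only: the import paths and `version` value
+are assumptions, and the subfeed config and client method are placeholders):
+
+```python
+import asyncio
+
+from smartfeed.manager import FeedManager
+from smartfeed.schemas import FeedResultClient, FeedResultNextPage, FeedResultNextPageInside
+
+
+async def latest_posts(user_id: str, limit: int, next_page: FeedResultNextPageInside) -> FeedResultClient:
+    # host-app data access goes here; this stub returns a single short page
+    return FeedResultClient(data=[f"{user_id}_post_1"], next_page=next_page, has_next_page=False)
+
+
+config = {
+    "version": "1",  # placeholder value
+    "feed": {"subfeed_id": "posts", "type": "subfeed", "method_name": "latest_posts"},
+}
+
+
+async def main() -> None:
+    manager = FeedManager(config=config, methods_dict={"latest_posts": latest_posts})
+    page = await manager.get_data(user_id="u1", limit=10, next_page=FeedResultNextPage(data={}))
+    print(page.data, page.has_next_page)
+
+
+asyncio.run(main())
+```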
+ +### Config schema surface + +`smartfeed.schemas` keeps stable imports for: + +- `FeedConfig`: top-level model (`version`, `feed`) +- `FeedTypes`: discriminated union by `type` + +### Cursor / pagination models + +- `FeedResultNextPageInside`: one node cursor (`page`, `after`) +- `FeedResultNextPage`: full-tree cursor map (`data: {node_id -> FeedResultNextPageInside}`) + +### Result models + +- `FeedResultClient`: required return type of client subfeed methods +- `FeedResult`: normalized return type of any SmartFeed node + + +## 3) Node interface contract + +All nodes inherit `BaseFeedConfigModel` and are executed through: + +- `get_data(methods_dict, user_id, limit, next_page, redis_client=None, ctx=None, **params) -> FeedResult` + +Important notes: + +- If a node implements `build_plan(...)`, executor uses the plan path. +- Base `get_data(...)` delegates back to executor and expects `build_plan(...)` to exist. +- Every node has `dedup_priority: int` (used by dedup arbitration/refill ordering). + + +## 4) ExecutionContext + +`ExecutionContext` is per-request state propagated through the tree: + +- `methods_dict`, `user_id`, `redis_client` +- `executor` (lazy via `ensure_executor()`) +- optional policy/settings: + - `dedup`: `DeduplicationPolicy` when dedup wrapper is active + - `refill_settings`: `RefillExecutionSettings(overfetch_factor, max_refill_loops)` + +Responsibilities: + +- centralize shared plumbing (executor + redis client) +- keep execution policies out of user params + + +## 5) Executor (runtime engine) + +Primary entry: + +- `Executor.run(node, ctx, limit, next_page, **params) -> FeedResult` + +Execution strategy: + +1. **Plan-first** + - `build_plan(...)` -> execute returned `Plan` + - otherwise call node `get_data(...)` +2. **Centralized concurrency** + - child runs use executor-managed `asyncio.gather(...)` +3. **Dedup/refill hooks** + - for non-slot nodes with `ctx.dedup`, run `DedupRuntime.run_node_with_dedup_refill(...)` + - for `SlotsPlan`, dedup/refill is handled inside slot execution + +`SlotsPlan` execution highlights: + +1. collect unique owners + demand per owner +2. fetch owners concurrently (with optional `owner_fetch_limits` overrides) +3. merge only changed cursor keys (`CursorMap.merge_delta`) +4. apply: + - dedup arbitration + refill (`apply_slots_plan_dedup`) when `ctx.dedup` exists + - refill-only deficits (`apply_slots_plan_refill`) when only `ctx.refill_settings` exists +5. consume slot schedule and call `assemble(...)` + +When dedup is active for a slots plan, owners are executed with `dedup=None` in owner context so global arbitration stays centralized. + + +## 6) Plans: declarative execution + +Plans separate “what to run” from “how to run it”. 
+ +- `CallablePlan(fn)` + - node-provided async function with custom flow, still executed by executor + +- `SlotsPlan(ctx, limit, next_page, params, slots, assemble, owner_fetch_limits=None)` + - `slots`: ordered `SlotSpec(owner, max_count)` schedule + - `assemble(output, merged_next_page, owner_results)`: builds final `FeedResult` + + +## 7) Mergers and leaf responsibilities + +### SubFeed (leaf) + +- derives its local cursor from `next_page.data[subfeed_id]` (defaults page=1/after=None) +- calls `methods_dict[method_name]` +- passes only params present in method signature + `subfeed_params` +- async methods are awaited; sync methods run via `asyncio.to_thread(...)` +- `raise_error=False` converts method failure into empty `FeedResultClient` +- optional `shuffle` then normalizes to `FeedResult` + +### Slot-based mergers + +These build `SlotsPlan`: + +- `MergerAppend`: concatenation (optional shuffle) +- `MergerAppendDistribute` (`type="merger_distribute"`): append then redistribute by `distribution_key` +- `MergerPositional`: page-local slot ownership for `positional` vs `default`, keeps its own merger cursor +- `MergerPercentage`: integer allocation by percentages; when total is exactly 100, remainder is distributed to avoid underfill +- `MergerPercentageGradient`: two-owner percentage curve across the page, then advances merger page cursor + +### MergerViewSession (Redis-backed session cache) + +Goal: cache a session-sized list and serve slices. + +Flow: + +1. build cache key: `{merger_id}_{user_id}` + optional suffix from `custom_view_session_key` +2. check Redis `exists`; if no cache or no merger cursor in request -> regenerate session +3. on hit, `get`; if Redis returns `None` unexpectedly, regenerate +4. on generation: execute child once for `session_size`, optional dedup, store JSON with TTL +5. return page slice and increment merger cursor page +6. optional `shuffle` is applied to returned page slice (cache payload is not reshuffled) + +### MergerDeduplication (single-child wrapper) + +Goal: deduplicate while keeping child mix/slot semantics. 
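+
+An illustrative wrapper config (field names follow the test fixtures; treat
+`state_backend` and `overfetch_factor` as assumed top-level keys):
+
+```python
+dedup_config = {
+    "merger_id": "dedup_main",
+    "type": "merger_deduplication",
+    "dedup_key": "id",           # entity key used for seen-state
+    "state_backend": "cursor",   # or "redis" (ZSET dedup:{merger_id}:{user_id})
+    "overfetch_factor": 3,
+    "max_refill_loops": 10,
+    "data": {                    # exactly one child subtree
+        "merger_id": "mix",
+        "type": "merger_append",
+        "items": [
+            {"subfeed_id": "promo", "type": "subfeed", "method_name": "promo"},
+            {"subfeed_id": "organic", "type": "subfeed", "method_name": "organic"},
+        ],
+    },
+}
+```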
+ +Key behavior: + +- fresh session when merger cursor is absent or `page <= 0` + - reset descendant cursors + - for Redis backend, reset Redis seen-state key +- seen-state backend: + - `cursor`: encoded into merger cursor `after` + - `redis`: ZSET `dedup:{merger_id}:{user_id}` (+ optional custom suffix) +- builds `DeduplicationPolicy` + child `ExecutionContext(dedup=..., refill_settings=...)` +- executes child via shared executor, commits store, writes merger cursor (`page+1`, `after` for cursor backend) + +Refill/overfetch behavior: + +- duplicates trigger bounded refill loops (`max_refill_loops`) +- overfetch (`overfetch_factor`) is applied only for rewindable integer-offset cursors +- when overfetch is used, leaf cursor is rewound to inspected-count to avoid skipping unseen items + + +## 8) Dedup policy + seen stores + +### DeduplicationPolicy + +Owns key extraction + acceptance rules: + +- entity key from `dedup_key` + `missing_key_policy` +- reject duplicates already seen in current response (`seen_request_set`) +- compare candidate priority vs persisted seen priority + +Capabilities: + +- batched prefetch from store +- per-owner arbitration with deterministic tie-break: `(-dedup_priority, owner_rank, item_rank)` +- ordered single-stream acceptance (`accept_batch`) returning accepted items + inspected count + +### Seen stores + +- `CursorSeenStore` + - in-cursor map of `{key -> max_priority}` + - optional compression + max-key trimming at commit + +- `RedisSeenStore` + - cached reads via `redis_zmscore(...)` + - buffered writes via `redis_zadd_and_expire(...)` + + +## 9) Redis/JSON helpers + +- `_redis_call(client, method_name, *args, **kwargs)` + - async redis client: direct await + - sync redis client: `asyncio.to_thread(...)` + +Other helpers: + +- `jsonlib`: thin `orjson` wrapper compatible with package usage (`dumps`/`loads`) +- `dedup_utils`: cursor encode/decode + Redis ZSET helper fallbacks (`zmscore` / pipeline) + + +## 10) End-to-end call flows + +### A) Standard request (no view session, no dedup) + +1. `FeedManager.get_data(...)` builds `ExecutionContext` +2. `Executor.run(root, ctx, limit, next_page)` +3. recursive execution via plans or direct `get_data(...)` +4. returns `FeedResult(data, next_page, has_next_page)` + +### B) Slot-based merger request + +1. merger returns `SlotsPlan` +2. executor fetches owners concurrently +3. optional arbitration/refill runs +4. slots are consumed in schedule order +5. `assemble(...)` builds final result + +### C) Dedup wrapper request + +1. wrapper creates store + policy and child context +2. child executes under dedup/refill control +3. executor performs acceptance/arbitration + bounded refills +4. store commits; wrapper writes merger cursor state + +### D) View-session request + +1. wrapper resolves cache key +2. cache miss/new session -> regenerate and cache +3. cache hit -> load session list from Redis +4. 
return requested slice + advanced merger page From 9ceabad96c35d086ee53ceb5589cccf4614a899a Mon Sep 17 00:00:00 2001 From: Shakirov Renat Date: Thu, 12 Mar 2026 20:24:55 +0300 Subject: [PATCH 28/33] bugfix --- smartfeed/mergers/view_session.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index dad3314..3bab829 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -58,7 +58,17 @@ async def _set_cache( if ctx.executor is None: raise ValueError("Executor must be initialized for MergerViewSession") - result = await ctx.executor.run(self.data, ctx, self.session_size, FeedResultNextPage(data={}), **params) + # Strip dedup from context to avoid marking items as "seen" during cache build. + # Items will be deduped later when returned page-by-page through the parent merger. + cache_ctx = ExecutionContext( + methods_dict=ctx.methods_dict, + user_id=ctx.user_id, + redis_client=ctx.redis_client, + executor=ctx.executor, + dedup=None, + refill_settings=None, + ) + result = await ctx.executor.run(self.data, cache_ctx, self.session_size, FeedResultNextPage(data={}), **params) data = result.data if self.deduplicate: From 92a628ad837f06bd479a7ececc239ddf9ff3bb2f Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Wed, 18 Mar 2026 12:49:08 +0000 Subject: [PATCH 29/33] State fix dedup+view_session stacked. --- smartfeed/mergers/view_session.py | 11 +++++------ smartfeed/policies/dedup.py | 21 +++++++++++++++++++++ tests/fixtures/mergers.py | 17 +++++++++++++++++ tests/test_merger_deduplication.py | 1 + tests/test_merger_view_session.py | 25 +++++++++++++++++++++++-- 5 files changed, 67 insertions(+), 8 deletions(-) diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index 3bab829..857e69a 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -58,17 +58,16 @@ async def _set_cache( if ctx.executor is None: raise ValueError("Executor must be initialized for MergerViewSession") - # Strip dedup from context to avoid marking items as "seen" during cache build. - # Items will be deduped later when returned page-by-page through the parent merger. - cache_ctx = ExecutionContext( + inner_dedup = ctx.dedup.create_isolated() if ctx.dedup is not None else None + inner_ctx = ExecutionContext( methods_dict=ctx.methods_dict, user_id=ctx.user_id, redis_client=ctx.redis_client, executor=ctx.executor, - dedup=None, - refill_settings=None, + dedup=inner_dedup, + refill_settings=ctx.refill_settings if inner_dedup is not None else None, ) - result = await ctx.executor.run(self.data, cache_ctx, self.session_size, FeedResultNextPage(data={}), **params) + result = await ctx.executor.run(self.data, inner_ctx, self.session_size, FeedResultNextPage(data={}), **params) data = result.data if self.deduplicate: diff --git a/smartfeed/policies/dedup.py b/smartfeed/policies/dedup.py index b040c47..e744931 100644 --- a/smartfeed/policies/dedup.py +++ b/smartfeed/policies/dedup.py @@ -92,6 +92,27 @@ def record(self, key: str, priority: int) -> None: self.seen_request_set.add(key) self.store.set_max(key, priority) + def create_isolated(self) -> "DeduplicationPolicy": + """Create an isolated copy for inner execution contexts. + + Uses a fresh in-memory store and empty seen_request_set so that + inner priority arbitration works without contaminating the caller's + dedup state. 
+ """ + from .seen_store import CursorSeenStore + + return DeduplicationPolicy( + dedup_key=self.dedup_key, + missing_key_policy=self.missing_key_policy, + store=CursorSeenStore( + cursor_compress=True, + cursor_max_keys=None, + seen_priority_map={}, + seen_order=[], + ), + seen_request_set=set(), + ) + async def arbitrate_owner_buffers( self, *, diff --git a/tests/fixtures/mergers.py b/tests/fixtures/mergers.py index c6bbf50..41c2150 100644 --- a/tests/fixtures/mergers.py +++ b/tests/fixtures/mergers.py @@ -136,3 +136,20 @@ "method_name": "doubles", }, } + +MERGER_DEDUP_VIEW_SESSION_CONFIG = { + "merger_id": "dedup_over_session", + "type": "merger_deduplication", + "dedup_key": None, + "data": { + "merger_id": "inner_session", + "type": "merger_view_session", + "session_size": 60, + "session_live_time": 300, + "data": { + "subfeed_id": "subfeed_dedup_vs", + "type": "subfeed", + "method_name": "followings", + }, + }, +} diff --git a/tests/test_merger_deduplication.py b/tests/test_merger_deduplication.py index 101eaac..f0e57b7 100644 --- a/tests/test_merger_deduplication.py +++ b/tests/test_merger_deduplication.py @@ -868,6 +868,7 @@ async def test_dedup_wrapper_with_view_session_merger(redis_client) -> None: custom_view_session_key="vs1", ) + assert len(res_1.data) == 10 dh._assert_two_pages_no_dupes(res_1, res_2) # Deletion priority: for the overlapping early ids, the winning entity must be from high. diff --git a/tests/test_merger_view_session.py b/tests/test_merger_view_session.py index f62a096..4e41536 100644 --- a/tests/test_merger_view_session.py +++ b/tests/test_merger_view_session.py @@ -3,9 +3,13 @@ import pytest from smartfeed.feed_models import _redis_call -from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerViewSession +from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerDeduplication, MergerViewSession from tests.fixtures.configs import METHODS_DICT -from tests.fixtures.mergers import MERGER_VIEW_SESSION_CONFIG, MERGER_VIEW_SESSION_DUPS_CONFIG +from tests.fixtures.mergers import ( + MERGER_DEDUP_VIEW_SESSION_CONFIG, + MERGER_VIEW_SESSION_CONFIG, + MERGER_VIEW_SESSION_DUPS_CONFIG, +) from tests.fixtures.redis import redis_client from tests.utils import parse_model @@ -116,3 +120,20 @@ async def test_merger_view_session_deduplication(redis_client) -> None: assert merger_vs_res.data == [i for i in range(1, 11)] assert len(merger_vs_cache) == merger_vs.session_size assert merger_vs_cache[:10] == merger_vs_res.data + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_dedup_directly_over_view_session(redis_client) -> None: + """Regression: dedup context must not leak into view session cache generation.""" + merger = parse_model(MergerDeduplication, MERGER_DEDUP_VIEW_SESSION_CONFIG) + result = await merger.get_data( + methods_dict=METHODS_DICT, + limit=20, + next_page=FeedResultNextPage(data={}), + user_id="x", + redis_client=redis_client, + ) + # Currently returns 0 items — all rejected as "seen" during cache build + assert len(result.data) == 20 + assert result.data == [f"x_{i}" for i in range(1, 21)] From 64de66963e08001e8e0a01e414dd1c54a3aedd4e Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Wed, 18 Mar 2026 13:29:05 +0000 Subject: [PATCH 30/33] Bugfix. 
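
MergerViewSession: store the child's continuation cursor next to the cached session
(under "<cache_key>:meta") and, when a request pages past the cached data while the
child still reports has_next_page, rebuild the session from that cursor instead of
ending pagination at session_size. The rebuild check in _get_cache is equivalent to
this sketch:

    def should_rebuild(page: int, limit: int, session_len: int,
                       child_has_next: bool, has_child_cursor: bool) -> bool:
        # the requested slice starts past the cached session and the child can continue
        return (page - 1) * limit >= session_len and child_has_next and has_child_cursor
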
--- smartfeed/mergers/view_session.py | 47 +++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index 857e69a..9ae0e01 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -53,8 +53,9 @@ async def _set_cache( redis_client: Union[redis.Redis, AsyncRedis], cache_key: str, ctx: ExecutionContext, + child_next_page: Optional[FeedResultNextPage] = None, **params: Any, - ) -> List[Any]: + ) -> tuple[List[Any], bool, Optional[FeedResultNextPage]]: if ctx.executor is None: raise ValueError("Executor must be initialized for MergerViewSession") @@ -67,13 +68,22 @@ async def _set_cache( dedup=inner_dedup, refill_settings=ctx.refill_settings if inner_dedup is not None else None, ) - result = await ctx.executor.run(self.data, inner_ctx, self.session_size, FeedResultNextPage(data={}), **params) + start_cursor = child_next_page if child_next_page is not None else FeedResultNextPage(data={}) + result = await ctx.executor.run(self.data, inner_ctx, self.session_size, start_cursor, **params) data = result.data if self.deduplicate: data = self._dedup_data(data) await _redis_call(redis_client, "set", cache_key, json.dumps(data), ex=self.session_live_time) - return data + + meta = { + "child_has_next": result.has_next_page, + "child_cursor": result.next_page.model_dump(), + } + await _redis_call(redis_client, "set", f"{cache_key}:meta", json.dumps(meta), ex=self.session_live_time) + + child_cursor = result.next_page if result.has_next_page else None + return data, result.has_next_page, child_cursor async def _get_cache( self, @@ -90,10 +100,14 @@ async def _get_cache( ) logging.info("MergerViewSession cache request for %s", cache_key) + + child_has_next = False + child_cursor: Optional[FeedResultNextPage] = None + cache_exists = bool(await _redis_call(redis_client, "exists", cache_key)) if not cache_exists or self.merger_id not in next_page.data: logging.info("Cache miss or new session - generating fresh data for %s", cache_key) - session_data = await self._set_cache( + session_data, child_has_next, child_cursor = await self._set_cache( redis_client=redis_client, cache_key=cache_key, ctx=ctx, @@ -106,7 +120,7 @@ async def _get_cache( logging.info( "Redis returned None for %s - falling back to fresh data (cluster replication issue)", cache_key ) - session_data = await self._set_cache( + session_data, child_has_next, child_cursor = await self._set_cache( redis_client=redis_client, cache_key=cache_key, ctx=ctx, @@ -116,11 +130,32 @@ async def _get_cache( logging.info("Successfully read cached data for %s", cache_key) session_data = json.loads(cached_data) + meta_raw = await _redis_call(redis_client, "get", f"{cache_key}:meta") + if meta_raw: + meta = json.loads(meta_raw) + child_has_next = meta.get("child_has_next", False) + child_cursor_data = meta.get("child_cursor") + if child_cursor_data: + child_cursor = FeedResultNextPage.model_validate(child_cursor_data) + page = next_page.data[self.merger_id].page if self.merger_id in next_page.data else 1 + + # Session exhausted but child has more data -- rebuild from continuation cursor + if (page - 1) * limit >= len(session_data) and child_has_next and child_cursor is not None: + logging.info("Session exhausted at page %d for %s - rebuilding with child cursor", page, cache_key) + session_data, child_has_next, child_cursor = await self._set_cache( + redis_client=redis_client, + cache_key=cache_key, + ctx=ctx, + 
child_next_page=child_cursor, + **params, + ) + page = 1 + return FeedResult( data=session_data[(page - 1) * limit :][:limit], next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), - has_next_page=bool(len(session_data) > limit * page), + has_next_page=bool(len(session_data) > limit * page or child_has_next), ) def build_plan( From 951aff2fb294d9e26fc9e85c0c1c0b816b2efd10 Mon Sep 17 00:00:00 2001 From: Pavel Kochetov Date: Wed, 18 Mar 2026 13:53:21 +0000 Subject: [PATCH 31/33] Lint fixes. --- pyproject.toml | 3 +++ smartfeed/execution/context.py | 9 +++++++-- smartfeed/execution/plans.py | 5 +---- smartfeed/feed_models.py | 2 +- smartfeed/mergers/view_session.py | 2 +- smartfeed/schemas.py | 2 +- tests/test_dedup_utils.py | 6 +++--- tests/test_executor_slots_plan_invariants.py | 10 +++++----- tests/test_merger_append.py | 2 +- tests/test_merger_percentage.py | 2 +- tests/test_merger_positional.py | 2 +- tests/test_merger_view_session.py | 2 +- tests/test_redis_live.py | 2 -- tests/test_seen_store_unit.py | 3 +-- tests/test_view_session_unit.py | 5 ++--- 15 files changed, 29 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2c4d669..f474b19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,9 @@ strict_optional = true disallow_any_expr = false python_version = "3.10" +[tool.ruff.lint.per-file-ignores] +"tests/**" = ["F811"] + [tool.pytest.ini_options] testpaths = ["tests"] python_files = ["tests.py", "test_*.py", "*_test.py"] diff --git a/smartfeed/execution/context.py b/smartfeed/execution/context.py index 18fc769..20d213f 100644 --- a/smartfeed/execution/context.py +++ b/smartfeed/execution/context.py @@ -1,11 +1,16 @@ from __future__ import annotations +from __future__ import annotations + from dataclasses import dataclass -from typing import Any, Callable, Dict, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union import redis from redis.asyncio import Redis as AsyncRedis +if TYPE_CHECKING: + from ..policies.dedup import DeduplicationPolicy + @dataclass class ExecutionContext: @@ -22,7 +27,7 @@ class ExecutionContext: executor: Any = None # Policies (optional) - dedup: Optional[object] = None + dedup: Optional[DeduplicationPolicy] = None # Execution settings (optional) refill_settings: Optional["RefillExecutionSettings"] = None diff --git a/smartfeed/execution/plans.py b/smartfeed/execution/plans.py index 8a23a29..9824c86 100644 --- a/smartfeed/execution/plans.py +++ b/smartfeed/execution/plans.py @@ -1,7 +1,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, Awaitable, Callable, Dict, List, Optional, Protocol +from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Optional, Protocol from ..feed_models import BaseFeedConfigModel, FeedResult, FeedResultNextPage from .context import ExecutionContext @@ -53,8 +53,5 @@ class SlotsPlan: owner_fetch_limits: Optional[Dict[int, int]] = None -# NOTE: `Executor` is imported only for typing to avoid an import cycle. 
-from typing import TYPE_CHECKING - if TYPE_CHECKING: from .executor import Executor diff --git a/smartfeed/feed_models.py b/smartfeed/feed_models.py index 38d26d6..4227253 100644 --- a/smartfeed/feed_models.py +++ b/smartfeed/feed_models.py @@ -2,7 +2,7 @@ import inspect from dataclasses import dataclass from random import shuffle -from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Literal, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Union, cast import redis from pydantic import BaseModel diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index 9ae0e01..e29ed22 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -2,7 +2,7 @@ import logging from random import shuffle -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union import redis from redis.asyncio import Redis as AsyncRedis diff --git a/smartfeed/schemas.py b/smartfeed/schemas.py index 9863bb4..298cde7 100644 --- a/smartfeed/schemas.py +++ b/smartfeed/schemas.py @@ -6,7 +6,7 @@ from __future__ import annotations -from typing import Annotated, Any, Dict, Union +from typing import Annotated, Any, Union from pydantic import BaseModel, Field diff --git a/tests/test_dedup_utils.py b/tests/test_dedup_utils.py index 06150ad..1805516 100644 --- a/tests/test_dedup_utils.py +++ b/tests/test_dedup_utils.py @@ -1,10 +1,10 @@ -from typing import Any, List +from typing import Any import pytest from smartfeed.feed_models import _redis_call from smartfeed.policies.dedup_utils import decode_seen_from_cursor, encode_seen_for_cursor, redis_zmscore -from tests.fixtures.redis import redis_client +from tests.fixtures.redis import redis_client # noqa: F401 class _RedisNoZmscore: @@ -67,7 +67,7 @@ async def test_redis_zmscore_pipeline_fallback_for_sync_client_without_zmscore(r await _redis_call(redis_client, "zadd", key, mapping={"a": 1.0, "b": 2.0}) wrapped = _RedisNoZmscore(redis_client) - res = await redis_zmscore(wrapped, key, ["a", "missing", "b"]) + res = await redis_zmscore(wrapped, key, ["a", "missing", "b"]) # type: ignore[arg-type] assert res == [1.0, None, 2.0] await _redis_call(redis_client, "delete", key) diff --git a/tests/test_executor_slots_plan_invariants.py b/tests/test_executor_slots_plan_invariants.py index 4b774b4..596f10d 100644 --- a/tests/test_executor_slots_plan_invariants.py +++ b/tests/test_executor_slots_plan_invariants.py @@ -10,10 +10,10 @@ class _Owner(BaseFeedConfigModel): type: str = "test_owner" + name: str = "" - def __init__(self, *, name: str, **data): - super().__init__(**data) - object.__setattr__(self, "name", name) + def __init__(self, *, name: str, **data): # type: ignore[override] + super().__init__(name=name, **data) # type: ignore[call-arg] object.__setattr__(self, "last_limit", None) object.__setattr__(self, "calls", 0) @@ -37,8 +37,8 @@ class _PagedOwner(BaseFeedConfigModel): subfeed_id: str total: int = 10 - def __init__(self, *, subfeed_id: str, total: int = 10, **data): - super().__init__(subfeed_id=subfeed_id, total=total, **data) + def __init__(self, *, subfeed_id: str, total: int = 10, **data): # type: ignore[override] + super().__init__(subfeed_id=subfeed_id, total=total, **data) # type: ignore[call-arg] object.__setattr__(self, "calls", 0) object.__setattr__(self, "limits", []) diff --git a/tests/test_merger_append.py b/tests/test_merger_append.py index 
290dd04..c6f67fb 100644 --- a/tests/test_merger_append.py +++ b/tests/test_merger_append.py @@ -48,7 +48,7 @@ async def test_merger_append_with_item_1_page_2() -> None: @pytest.mark.asyncio async def test_merger_append_when_one_leaf_is_empty() -> None: - config = copy.deepcopy(MERGER_APPEND_CONFIG) + config: dict = copy.deepcopy(MERGER_APPEND_CONFIG) # Make the second leaf return no data + has_next_page=False. config["items"][1]["method_name"] = "empty" diff --git a/tests/test_merger_percentage.py b/tests/test_merger_percentage.py index 328dc39..10da578 100644 --- a/tests/test_merger_percentage.py +++ b/tests/test_merger_percentage.py @@ -32,7 +32,7 @@ async def test_merger_percentage() -> None: @pytest.mark.asyncio async def test_merger_percentage_when_one_leaf_is_empty() -> None: - config = copy.deepcopy(MERGER_PERCENTAGE_CONFIG) + config: dict = copy.deepcopy(MERGER_PERCENTAGE_CONFIG) # Make the second leaf return no data + has_next_page=False. config["items"][1]["data"]["method_name"] = "empty" diff --git a/tests/test_merger_positional.py b/tests/test_merger_positional.py index 370f770..8653d80 100644 --- a/tests/test_merger_positional.py +++ b/tests/test_merger_positional.py @@ -58,7 +58,7 @@ async def test_merger_positional_with_empty_default() -> None: """ merger_positional = parse_model(MergerPositional, MERGER_POSITIONAL_CONFIG) - merger_positional.default.method_name = "empty" + merger_positional.default.method_name = "empty" # type: ignore[union-attr] merger_positional_res = await merger_positional.get_data( methods_dict=METHODS_DICT, limit=10, diff --git a/tests/test_merger_view_session.py b/tests/test_merger_view_session.py index 4e41536..aef0a25 100644 --- a/tests/test_merger_view_session.py +++ b/tests/test_merger_view_session.py @@ -10,7 +10,7 @@ MERGER_VIEW_SESSION_CONFIG, MERGER_VIEW_SESSION_DUPS_CONFIG, ) -from tests.fixtures.redis import redis_client +from tests.fixtures.redis import redis_client # noqa: F401 from tests.utils import parse_model diff --git a/tests/test_redis_live.py b/tests/test_redis_live.py index e1dc13d..32f59a7 100644 --- a/tests/test_redis_live.py +++ b/tests/test_redis_live.py @@ -1,10 +1,8 @@ import asyncio -import json import time import pytest import redis -from redis.asyncio import Redis as AsyncRedis from smartfeed.schemas import FeedResultNextPage, MergerViewSession from tests.fixtures.configs import METHODS_DICT diff --git a/tests/test_seen_store_unit.py b/tests/test_seen_store_unit.py index fc7f6e7..2ae77c3 100644 --- a/tests/test_seen_store_unit.py +++ b/tests/test_seen_store_unit.py @@ -1,11 +1,10 @@ -from typing import Any import pytest from smartfeed.feed_models import _redis_call from smartfeed.policies.dedup_utils import decode_seen_from_cursor from smartfeed.policies.seen_store import CursorSeenStore, RedisSeenStore -from tests.fixtures.redis import redis_client +from tests.fixtures.redis import redis_client # noqa: F401 @pytest.mark.asyncio diff --git a/tests/test_view_session_unit.py b/tests/test_view_session_unit.py index a847eea..d237185 100644 --- a/tests/test_view_session_unit.py +++ b/tests/test_view_session_unit.py @@ -1,13 +1,12 @@ from dataclasses import dataclass -from typing import Any import pytest from smartfeed.feed_models import _redis_call -from smartfeed.schemas import FeedResultNextPage, FeedResultNextPageInside, MergerViewSession +from smartfeed.schemas import FeedResultNextPage, MergerViewSession from tests.fixtures.configs import METHODS_DICT from tests.fixtures.mergers import MERGER_VIEW_SESSION_CONFIG -from 
tests.fixtures.redis import redis_client +from tests.fixtures.redis import redis_client # noqa: F401 from tests.utils import parse_model From acc66dc3485080169d864b2819087a4336c8f932 Mon Sep 17 00:00:00 2001 From: Shakirov Renat Date: Thu, 19 Mar 2026 12:46:04 +0300 Subject: [PATCH 32/33] more tests for large feeds --- smartfeed/examples/example_client.py | 22 +++ smartfeed/mergers/view_session.py | 7 +- tests/fixtures/configs.py | 1 + tests/test_merger_view_session.py | 225 +++++++++++++++++++++++++++ 4 files changed, 254 insertions(+), 1 deletion(-) diff --git a/smartfeed/examples/example_client.py b/smartfeed/examples/example_client.py index a24e130..485c4b5 100644 --- a/smartfeed/examples/example_client.py +++ b/smartfeed/examples/example_client.py @@ -53,6 +53,28 @@ async def example_method( next_page.page += 1 return FeedResultClient(data=result_data, next_page=next_page, has_next_page=True) + @staticmethod + async def large_method( + user_id: str, + limit: int, + next_page: FeedResultNextPageInside, + limit_to_return: Optional[int] = None, + ) -> FeedResultClient: + data = [f"{user_id}_{i}" for i in range(1, 5001)] + + from_index = (data.index(next_page.after) + 1) if next_page.after else 0 + to_index = from_index + limit + + result_data = data[from_index:to_index] + + if isinstance(limit_to_return, int) and limit_to_return > 0: + result_data = result_data[:limit_to_return] + + has_next = to_index < len(data) + next_page.after = result_data[-1] if result_data else None + next_page.page += 1 + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next) + @staticmethod async def empty_method( user_id: str, # pylint: disable=W0613 diff --git a/smartfeed/mergers/view_session.py b/smartfeed/mergers/view_session.py index e29ed22..d7126e2 100644 --- a/smartfeed/mergers/view_session.py +++ b/smartfeed/mergers/view_session.py @@ -152,10 +152,15 @@ async def _get_cache( ) page = 1 + has_more_in_session = len(session_data) > limit * page + can_rebuild = child_has_next and child_cursor is not None + return FeedResult( data=session_data[(page - 1) * limit :][:limit], next_page=FeedResultNextPage(data={self.merger_id: FeedResultNextPageInside(page=page + 1, after=None)}), - has_next_page=bool(len(session_data) > limit * page or child_has_next), + # True while the current session still has pages OR the child + # can provide a new session (triggers rebuild on the next call). 
+ has_next_page=bool(has_more_in_session or can_rebuild), ) def build_plan( diff --git a/tests/fixtures/configs.py b/tests/fixtures/configs.py index 6aff5cb..642d169 100644 --- a/tests/fixtures/configs.py +++ b/tests/fixtures/configs.py @@ -5,6 +5,7 @@ METHODS_DICT: Dict[str, Callable] = { "ads": ClientMixerClass().example_method, "followings": ClientMixerClass().example_method, + "large": ClientMixerClass().large_method, "empty": ClientMixerClass().empty_method, "error": ClientMixerClass().error_method, "doubles": ClientMixerClass().doubles_method, diff --git a/tests/test_merger_view_session.py b/tests/test_merger_view_session.py index aef0a25..3269e6f 100644 --- a/tests/test_merger_view_session.py +++ b/tests/test_merger_view_session.py @@ -137,3 +137,228 @@ async def test_dedup_directly_over_view_session(redis_client) -> None: # Currently returns 0 items — all rejected as "seen" during cache build assert len(result.data) == 20 assert result.data == [f"x_{i}" for i in range(1, 21)] + + +MERGER_VIEW_SESSION_SMALL_CONFIG = { + "merger_id": "small_session", + "type": "merger_view_session", + "session_size": 20, + "session_live_time": 300, + "data": { + "subfeed_id": "subfeed_small_session", + "type": "subfeed", + "method_name": "followings", + }, +} + +MERGER_DEDUP_SMALL_SESSION_CONFIG = { + "merger_id": "dedup_small", + "type": "merger_deduplication", + "dedup_key": None, + "max_refill_loops": 3, + "data": { + "merger_id": "small_session_dedup", + "type": "merger_view_session", + "session_size": 30, + "session_live_time": 300, + "data": { + "subfeed_id": "subfeed_dedup_small", + "type": "subfeed", + "method_name": "followings", + }, + }, +} + +MERGER_DEDUP_LARGE_POOL_CONFIG = { + "merger_id": "dedup_large", + "type": "merger_deduplication", + "dedup_key": None, + "max_refill_loops": 3, + "data": { + "merger_id": "large_pool_session", + "type": "merger_view_session", + "session_size": 300, + "session_live_time": 300, + "data": { + "subfeed_id": "subfeed_large_pool", + "type": "subfeed", + "method_name": "large", + }, + }, +} + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_session_rebuild_continues_pagination(redis_client) -> None: + """Pagination continues beyond session_size via session rebuild. + + session_size=20, limit=10. After 2 pages (20 items), session exhausts. + has_next_page should remain True because child has more data. + Page 3 triggers rebuild and returns fresh items. 
+ """ + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_SMALL_CONFIG) + user_id = "rebuild_test" + + all_items = [] + next_page = FeedResultNextPage(data={}) + num_pages = 5 # 50 items, crosses 2+ sessions of 20 + + for page_num in range(1, num_pages + 1): + result = await merger_vs.get_data( + methods_dict=METHODS_DICT, + limit=10, + next_page=next_page, + user_id=user_id, + redis_client=redis_client, + ) + assert len(result.data) == 10, f"Page {page_num}: expected 10 items, got {len(result.data)}" + all_items.extend(result.data) + next_page = result.next_page + + if page_num < num_pages: + assert result.has_next_page, ( + f"Page {page_num}: has_next_page=False but expected True " + f"(session_size=20, should rebuild)" + ) + + # 50 unique items across 2+ sessions + assert len(all_items) == 50 + assert all_items == [f"{user_id}_{i}" for i in range(1, 51)] + # No duplicates + assert len(set(all_items)) == 50 + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_session_has_next_page_true_at_boundary(redis_client) -> None: + """has_next_page=True when session exactly exhausted but child has more. + + session_size=20, limit=10. Page 2 returns items 11-20 (session exhausted). + has_next_page must be True because child can rebuild. + """ + merger_vs = parse_model(MergerViewSession, MERGER_VIEW_SESSION_SMALL_CONFIG) + user_id = "boundary_test" + + # Page 1 + result1 = await merger_vs.get_data( + methods_dict=METHODS_DICT, + limit=10, + next_page=FeedResultNextPage(data={}), + user_id=user_id, + redis_client=redis_client, + ) + assert result1.has_next_page is True + assert result1.data == [f"{user_id}_{i}" for i in range(1, 11)] + + # Page 2 -- session exactly exhausted (items 11-20, session_size=20) + result2 = await merger_vs.get_data( + methods_dict=METHODS_DICT, + limit=10, + next_page=result1.next_page, + user_id=user_id, + redis_client=redis_client, + ) + assert result2.data == [f"{user_id}_{i}" for i in range(11, 21)] + # KEY ASSERTION: has_next_page=True despite session exhausted + assert result2.has_next_page is True, ( + "has_next_page should be True when session exhausted but child has more data" + ) + + # Page 3 -- rebuild triggers, fresh items 21-30 + result3 = await merger_vs.get_data( + methods_dict=METHODS_DICT, + limit=10, + next_page=result2.next_page, + user_id=user_id, + redis_client=redis_client, + ) + assert len(result3.data) == 10 + assert result3.data == [f"{user_id}_{i}" for i in range(21, 31)] + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_dedup_over_session_no_duplicates_across_rebuilds(redis_client) -> None: + """dedup + session_size=30: пагинация до 900 туров, 0 дупликатов. + + session_size=30 при limit=10 дает 3 страницы на сессию. + 90 страниц = 30 пересборок сессии. + Все 900 туров должны быть уникальными (dedup трекает виденные). 
+ (example_method генерирует 999 элементов, берем 900 чтобы не упереться в лимит) + """ + merger = parse_model(MergerDeduplication, MERGER_DEDUP_SMALL_SESSION_CONFIG) + user_id = "dedup_900" + limit = 10 + num_pages = 90 + + all_items = [] + next_page = FeedResultNextPage(data={}) + + for page_num in range(1, num_pages + 1): + result = await merger.get_data( + methods_dict=METHODS_DICT, + limit=limit, + next_page=next_page, + user_id=user_id, + redis_client=redis_client, + ) + assert len(result.data) == limit, ( + f"Page {page_num}: expected {limit} items, got {len(result.data)}" + ) + all_items.extend(result.data) + next_page = result.next_page + + if page_num < num_pages: + assert result.has_next_page, ( + f"Page {page_num}: has_next_page=False, got only {len(all_items)} items" + ) + + assert len(all_items) == 900 + unique = set(all_items) + assert len(unique) == 900, ( + f"Found {900 - len(unique)} duplicates among 900 items" + ) + + +@pytest.mark.parametrize("redis_client", ["sync", "async"], indirect=True) +@pytest.mark.asyncio +async def test_dedup_over_session_1500_unique_tours(redis_client) -> None: + """dedup + session_size=300, pool=5000: 1500 unique tours, 0 duplicates. + + session_size=300 with limit=150 gives 2 pages per session. + 10 pages = 5 session rebuilds = 1500 tours. + All must be unique (dedup tracks seen items across sessions). + """ + merger = parse_model(MergerDeduplication, MERGER_DEDUP_LARGE_POOL_CONFIG) + user_id = "dedup_1500" + limit = 150 + num_pages = 10 + + all_items = [] + next_page = FeedResultNextPage(data={}) + + for page_num in range(1, num_pages + 1): + result = await merger.get_data( + methods_dict=METHODS_DICT, + limit=limit, + next_page=next_page, + user_id=user_id, + redis_client=redis_client, + ) + assert len(result.data) == limit, ( + f"Page {page_num}: expected {limit} items, got {len(result.data)}" + ) + all_items.extend(result.data) + next_page = result.next_page + + if page_num < num_pages: + assert result.has_next_page, ( + f"Page {page_num}: has_next_page=False after only {len(all_items)} items" + ) + + assert len(all_items) == 1500 + unique = set(all_items) + assert len(unique) == 1500, ( + f"Found {1500 - len(unique)} duplicates among 1500 items" + ) From ec499b01e2e9c749c958fa3ced226b9703190d20 Mon Sep 17 00:00:00 2001 From: Shakirov Renat Date: Wed, 25 Mar 2026 18:01:12 +0300 Subject: [PATCH 33/33] dedup fix --- smartfeed/execution/dedup_runtime.py | 12 +- ...test_positional_no_refill_high_priority.py | 128 ++++++++++++++++++ 2 files changed, 136 insertions(+), 4 deletions(-) create mode 100644 tests/test_positional_no_refill_high_priority.py diff --git a/smartfeed/execution/dedup_runtime.py b/smartfeed/execution/dedup_runtime.py index 1b9b26c..c706f9d 100644 --- a/smartfeed/execution/dedup_runtime.py +++ b/smartfeed/execution/dedup_runtime.py @@ -238,14 +238,16 @@ async def _refill_deficits( if missing_total <= 0: continue - base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page + owner_res = owner_results.get(refill_owner_id) + base_np = owner_res.next_page if owner_res is not None else plan.next_page + initial_has_next = bool(owner_res.has_next_page) if owner_res is not None else True state[refill_owner_id] = { "missing_total": missing_total, "remaining": missing_total, "accepted": [], "loops": 0, "current_next_page": base_np, - "has_next_page": True, + "has_next_page": initial_has_next, } if not state: @@ -368,14 +370,16 @@ async def _refill_deficits_without_dedup( if 
missing_total <= 0: continue - base_np = owner_results[refill_owner_id].next_page if refill_owner_id in owner_results else plan.next_page + owner_res = owner_results.get(refill_owner_id) + base_np = owner_res.next_page if owner_res is not None else plan.next_page + initial_has_next = bool(owner_res.has_next_page) if owner_res is not None else True state[refill_owner_id] = { "missing_total": missing_total, "remaining": missing_total, "accepted": [], "loops": 0, "current_next_page": base_np, - "has_next_page": True, + "has_next_page": initial_has_next, } if not state: diff --git a/tests/test_positional_no_refill_high_priority.py b/tests/test_positional_no_refill_high_priority.py new file mode 100644 index 0000000..8b8fd81 --- /dev/null +++ b/tests/test_positional_no_refill_high_priority.py @@ -0,0 +1,128 @@ +"""Positional subfeed with highest dedup_priority must NOT be refilled +when it returns fewer items than requested slots. + +Reproduces production bug: TopSort (promo) returns fewer ads than +positional slots → _refill_deficits hardcodes has_next_page=True +in initial state → pointless refill calls even though the subfeed +already signalled has_next_page=False. +""" + +import pytest + +from smartfeed.schemas import FeedResultNextPage, MergerDeduplication +from tests.fixtures import dedup_helpers as dh +from tests.utils import parse_model + + +def _make_counting_method(items): + """Offset-paged method that also counts how many times it was called.""" + call_count = {"value": 0} + + async def _method(user_id, limit, next_page, **kwargs): + call_count["value"] += 1 + from smartfeed.schemas import FeedResultClient + + offset = int(next_page.after or 0) + result_data = items[offset : offset + limit] + next_page.after = offset + len(result_data) + next_page.page += 1 + has_next_page = (offset + len(result_data)) < len(items) + return FeedResultClient(data=result_data, next_page=next_page, has_next_page=has_next_page) + + return _method, call_count + + +@pytest.mark.asyncio +async def test_positional_high_priority_no_refill_when_exhausted(): + """When the positional subfeed (dedup_priority=1, highest) returns fewer + items than requested AND has_next_page=False, it must NOT be refilled. + + Setup: + - positional (promo): only 2 items, dedup_priority=1 + - default: 100 items, dedup_priority=0 + - positions=[1,3,5,7] → needs 4 promo items + - promo returns 2 out of 4 → has_next_page=False + - Expected: promo called exactly ONCE (no refill) + """ + promo_items = dh.make_items("promo", 1001, 1003) # only 2 items + default_items = dh.make_items("default", 1, 101) # 100 items, no overlap + + promo_method, promo_calls = _make_counting_method(promo_items) + default_method, default_calls = _make_counting_method(default_items) + + methods_dict = { + "promo": promo_method, + "default": default_method, + } + + config = dh._dedup_config( + "dedup_wrapper", + dh._positional_config( + "pos_mix", + positions=[1, 3, 5, 7], + positional=dh._subfeed("sf_promo", "promo", dedup_priority=1), + default=dh._subfeed("sf_default", "default", dedup_priority=0), + ), + max_refill_loops=5, + overfetch_factor=1, + ) + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=20, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res.data) > 0 + + # The critical assertion: promo must be called only once. + # Before the fix, it was called 1 + max_refill_loops times + # because _refill_deficits hardcoded has_next_page=True. 
+ assert promo_calls["value"] == 1, ( + f"Promo was called {promo_calls['value']} times, expected 1. " + f"Refill should not retry a subfeed that returned has_next_page=False." + ) + + +@pytest.mark.asyncio +async def test_positional_high_priority_no_refill_even_with_overfetch(): + """Same as above but with overfetch_factor > 1 to ensure the fix + works regardless of overfetch settings.""" + promo_items = dh.make_items("promo", 1001, 1003) # only 2 items + default_items = dh.make_items("default", 1, 101) + + promo_method, promo_calls = _make_counting_method(promo_items) + default_method, _ = _make_counting_method(default_items) + + methods_dict = { + "promo": promo_method, + "default": default_method, + } + + config = dh._dedup_config( + "dedup_wrapper", + dh._positional_config( + "pos_mix", + positions=[1, 3, 5, 7], + positional=dh._subfeed("sf_promo", "promo", dedup_priority=1), + default=dh._subfeed("sf_default", "default", dedup_priority=0), + ), + max_refill_loops=5, + overfetch_factor=5, + ) + + merger = parse_model(MergerDeduplication, config) + res = await merger.get_data( + methods_dict=methods_dict, + user_id="u", + limit=20, + next_page=FeedResultNextPage(data={}), + ) + + assert len(res.data) > 0 + assert promo_calls["value"] == 1, ( + f"Promo was called {promo_calls['value']} times with overfetch_factor=5. " + f"Expected 1 — exhausted subfeed must not be refilled." + )