From b234200b6667803ab1a7581e10845b739069264a Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 01:20:28 +0100 Subject: [PATCH 01/10] feat(extraction): add atomization for self-contained knowledge Add prompt-level and code-level enforcement to ensure extracted knowledge statements are independently interpretable without conversation context. Prompt changes: - ATOMIZATION_RULES constant prohibiting pronouns, relative time, ambiguous references, and first-person in extracted statements - Stronger reference_timestamp wording (REJECTED vs resolve) - Good/bad atomization examples in both extraction prompts Code-level validation (_pipeline.py): - Third-person check (statement must start with "User") - First-person pronoun rejection (I, my, me, we, our) - Unresolved relative time rejection (yesterday, last week, etc.) - Assistant-sourced pattern rejection (was advised/suggested) - Temporal backfill step between extract and validate New temporal module (src/memv/processing/temporal.py): - contains_relative_time: regex detection of unresolved patterns - parse_temporal_expression: relative-to-absolute via dateutil - backfill_temporal_fields: populate valid_at/invalid_at from temporal_info strings when LLM leaves them None Dependency: python-dateutil>=2.9.0 Tests: 179 total (48 new), all passing --- pyproject.toml | 1 + src/memv/memory/_pipeline.py | 45 ++++++- src/memv/processing/prompts.py | 49 ++++++- src/memv/processing/temporal.py | 146 +++++++++++++++++++++ tests/test_atomization.py | 221 ++++++++++++++++++++++++++++++++ tests/test_extractor.py | 37 ++++++ tests/test_memory_e2e.py | 42 +++++- tests/test_temporal.py | 205 +++++++++++++++++++++++++++++ uv.lock | 2 + 9 files changed, 737 insertions(+), 11 deletions(-) create mode 100644 src/memv/processing/temporal.py create mode 100644 tests/test_atomization.py create mode 100644 tests/test_temporal.py diff --git a/pyproject.toml b/pyproject.toml index 1b05fa2..a6eca69 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "openai>=2.15.0", "pydantic>=2.12.5", "pydantic-ai>=1.43.0", + "python-dateutil>=2.9.0", "sqlite-vec>=0.1.6", "textual>=3.5.0", ] diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index 8bfdaab..8b3482a 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import re from typing import TYPE_CHECKING from memv.models import ( @@ -11,12 +12,17 @@ Message, SemanticKnowledge, ) +from memv.processing.temporal import backfill_temporal_fields, contains_relative_time if TYPE_CHECKING: from memv.memory._lifecycle import LifecycleManager logger = logging.getLogger(__name__) +# Compiled regexes for validation checks +_FIRST_PERSON_RE = re.compile(r"\b(I|my|me|we|our)\b") +_ASSISTANT_SOURCE_RE = re.compile(r"\b(?:was|were)\s+(?:advised|suggested|recommended)\b", re.IGNORECASE) + class Pipeline: """Handles message processing, episode generation, and knowledge extraction.""" @@ -155,10 +161,20 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int: existing_knowledge=existing.retrieved_knowledge, ) - # 6. Filter out low-quality extractions + # 6. Backfill temporal fields from temporal_info + for item in extracted: + if item.temporal_info and (item.valid_at is None or item.invalid_at is None): + item.valid_at, item.invalid_at = backfill_temporal_fields( + item.temporal_info, + item.valid_at, + item.invalid_at, + episode.end_time, + ) + + # 7. Filter out low-quality extractions extracted = [item for item in extracted if self._validate_extraction(item)] - # 7. Convert to SemanticKnowledge and store with embeddings + # 8. 
Convert to SemanticKnowledge and store with embeddings if not extracted: return 0 @@ -166,7 +182,7 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int: statements = [item.statement for item in extracted] embeddings = await self._lc.embedder.embed_batch(statements) - # 8. Process each extracted item + # 9. Process each extracted item stored_count = 0 for item, embedding in zip(extracted, embeddings, strict=True): # Handle contradictions @@ -197,9 +213,30 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int: return stored_count def _validate_extraction(self, item: ExtractedKnowledge) -> bool: - """Filter low-confidence extractions.""" + """Filter extractions that are low-confidence or not self-contained.""" + stmt = item.statement + preview = stmt[:60] + if item.confidence < 0.7: + logger.debug("Rejected (low confidence %.2f): %s", item.confidence, preview) + return False + + if not stmt.startswith("User"): + logger.debug("Rejected (not third-person): %s", preview) + return False + + if _FIRST_PERSON_RE.search(stmt): + logger.debug("Rejected (first-person pronoun): %s", preview) return False + + if contains_relative_time(stmt): + logger.debug("Rejected (unresolved relative time): %s", preview) + return False + + if _ASSISTANT_SOURCE_RE.search(stmt): + logger.debug("Rejected (assistant-sourced): %s", preview) + return False + return True async def _is_duplicate_knowledge(self, embedding: list[float], user_id: str) -> tuple[bool, float]: diff --git a/src/memv/processing/prompts.py b/src/memv/processing/prompts.py index bb72fed..7351315 100644 --- a/src/memv/processing/prompts.py +++ b/src/memv/processing/prompts.py @@ -66,6 +66,7 @@ - Obvious or common knowledge - Speculative or uncertain claims - Conversation events ("User asked about X", "User requested Y") - extract the FACT, not the action +- Assistant-sourced knowledge ("was advised to", "was suggested to", "was recommended to") **CRITICAL SOURCE RULE - 
READ CAREFULLY:** - ONLY extract facts the USER explicitly stated in their own messages @@ -77,6 +78,34 @@ - Extract opinions WITH reasons when stated: "User finds X too basic" or "User likes Y because it's intuitive" """ +# ============================================================================= +# ATOMIZATION RULES +# Self-contained statement constraints for extraction quality. +# ============================================================================= + +ATOMIZATION_RULES = """ +## Self-Contained Statement Rules (MANDATORY) + +Every extracted statement MUST be independently interpretable without conversation context. + +**PROHIBIT in output statements:** +- Pronouns as subjects: "he", "she", "they", "it" — use the actual name or "User" +- Relative time: "yesterday", "today", "tomorrow", "last week", "next month", "recently", "soon" +- Ambiguous references: "the project", "the tool", "that place", "the same thing" +- First person: "I", "my", "me", "we", "our" + +**REQUIRE in output statements:** +- Absolute dates when temporal info exists: "on 2024-06-15", not "yesterday" +- Specific names: "User's React project at Vstorm", not "the project" +- Third person with "User" as subject: "User prefers Python", not "I prefer Python" + +**Coreference resolution — resolve BEFORE writing the statement:** +- "my kids" → "User's children" +- "he said" → "[Name from conversation] said" +- "this place" → "[specific location mentioned]" +- "the same library" → "[library name]" +""" + # ============================================================================= # BOUNDARY DETECTION SIGNALS # Patterns that indicate episode boundaries. @@ -201,7 +230,8 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic {reference_timestamp} -Use this to resolve relative dates ("yesterday", "next week", "last month") into absolute ISO 8601 dates. +You MUST resolve ALL relative dates using this timestamp. 
+Statements with unresolved relative time ("yesterday", "last week") will be REJECTED. """ return f"""Extract HIGH-VALUE, PERSISTENT knowledge from this conversation. @@ -227,6 +257,8 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic {EXCLUSIONS} +{ATOMIZATION_RULES} + ## Examples ### GOOD Extractions: @@ -240,9 +272,13 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic - "User finds Faiss too basic for their needs" - "User had problems with Milvus due to hosting overhead" - "User uses JavaScript" (correct third-person form) +- "User started using FastAPI on 2024-06-14" (absolute date, not "yesterday") +- "User moved to Berlin in 2023" (resolved, not "last year") ### BAD Extractions: - "I use JavaScript" (raw copy - should be "User uses JavaScript") +- "He started using it yesterday" (unresolved pronoun + relative time → "User started using FastAPI on 2024-06-14") +- "They moved there last year" (unresolved pronoun + relative time → "User moved to Berlin in 2023") - "User requested code for X" (conversation event, not fact about user) - "User discussed X" (conversation event) - "User asked about X" (conversation event - extract the preference instead) @@ -250,6 +286,7 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic - Expanding ANY acronym the user used (RAG, API, LLM, PDF, etc.) 
- "User is interested in machine learning" (too vague) - "User received advice about X" (conversation event) +- "User was advised to use X" (assistant-sourced, not user fact) - "User prefers Python" when assistant suggested Python but user didn't confirm (VIOLATES SOURCE RULE) - "User uses library X" when assistant recommended it but user didn't adopt it (VIOLATES SOURCE RULE) - "User is using Python" when assistant provided Python code but user said "I use JavaScript" (WRONG - user stated JavaScript) @@ -275,7 +312,8 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere {reference_timestamp} -Use this to resolve relative dates ("yesterday", "next week", "last month") into absolute ISO 8601 dates. +You MUST resolve ALL relative dates using this timestamp. +Statements with unresolved relative time ("yesterday", "last week") will be REJECTED. """ return f"""Extract valuable knowledge by comparing actual conversation with predicted content. @@ -304,6 +342,8 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere {EXCLUSIONS} +{ATOMIZATION_RULES} + ## Examples ### GOOD Extractions (concrete facts): @@ -314,12 +354,17 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere - "User likes Chroma because it's intuitive" - "User finds Faiss too basic" - "User had issues with Milvus due to hosting overhead" +- "User started using FastAPI on 2024-06-14" (absolute date, not "yesterday") +- "User moved to Berlin in 2023" (resolved, not "last year") ### BAD Extractions: +- "He started using it yesterday" (unresolved pronoun + relative time) +- "They moved there last year" (unresolved pronoun + relative time) - "User is interested in X" (too vague) - "User asked about X" (conversation event, not fact) - "User asked for suggestions on X" (conversation event - extract the preference instead) - "User discussed X" (conversation event) +- "User was advised to use X" (assistant-sourced, not user fact) 
- "User prefers X" when assistant suggested X but user didn't confirm (VIOLATES SOURCE RULE) - "User uses Python" when assistant provided Python code but user said they use JavaScript (WRONG) diff --git a/src/memv/processing/temporal.py b/src/memv/processing/temporal.py new file mode 100644 index 0000000..d2099bb --- /dev/null +++ b/src/memv/processing/temporal.py @@ -0,0 +1,146 @@ +"""Temporal parsing utilities for knowledge atomization. + +Safety net for when LLMs ignore reference_timestamp instructions. +Validates and backfills temporal fields on extracted knowledge. +""" + +from __future__ import annotations + +import re +from datetime import datetime, timezone + +from dateutil.parser import parse as dateutil_parse +from dateutil.relativedelta import relativedelta + +# Relative time patterns that indicate unresolved temporal references +_RELATIVE_PATTERNS = re.compile( + r"\b(" + r"yesterday|today|tomorrow" + r"|(?:last|next|this)\s+(?:week|month|year|monday|tuesday|wednesday|thursday|friday|saturday|sunday)" + r"|recently|soon|later|earlier" + r"|\d+\s+(?:days?|weeks?|months?|years?)\s+(?:ago|from\s+now)" + r")\b", + re.IGNORECASE, +) + +# Named relative expressions → relativedelta offsets +_RELATIVE_OFFSETS: dict[str, relativedelta] = { + "yesterday": relativedelta(days=-1), + "today": relativedelta(), + "tomorrow": relativedelta(days=1), + "last week": relativedelta(weeks=-1), + "next week": relativedelta(weeks=1), + "this week": relativedelta(), + "last month": relativedelta(months=-1), + "next month": relativedelta(months=1), + "this month": relativedelta(), + "last year": relativedelta(years=-1), + "next year": relativedelta(years=1), + "this year": relativedelta(), +} + +# N units ago / from now +_N_UNITS_PATTERN = re.compile( + r"(\d+)\s+(days?|weeks?|months?|years?)\s+(ago|from\s+now)", + re.IGNORECASE, +) + +# Backfill patterns for temporal_info strings +_SINCE_PATTERN = 
re.compile(r"(?:since|from|starting|began?)\s+(.+?)(?:\s*$|\s*(?:to|until|through)\s+)", re.IGNORECASE) +_UNTIL_PATTERN = re.compile(r"(?:until|to|through|ending|ended?)\s+(.+?)$", re.IGNORECASE) +_FROM_TO_PATTERN = re.compile(r"(?:from|since)\s+(.+?)\s+(?:to|until|through)\s+(.+?)$", re.IGNORECASE) + + +def contains_relative_time(text: str) -> bool: + """Check if text contains unresolved relative time expressions.""" + return bool(_RELATIVE_PATTERNS.search(text)) + + +def parse_temporal_expression(text: str, reference: datetime) -> datetime | None: + """Parse a temporal expression relative to a reference time. + + Handles common relative patterns via relativedelta, falls through + to dateutil.parser.parse for natural language / ISO dates. + Returns None on failure. + """ + normalized = text.strip().lower() + + # Named relative offsets + if normalized in _RELATIVE_OFFSETS: + return reference + _RELATIVE_OFFSETS[normalized] + + # "last/next " patterns + for prefix, direction in [("last", -1), ("next", 1)]: + for day_name in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]: + if normalized == f"{prefix} {day_name}": + target_weekday = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"].index(day_name) + days_diff = (reference.weekday() - target_weekday) % 7 + if direction == -1: + days_diff = days_diff or 7 + return reference - relativedelta(days=days_diff) + else: + days_diff = (target_weekday - reference.weekday()) % 7 + days_diff = days_diff or 7 + return reference + relativedelta(days=days_diff) + + # "N days/weeks/months/years ago/from now" + match = _N_UNITS_PATTERN.match(normalized) + if match: + n = int(match.group(1)) + unit = match.group(2).rstrip("s") # normalize "days" -> "day" + direction = -1 if match.group(3).lower() == "ago" else 1 + unit_map = {"day": "days", "week": "weeks", "month": "months", "year": "years"} + delta = relativedelta(**{unit_map[unit]: n * direction}) + return reference + 
delta + + # Fall through to dateutil for ISO 8601 and natural dates + try: + return dateutil_parse(text, default=reference.replace(hour=0, minute=0, second=0, microsecond=0)) + except (ValueError, OverflowError): + return None + + +def backfill_temporal_fields( + temporal_info: str | None, + valid_at: datetime | None, + invalid_at: datetime | None, + reference: datetime, +) -> tuple[datetime | None, datetime | None]: + """Attempt to populate valid_at/invalid_at from temporal_info string. + + Only fills None fields — preserves existing values. + Returns (valid_at, invalid_at). + """ + if not temporal_info: + return valid_at, invalid_at + + # "from X to Y" — try to fill both + from_to = _FROM_TO_PATTERN.search(temporal_info) + if from_to: + if valid_at is None: + parsed = parse_temporal_expression(from_to.group(1), reference) + if parsed: + valid_at = parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed + if invalid_at is None: + parsed = parse_temporal_expression(from_to.group(2), reference) + if parsed: + invalid_at = parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed + return valid_at, invalid_at + + # "since X" → valid_at + if valid_at is None: + since = _SINCE_PATTERN.search(temporal_info) + if since: + parsed = parse_temporal_expression(since.group(1), reference) + if parsed: + valid_at = parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed + + # "until X" → invalid_at + if invalid_at is None: + until = _UNTIL_PATTERN.search(temporal_info) + if until: + parsed = parse_temporal_expression(until.group(1), reference) + if parsed: + invalid_at = parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed + + return valid_at, invalid_at diff --git a/tests/test_atomization.py b/tests/test_atomization.py new file mode 100644 index 0000000..77d3164 --- /dev/null +++ b/tests/test_atomization.py @@ -0,0 +1,221 @@ +"""Tests for atomization: validation filter + prompt integration.""" + +import json 
+from datetime import datetime, timedelta, timezone + +from memv.memory.memory import Memory +from memv.models import ExtractedKnowledge +from memv.processing.extraction import ExtractionResponse +from memv.processing.prompts import ( + ATOMIZATION_RULES, + cold_start_extraction_prompt, + extraction_prompt_with_prediction, +) + +from .conftest import MockEmbedder, MockLLM + + +def _ts(minutes: int = 0) -> datetime: + return datetime(2024, 6, 15, 12, 0, 0, tzinfo=timezone.utc) + timedelta(minutes=minutes) + + +def _episode_json(title="Test Episode", content="A test narrative."): + return json.dumps({"title": title, "content": content}) + + +def _extraction(items: list[ExtractedKnowledge]) -> ExtractionResponse: + return ExtractionResponse(extracted=items) + + +def _make_memory(tmp_path, llm, embedder): + db_path = str(tmp_path / "atom.db") + return Memory( + db_path=db_path, + embedding_client=embedder, + llm_client=llm, + embedding_dimensions=1536, + enable_episode_merging=False, + enable_embedding_cache=False, + ) + + +# --------------------------------------------------------------------------- +# Validation filter: unit-level via e2e pipeline +# --------------------------------------------------------------------------- + + +async def test_rejects_first_person(tmp_path): + """Statement with 'I' / 'my' gets filtered out.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="I like Python", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + +async def test_rejects_non_third_person(tmp_path): + """Statement not starting with 'User' gets filtered.""" + llm = MockLLM() + embedder = MockEmbedder() + + 
llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="He likes Python", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + +async def test_rejects_relative_time(tmp_path): + """Statement with unresolved 'yesterday' gets filtered.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="User started using FastAPI yesterday", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I started using FastAPI yesterday", "Nice!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + +async def test_rejects_assistant_sourced(tmp_path): + """Statement with 'was advised/suggested/recommended' gets filtered.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="User was advised to use Docker", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "hi", "Use Docker!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + +async def test_accepts_clean_statement(tmp_path): + """Properly atomized statement passes all checks.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [ + _extraction( + [ExtractedKnowledge(statement="User prefers Python for backend development", 
knowledge_type="new", confidence=0.9)] + ) + ], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I prefer Python for backend", "Good choice!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 1 + + +async def test_confidence_filter_unchanged(tmp_path): + """Low-confidence items still rejected (regression check).""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + +async def test_temporal_backfill_in_pipeline(tmp_path): + """temporal_info 'since January 2024' backfills valid_at when LLM leaves it None.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [ + _extraction( + [ + ExtractedKnowledge( + statement="User works at Vstorm", + knowledge_type="new", + confidence=0.9, + temporal_info="since January 2024", + valid_at=None, + invalid_at=None, + ) + ] + ) + ], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I work at Vstorm since Jan 2024", "Nice!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 1 + + result = await memory.retrieve("Vstorm", user_id="user1") + knowledge = result.retrieved_knowledge[0] + assert knowledge.valid_at is not None + assert knowledge.valid_at.year == 2024 + assert knowledge.valid_at.month == 1 + + +# --------------------------------------------------------------------------- +# Prompt content assertions +# 
--------------------------------------------------------------------------- + + +class TestPromptContent: + def test_cold_start_contains_atomization_rules(self): + prompt = cold_start_extraction_prompt("Test", [{"role": "user", "content": "hi"}], "2024-06-15T12:00:00Z") + assert "Self-Contained Statement Rules" in prompt + assert "PROHIBIT" in prompt + + def test_warm_extraction_contains_atomization_rules(self): + prompt = extraction_prompt_with_prediction("prediction", ">>> USER: hi", "2024-06-15T12:00:00Z") + assert "Self-Contained Statement Rules" in prompt + assert "PROHIBIT" in prompt + + def test_cold_start_contains_strong_timestamp_wording(self): + prompt = cold_start_extraction_prompt("Test", [{"role": "user", "content": "hi"}], "2024-06-15T12:00:00Z") + assert "will be REJECTED" in prompt + + def test_warm_extraction_contains_strong_timestamp_wording(self): + prompt = extraction_prompt_with_prediction("prediction", ">>> USER: hi", "2024-06-15T12:00:00Z") + assert "will be REJECTED" in prompt + + def test_atomization_rules_constant_exists(self): + assert "Self-Contained Statement Rules" in ATOMIZATION_RULES diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 2b92ea9..b9ceb8c 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -155,3 +155,40 @@ async def test_temporal_fields_preserved(): assert result[0].valid_at == valid assert result[0].invalid_at == invalid assert result[0].temporal_info == "from Jan 2024 to Jan 2025" + + +async def test_cold_start_prompt_contains_atomization_rules(): + """Cold start extraction prompt includes ATOMIZATION_RULES.""" + llm = MockLLM() + llm.set_responses("generate_structured", [ExtractionResponse(extracted=[])]) + extractor = PredictCalibrateExtractor(llm) + + await extractor.extract(_episode(), existing_knowledge=[]) + + prompt = llm.calls["generate_structured"][0][0] + assert "Self-Contained Statement Rules" in prompt + + +async def test_warm_prompt_contains_atomization_rules(): + 
"""Warm extraction prompt includes ATOMIZATION_RULES.""" + llm = MockLLM() + llm.set_responses("generate", ["prediction"]) + llm.set_responses("generate_structured", [ExtractionResponse(extracted=[])]) + extractor = PredictCalibrateExtractor(llm) + + await extractor.extract(_episode(), existing_knowledge=[_knowledge()]) + + prompt = llm.calls["generate_structured"][0][0] + assert "Self-Contained Statement Rules" in prompt + + +async def test_prompt_contains_reference_timestamp(): + """Both extraction paths include reference_timestamp in the prompt.""" + llm = MockLLM() + llm.set_responses("generate_structured", [ExtractionResponse(extracted=[])]) + extractor = PredictCalibrateExtractor(llm) + + await extractor.extract(_episode(), existing_knowledge=[]) + + prompt = llm.calls["generate_structured"][0][0] + assert "reference_timestamp" in prompt diff --git a/tests/test_memory_e2e.py b/tests/test_memory_e2e.py index d12ecc8..3d0d9d4 100644 --- a/tests/test_memory_e2e.py +++ b/tests/test_memory_e2e.py @@ -63,7 +63,7 @@ async def test_process_returns_count(tmp_path): embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) - llm.set_responses("generate_structured", [_extraction(["Fact A", "Fact B"])]) + llm.set_responses("generate_structured", [_extraction(["User knows Fact A", "User knows Fact B"])]) memory = _make_memory(tmp_path, llm, embedder, enable_knowledge_dedup=False) async with memory: @@ -151,7 +151,7 @@ async def test_clear_user(tmp_path): embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) - llm.set_responses("generate_structured", [_extraction(["Fact to delete"])]) + llm.set_responses("generate_structured", [_extraction(["User has a fact to delete"])]) memory = _make_memory(tmp_path, llm, embedder) async with memory: @@ -162,7 +162,7 @@ async def test_clear_user(tmp_path): assert counts["messages"] >= 2 assert counts["episodes"] >= 1 - result = await memory.retrieve("Fact to delete", user_id="user1") + result = 
await memory.retrieve("User has a fact to delete", user_id="user1") assert len(result.retrieved_knowledge) == 0 @@ -197,7 +197,7 @@ async def test_auto_process_at_threshold(tmp_path): _episode_json("Auto", "Auto processed"), # episode gen ], ) - llm.set_responses("generate_structured", [_extraction(["Auto fact"])]) + llm.set_responses("generate_structured", [_extraction(["User has auto fact"])]) memory = _make_memory(tmp_path, llm, embedder, auto_process=True, batch_threshold=4) async with memory: @@ -219,7 +219,7 @@ async def test_flush_forces_processing(tmp_path): embedder = MockEmbedder() llm.set_responses("generate", [_episode_json("Flushed", "Flushed episode")]) - llm.set_responses("generate_structured", [_extraction(["Flushed fact"])]) + llm.set_responses("generate_structured", [_extraction(["User has flushed fact"])]) memory = _make_memory(tmp_path, llm, embedder, auto_process=True, batch_threshold=100) async with memory: @@ -229,6 +229,38 @@ async def test_flush_forces_processing(tmp_path): assert count == 1 +async def test_atomization_filters_bad_statements(tmp_path): + """LLM returns mix of good and bad statements — only atomized ones survive.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json("Mixed", "Mixed quality")]) + llm.set_responses( + "generate_structured", + [ + ExtractionResponse( + extracted=[ + ExtractedKnowledge(statement="User prefers Vim", knowledge_type="new", confidence=0.9), + ExtractedKnowledge(statement="I like Vim", knowledge_type="new", confidence=0.9), + ExtractedKnowledge(statement="User started yesterday", knowledge_type="new", confidence=0.9), + ExtractedKnowledge(statement="He uses Neovim", knowledge_type="new", confidence=0.9), + ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9), + ] + ) + ], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I prefer Vim", "Nice!", 
timestamp=_ts()) + count = await memory.process("user1") + # Only "User prefers Vim" should survive all filters + assert count == 1 + + result = await memory.retrieve("Vim", user_id="user1") + assert result.retrieved_knowledge[0].statement == "User prefers Vim" + + async def test_dedup_skips_duplicate(tmp_path): """enable_knowledge_dedup=True → identical statement not stored twice.""" llm = MockLLM() diff --git a/tests/test_temporal.py b/tests/test_temporal.py new file mode 100644 index 0000000..5783aaa --- /dev/null +++ b/tests/test_temporal.py @@ -0,0 +1,205 @@ +"""Tests for temporal parsing utilities.""" + +from datetime import datetime, timezone + +from memv.processing.temporal import ( + backfill_temporal_fields, + contains_relative_time, + parse_temporal_expression, +) + + +def _ref(year=2024, month=6, day=15, hour=12): + return datetime(year, month, day, hour, 0, 0, tzinfo=timezone.utc) + + +# --------------------------------------------------------------------------- +# contains_relative_time +# --------------------------------------------------------------------------- + + +class TestContainsRelativeTime: + def test_yesterday(self): + assert contains_relative_time("User started yesterday") + + def test_today(self): + assert contains_relative_time("User is working today") + + def test_tomorrow(self): + assert contains_relative_time("User will finish tomorrow") + + def test_last_week(self): + assert contains_relative_time("User mentioned last week") + + def test_next_month(self): + assert contains_relative_time("User plans to move next month") + + def test_n_days_ago(self): + assert contains_relative_time("User joined 3 days ago") + + def test_n_years_from_now(self): + assert contains_relative_time("User plans to retire 5 years from now") + + def test_recently(self): + assert contains_relative_time("User recently switched to Rust") + + def test_soon(self): + assert contains_relative_time("User will deploy soon") + + def test_absolute_date_not_flagged(self): 
+ assert not contains_relative_time("User started on 2024-06-14") + + def test_plain_text_not_flagged(self): + assert not contains_relative_time("User prefers Python over JavaScript") + + def test_iso_date_not_flagged(self): + assert not contains_relative_time("User moved to Berlin in 2023") + + def test_case_insensitive(self): + assert contains_relative_time("User started YESTERDAY") + + +# --------------------------------------------------------------------------- +# parse_temporal_expression +# --------------------------------------------------------------------------- + + +class TestParseTemporalExpression: + def test_yesterday(self): + result = parse_temporal_expression("yesterday", _ref()) + assert result == datetime(2024, 6, 14, 12, 0, 0, tzinfo=timezone.utc) + + def test_tomorrow(self): + result = parse_temporal_expression("tomorrow", _ref()) + assert result == datetime(2024, 6, 16, 12, 0, 0, tzinfo=timezone.utc) + + def test_today(self): + result = parse_temporal_expression("today", _ref()) + assert result == _ref() + + def test_last_week(self): + result = parse_temporal_expression("last week", _ref()) + assert result == datetime(2024, 6, 8, 12, 0, 0, tzinfo=timezone.utc) + + def test_3_days_ago(self): + result = parse_temporal_expression("3 days ago", _ref()) + assert result == datetime(2024, 6, 12, 12, 0, 0, tzinfo=timezone.utc) + + def test_2_months_from_now(self): + result = parse_temporal_expression("2 months from now", _ref()) + assert result == datetime(2024, 8, 15, 12, 0, 0, tzinfo=timezone.utc) + + def test_iso_8601(self): + result = parse_temporal_expression("2024-01-15", _ref()) + assert result is not None + assert result.year == 2024 + assert result.month == 1 + assert result.day == 15 + + def test_natural_date(self): + result = parse_temporal_expression("January 2024", _ref()) + assert result is not None + assert result.year == 2024 + assert result.month == 1 + + def test_garbage_returns_none(self): + result = parse_temporal_expression("not a 
date at all xyz", _ref()) + assert result is None + + def test_last_monday(self): + # 2024-06-15 is a Saturday (weekday=5), last Monday = 2024-06-10 + result = parse_temporal_expression("last monday", _ref()) + assert result is not None + assert result.weekday() == 0 # Monday + assert result < _ref() + + def test_next_friday(self): + # 2024-06-15 is a Saturday (weekday=5), next Friday = 2024-06-21 + result = parse_temporal_expression("next friday", _ref()) + assert result is not None + assert result.weekday() == 4 # Friday + assert result > _ref() + + +# --------------------------------------------------------------------------- +# backfill_temporal_fields +# --------------------------------------------------------------------------- + + +class TestBackfillTemporalFields: + def test_since_x(self): + valid_at, invalid_at = backfill_temporal_fields( + "since January 2024", + None, + None, + _ref(), + ) + assert valid_at is not None + assert valid_at.year == 2024 + assert valid_at.month == 1 + assert invalid_at is None + + def test_until_x(self): + valid_at, invalid_at = backfill_temporal_fields( + "until December 2024", + None, + None, + _ref(), + ) + assert valid_at is None + assert invalid_at is not None + assert invalid_at.year == 2024 + assert invalid_at.month == 12 + + def test_from_x_to_y(self): + valid_at, invalid_at = backfill_temporal_fields( + "from January 2024 to June 2024", + None, + None, + _ref(), + ) + assert valid_at is not None + assert valid_at.month == 1 + assert invalid_at is not None + assert invalid_at.month == 6 + + def test_preserves_existing_valid_at(self): + existing = datetime(2023, 1, 1, tzinfo=timezone.utc) + valid_at, invalid_at = backfill_temporal_fields( + "since March 2024", + existing, + None, + _ref(), + ) + assert valid_at == existing # preserved, not overwritten + + def test_preserves_existing_invalid_at(self): + existing = datetime(2025, 12, 31, tzinfo=timezone.utc) + valid_at, invalid_at = backfill_temporal_fields( + "until 
March 2024", + None, + existing, + _ref(), + ) + assert invalid_at == existing # preserved + + def test_none_temporal_info_noop(self): + valid_at, invalid_at = backfill_temporal_fields(None, None, None, _ref()) + assert valid_at is None + assert invalid_at is None + + def test_empty_string_noop(self): + valid_at, invalid_at = backfill_temporal_fields("", None, None, _ref()) + assert valid_at is None + assert invalid_at is None + + def test_unparseable_temporal_info(self): + valid_at, invalid_at = backfill_temporal_fields( + "some random text", + None, + None, + _ref(), + ) + # Should not crash, fields stay None + assert valid_at is None + assert invalid_at is None diff --git a/uv.lock b/uv.lock index 18322ed..ecbc5ad 100644 --- a/uv.lock +++ b/uv.lock @@ -2321,6 +2321,7 @@ dependencies = [ { name = "openai" }, { name = "pydantic" }, { name = "pydantic-ai" }, + { name = "python-dateutil" }, { name = "sqlite-vec" }, { name = "textual" }, ] @@ -2357,6 +2358,7 @@ requires-dist = [ { name = "openai", specifier = ">=2.15.0" }, { name = "pydantic", specifier = ">=2.12.5" }, { name = "pydantic-ai", specifier = ">=1.43.0" }, + { name = "python-dateutil", specifier = ">=2.9.0" }, { name = "sqlite-vec", specifier = ">=0.1.6" }, { name = "textual", specifier = ">=3.5.0" }, ] From 09de2b3ab746b05c8fc597613e19edd78989b149 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 01:30:14 +0100 Subject: [PATCH 02/10] fix(extraction): address PR #13 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use word-boundary regex (^User\b) instead of startswith("User") - Expand _ASSISTANT_SOURCE_RE to catch told/instructed/encouraged/shown/given - Use removesuffix("s") instead of rstrip("s") for unit normalization - Add comment on intentional .match() anchoring in temporal parser - Make _FIRST_PERSON_RE case-insensitive for my/me/we/our (not I) - Compute preview lazily (after confidence gate) - Use placeholder syntax in 
ATOMIZATION_RULES date examples - Add tests: same-weekday edge case, backfill with relative date, assistant-sourced "told" variant - Fix test fixtures for word-boundary validation (User1 → User) 182 tests passing, lint and typecheck clean. --- src/memv/memory/_pipeline.py | 25 ++++++++++++++----------- src/memv/processing/prompts.py | 2 +- src/memv/processing/temporal.py | 3 ++- tests/test_atomization.py | 20 +++++++++++++++++++- tests/test_memory_e2e.py | 16 ++++++++-------- tests/test_temporal.py | 20 ++++++++++++++++++++ 6 files changed, 64 insertions(+), 22 deletions(-) diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index 8b3482a..e2bcbbf 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -20,8 +20,12 @@ logger = logging.getLogger(__name__) # Compiled regexes for validation checks -_FIRST_PERSON_RE = re.compile(r"\b(I|my|me|we|our)\b") -_ASSISTANT_SOURCE_RE = re.compile(r"\b(?:was|were)\s+(?:advised|suggested|recommended)\b", re.IGNORECASE) +_THIRD_PERSON_RE = re.compile(r"^User\b") +_FIRST_PERSON_RE = re.compile(r"\b(I|[Mm][Yy]|[Mm][Ee]|[Ww][Ee]|[Oo][Uu][Rr])\b") +_ASSISTANT_SOURCE_RE = re.compile( + r"\b(?:was|were)\s+(?:advised|suggested|recommended|told|instructed|encouraged|shown|given)\b", + re.IGNORECASE, +) class Pipeline: @@ -214,27 +218,26 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int: def _validate_extraction(self, item: ExtractedKnowledge) -> bool: """Filter extractions that are low-confidence or not self-contained.""" - stmt = item.statement - preview = stmt[:60] - if item.confidence < 0.7: - logger.debug("Rejected (low confidence %.2f): %s", item.confidence, preview) + logger.debug("Rejected (low confidence %.2f): %s", item.confidence, item.statement[:60]) return False - if not stmt.startswith("User"): - logger.debug("Rejected (not third-person): %s", preview) + stmt = item.statement + + if not _THIRD_PERSON_RE.match(stmt): + logger.debug("Rejected (not 
third-person): %s", stmt[:60]) return False if _FIRST_PERSON_RE.search(stmt): - logger.debug("Rejected (first-person pronoun): %s", preview) + logger.debug("Rejected (first-person pronoun): %s", stmt[:60]) return False if contains_relative_time(stmt): - logger.debug("Rejected (unresolved relative time): %s", preview) + logger.debug("Rejected (unresolved relative time): %s", stmt[:60]) return False if _ASSISTANT_SOURCE_RE.search(stmt): - logger.debug("Rejected (assistant-sourced): %s", preview) + logger.debug("Rejected (assistant-sourced): %s", stmt[:60]) return False return True diff --git a/src/memv/processing/prompts.py b/src/memv/processing/prompts.py index 7351315..a02ecc2 100644 --- a/src/memv/processing/prompts.py +++ b/src/memv/processing/prompts.py @@ -95,7 +95,7 @@ - First person: "I", "my", "me", "we", "our" **REQUIRE in output statements:** -- Absolute dates when temporal info exists: "on 2024-06-15", not "yesterday" +- Absolute dates when temporal info exists: "on [resolved date]", not "yesterday" - Specific names: "User's React project at Vstorm", not "the project" - Third person with "User" as subject: "User prefers Python", not "I prefer Python" diff --git a/src/memv/processing/temporal.py b/src/memv/processing/temporal.py index d2099bb..cab036b 100644 --- a/src/memv/processing/temporal.py +++ b/src/memv/processing/temporal.py @@ -84,10 +84,11 @@ def parse_temporal_expression(text: str, reference: datetime) -> datetime | None return reference + relativedelta(days=days_diff) # "N days/weeks/months/years ago/from now" + # .match() is intentionally start-anchored — input is already stripped/normalized match = _N_UNITS_PATTERN.match(normalized) if match: n = int(match.group(1)) - unit = match.group(2).rstrip("s") # normalize "days" -> "day" + unit = match.group(2).removesuffix("s") # normalize "days" -> "day" direction = -1 if match.group(3).lower() == "ago" else 1 unit_map = {"day": "days", "week": "weeks", "month": "months", "year": "years"} delta = 
relativedelta(**{unit_map[unit]: n * direction}) diff --git a/tests/test_atomization.py b/tests/test_atomization.py index 77d3164..b7e4547 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -99,7 +99,7 @@ async def test_rejects_relative_time(tmp_path): async def test_rejects_assistant_sourced(tmp_path): - """Statement with 'was advised/suggested/recommended' gets filtered.""" + """Statement with 'was advised/suggested/recommended/told/instructed' gets filtered.""" llm = MockLLM() embedder = MockEmbedder() @@ -116,6 +116,24 @@ async def test_rejects_assistant_sourced(tmp_path): assert count == 0 +async def test_rejects_assistant_sourced_told(tmp_path): + """'was told to' variant also gets filtered.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="User was told to migrate to Postgres", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "hi", "Migrate to Postgres!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + async def test_accepts_clean_statement(tmp_path): """Properly atomized statement passes all checks.""" llm = MockLLM() diff --git a/tests/test_memory_e2e.py b/tests/test_memory_e2e.py index 3d0d9d4..a8ad073 100644 --- a/tests/test_memory_e2e.py +++ b/tests/test_memory_e2e.py @@ -123,8 +123,8 @@ async def test_user_isolation_e2e(tmp_path): llm.set_responses( "generate_structured", [ - _extraction(["User1 likes cats"]), - _extraction(["User2 likes dogs"]), + _extraction(["User likes cats"]), + _extraction(["User likes dogs"]), ], ) @@ -135,15 +135,15 @@ async def test_user_isolation_e2e(tmp_path): await memory.add_exchange("user2", "I like dogs", "Cool!", timestamp=_ts()) await memory.process("user2") - r1 = await memory.retrieve("likes", user_id="user1") - r2 = 
await memory.retrieve("likes", user_id="user2") + r1 = await memory.retrieve("cats", user_id="user1") + r2 = await memory.retrieve("dogs", user_id="user2") s1 = [k.statement for k in r1.retrieved_knowledge] s2 = [k.statement for k in r2.retrieved_knowledge] - assert "User1 likes cats" in s1 - assert "User2 likes dogs" not in s1 - assert "User2 likes dogs" in s2 - assert "User1 likes cats" not in s2 + assert "User likes cats" in s1 + assert "User likes dogs" not in s1 + assert "User likes dogs" in s2 + assert "User likes cats" not in s2 async def test_clear_user(tmp_path): diff --git a/tests/test_temporal.py b/tests/test_temporal.py index 5783aaa..29115a1 100644 --- a/tests/test_temporal.py +++ b/tests/test_temporal.py @@ -120,6 +120,14 @@ def test_next_friday(self): assert result.weekday() == 4 # Friday assert result > _ref() + def test_next_monday_when_ref_is_monday(self): + # 2024-06-10 is a Monday — "next monday" should return 2024-06-17, not same day + ref_monday = _ref(year=2024, month=6, day=10) + result = parse_temporal_expression("next monday", ref_monday) + assert result is not None + assert result.weekday() == 0 # Monday + assert result > ref_monday # must be future, not same day + # --------------------------------------------------------------------------- # backfill_temporal_fields @@ -203,3 +211,15 @@ def test_unparseable_temporal_info(self): # Should not crash, fields stay None assert valid_at is None assert invalid_at is None + + def test_since_relative_date(self): + # "since yesterday" should resolve relative to reference + valid_at, invalid_at = backfill_temporal_fields( + "since yesterday", + None, + None, + _ref(), + ) + assert valid_at is not None + assert valid_at == datetime(2024, 6, 14, 12, 0, 0, tzinfo=timezone.utc) + assert invalid_at is None From ee0408eac20c165fa63125ad69159549e5c3bb20 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 01:43:02 +0100 Subject: [PATCH 03/10] fix(extraction): handle I/O false positive and 
document scope - Add negative lookahead/lookbehind for "/" in _FIRST_PERSON_RE to avoid matching "I" in "I/O" - Document ^User\b scope constraint in _validate_extraction docstring - Add control flow comment in backfill_temporal_fields --- src/memv/memory/_pipeline.py | 9 +++++++-- src/memv/processing/temporal.py | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index e2bcbbf..d15a0b1 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -21,7 +21,7 @@ # Compiled regexes for validation checks _THIRD_PERSON_RE = re.compile(r"^User\b") -_FIRST_PERSON_RE = re.compile(r"\b(I|[Mm][Yy]|[Mm][Ee]|[Ww][Ee]|[Oo][Uu][Rr])\b") +_FIRST_PERSON_RE = re.compile(r"(? int: return stored_count def _validate_extraction(self, item: ExtractedKnowledge) -> bool: - """Filter extractions that are low-confidence or not self-contained.""" + """Filter extractions that are low-confidence or not self-contained. + + Scope: only User-subject statements pass. Third-party facts ("Bob prefers + Postgres") are intentionally dropped — the knowledge base stores facts + about the user, not about people mentioned in conversation. 
+ """ if item.confidence < 0.7: logger.debug("Rejected (low confidence %.2f): %s", item.confidence, item.statement[:60]) return False diff --git a/src/memv/processing/temporal.py b/src/memv/processing/temporal.py index cab036b..806e27b 100644 --- a/src/memv/processing/temporal.py +++ b/src/memv/processing/temporal.py @@ -128,6 +128,8 @@ def backfill_temporal_fields( invalid_at = parsed.replace(tzinfo=timezone.utc) if parsed.tzinfo is None else parsed return valid_at, invalid_at + # "from X to Y" handled above — remaining checks are single-endpoint patterns + # "since X" → valid_at if valid_at is None: since = _SINCE_PATTERN.search(temporal_info) From ea20e2c53140ab8a005aa00a43e1a671d3f24604 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 18:00:27 +0100 Subject: [PATCH 04/10] refactor(extraction): drop assistant-source regex filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Regex can't distinguish semantic intent — "User was given a promotion" (valid) vs "User was told to use Docker" (assistant-sourced). Prompt- level EXCLUSIONS already handle this correctly at the right layer. --- src/memv/memory/_pipeline.py | 8 -------- tests/test_atomization.py | 36 ------------------------------------ 2 files changed, 44 deletions(-) diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index d15a0b1..c631cf9 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -22,10 +22,6 @@ # Compiled regexes for validation checks _THIRD_PERSON_RE = re.compile(r"^User\b") _FIRST_PERSON_RE = re.compile(r"(? 
bool: logger.debug("Rejected (unresolved relative time): %s", stmt[:60]) return False - if _ASSISTANT_SOURCE_RE.search(stmt): - logger.debug("Rejected (assistant-sourced): %s", stmt[:60]) - return False - return True async def _is_duplicate_knowledge(self, embedding: list[float], user_id: str) -> tuple[bool, float]: diff --git a/tests/test_atomization.py b/tests/test_atomization.py index b7e4547..8805875 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -98,42 +98,6 @@ async def test_rejects_relative_time(tmp_path): assert count == 0 -async def test_rejects_assistant_sourced(tmp_path): - """Statement with 'was advised/suggested/recommended/told/instructed' gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User was advised to use Docker", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "hi", "Use Docker!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def test_rejects_assistant_sourced_told(tmp_path): - """'was told to' variant also gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User was told to migrate to Postgres", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "hi", "Migrate to Postgres!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - async def test_accepts_clean_statement(tmp_path): """Properly atomized statement passes all checks.""" llm = MockLLM() From 5f46be14b74d89615021bbce9238adea148b7810 Mon Sep 17 00:00:00 2001 From: bartosz 
roguski Date: Sat, 21 Feb 2026 18:34:30 +0100 Subject: [PATCH 05/10] fix(extraction): address PR #13 review round 5 Add missing _ASSISTANT_SOURCE_RE code-level check with infinitive requirement ("was advised to") to avoid false positives on legitimate passive constructions. Fix _THIRD_PERSON_RE to accept lowercase "user". Fix began? regex typo, remove later/earlier false positives, remove bare "to" from _UNTIL_PATTERN, extract weekday constant. --- src/memv/memory/_pipeline.py | 11 ++++++- src/memv/processing/temporal.py | 11 ++++--- tests/test_atomization.py | 54 +++++++++++++++++++++++++++++++++ tests/test_temporal.py | 28 +++++++++++++++++ 4 files changed, 98 insertions(+), 6 deletions(-) diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index c631cf9..dfc189b 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -20,8 +20,13 @@ logger = logging.getLogger(__name__) # Compiled regexes for validation checks -_THIRD_PERSON_RE = re.compile(r"^User\b") +_THIRD_PERSON_RE = re.compile(r"^[Uu]ser\b") _FIRST_PERSON_RE = re.compile(r"(? 
bool: logger.debug("Rejected (unresolved relative time): %s", stmt[:60]) return False + if _ASSISTANT_SOURCE_RE.search(stmt): + logger.debug("Rejected (assistant-sourced): %s", stmt[:60]) + return False + return True async def _is_duplicate_knowledge(self, embedding: list[float], user_id: str) -> tuple[bool, float]: diff --git a/src/memv/processing/temporal.py b/src/memv/processing/temporal.py index 806e27b..9e3f93b 100644 --- a/src/memv/processing/temporal.py +++ b/src/memv/processing/temporal.py @@ -17,7 +17,7 @@ r"\b(" r"yesterday|today|tomorrow" r"|(?:last|next|this)\s+(?:week|month|year|monday|tuesday|wednesday|thursday|friday|saturday|sunday)" - r"|recently|soon|later|earlier" + r"|recently|soon" r"|\d+\s+(?:days?|weeks?|months?|years?)\s+(?:ago|from\s+now)" r")\b", re.IGNORECASE, @@ -40,14 +40,16 @@ } # N units ago / from now +_WEEKDAY_NAMES = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"] + _N_UNITS_PATTERN = re.compile( r"(\d+)\s+(days?|weeks?|months?|years?)\s+(ago|from\s+now)", re.IGNORECASE, ) # Backfill patterns for temporal_info strings -_SINCE_PATTERN = re.compile(r"(?:since|from|starting|began?)\s+(.+?)(?:\s*$|\s*(?:to|until|through)\s+)", re.IGNORECASE) -_UNTIL_PATTERN = re.compile(r"(?:until|to|through|ending|ended?)\s+(.+?)$", re.IGNORECASE) +_SINCE_PATTERN = re.compile(r"(?:since|from|starting|began)\s+(.+?)(?:\s*$|\s*(?:to|until|through)\s+)", re.IGNORECASE) +_UNTIL_PATTERN = re.compile(r"(?:until|through|ending|ended?)\s+(.+?)$", re.IGNORECASE) _FROM_TO_PATTERN = re.compile(r"(?:from|since)\s+(.+?)\s+(?:to|until|through)\s+(.+?)$", re.IGNORECASE) @@ -71,9 +73,8 @@ def parse_temporal_expression(text: str, reference: datetime) -> datetime | None # "last/next " patterns for prefix, direction in [("last", -1), ("next", 1)]: - for day_name in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]: + for target_weekday, day_name in enumerate(_WEEKDAY_NAMES): if normalized == f"{prefix} 
{day_name}": - target_weekday = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"].index(day_name) days_diff = (reference.weekday() - target_weekday) % 7 if direction == -1: days_diff = days_diff or 7 diff --git a/tests/test_atomization.py b/tests/test_atomization.py index 8805875..d462db9 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -120,6 +120,60 @@ async def test_accepts_clean_statement(tmp_path): assert count == 1 +async def test_rejects_assistant_sourced(tmp_path): + """Statement with 'was advised to' pattern gets filtered.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "The assistant told me to try Emacs", "Sure!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 0 + + +async def test_accepts_passive_without_infinitive(tmp_path): + """'was given a promotion' is NOT assistant-sourced — should pass.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + "generate_structured", + [_extraction([ExtractedKnowledge(statement="User was given a promotion", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I was given a promotion", "Congrats!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 1 + + +async def test_accepts_lowercase_user(tmp_path): + """LLM may output lowercase 'user' — should still pass third-person check.""" + llm = MockLLM() + embedder = MockEmbedder() + + llm.set_responses("generate", [_episode_json()]) + llm.set_responses( + 
"generate_structured", + [_extraction([ExtractedKnowledge(statement="user prefers Python", knowledge_type="new", confidence=0.9)])], + ) + + memory = _make_memory(tmp_path, llm, embedder) + async with memory: + await memory.add_exchange("user1", "I prefer Python", "Nice!", timestamp=_ts()) + count = await memory.process("user1") + assert count == 1 + + async def test_confidence_filter_unchanged(tmp_path): """Low-confidence items still rejected (regression check).""" llm = MockLLM() diff --git a/tests/test_temporal.py b/tests/test_temporal.py index 29115a1..7380f92 100644 --- a/tests/test_temporal.py +++ b/tests/test_temporal.py @@ -58,6 +58,12 @@ def test_iso_date_not_flagged(self): def test_case_insensitive(self): assert contains_relative_time("User started YESTERDAY") + def test_earlier_as_adjective_not_flagged(self): + assert not contains_relative_time("User prefers the earlier version of Python") + + def test_later_as_adjective_not_flagged(self): + assert not contains_relative_time("User upgraded to a later release of Node") + # --------------------------------------------------------------------------- # parse_temporal_expression @@ -223,3 +229,25 @@ def test_since_relative_date(self): assert valid_at is not None assert valid_at == datetime(2024, 6, 14, 12, 0, 0, tzinfo=timezone.utc) assert invalid_at is None + + def test_bare_to_not_matched_as_until(self): + # "related to data pipelines" should not trigger _UNTIL_PATTERN + valid_at, invalid_at = backfill_temporal_fields( + "related to data pipelines", + None, + None, + _ref(), + ) + assert valid_at is None + assert invalid_at is None + + def test_began_prefix(self): + valid_at, invalid_at = backfill_temporal_fields( + "began January 2024", + None, + None, + _ref(), + ) + assert valid_at is not None + assert valid_at.year == 2024 + assert valid_at.month == 1 From a572e2b5949439d642df6c2cd312b3bb7cb40c2d Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 19:18:06 +0100 Subject: [PATCH 06/10] 
fix(test): test _FIRST_PERSON_RE instead of _THIRD_PERSON_RE Change test_rejects_first_person to use a statement that passes the third-person check so _FIRST_PERSON_RE is actually exercised. --- tests/test_atomization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_atomization.py b/tests/test_atomization.py index d462db9..4627a64 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -52,7 +52,7 @@ async def test_rejects_first_person(tmp_path): llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="I like Python", knowledge_type="new", confidence=0.9)])], + [_extraction([ExtractedKnowledge(statement="User mentioned that my team uses React", knowledge_type="new", confidence=0.9)])], ) memory = _make_memory(tmp_path, llm, embedder) From 37547d5aa322e06773448ec37dc5ad72bb4ced65 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 21:31:50 +0100 Subject: [PATCH 07/10] refactor(extraction): remove regex filters, use prompt-level enforcement Regex filters (_THIRD_PERSON_RE, _FIRST_PERSON_RE, _ASSISTANT_SOURCE_RE, contains_relative_time) only worked for English. Extraction quality is now enforced entirely at the prompt level via ATOMIZATION_RULES and EXCLUSIONS. _validate_extraction retains confidence >= 0.7 as the sole code-level gate. Prompts updated with nuanced source rules: extract assistant-communicated facts (dates, appointments) while rejecting suggestions/hypotheticals. 
--- CLAUDE.md | 6 ++ src/memv/memory/_pipeline.py | 37 +---------- src/memv/processing/prompts.py | 27 +++++--- tests/test_atomization.py | 110 ++++++++------------------------- tests/test_memory_e2e.py | 11 ++-- 5 files changed, 55 insertions(+), 136 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 8b8a0c5..4298d19 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -104,6 +104,12 @@ async with memory: **User isolation is mandatory**: All retrieval and storage operations require and filter by `user_id`. No cross-user queries possible by design. +**Extraction validation (`_pipeline.py`)**: `_validate_extraction` enforces a single check: confidence >= 0.7. All regex filters (third-person, first-person pronouns, relative time, assistant-sourced patterns) were removed because they only work for English. Quality enforcement is handled entirely at the prompt level via ATOMIZATION_RULES and EXCLUSIONS in `prompts.py`. + +**`temporal.py` regex gotchas**: Avoid broad word matches in `_RELATIVE_PATTERNS` (e.g. "later"/"earlier" match adjective usage). `_UNTIL_PATTERN` must not include bare "to" (matches non-temporal uses). `_SINCE_PATTERN` uses `began` not `began?` (the `?` quantifies the preceding `n`, making the trailing "n" optional so it would also match "bega"). + +**PR reviews**: The `claude` bot posts automated code reviews on PRs. Check with `gh pr view --comments` and address feedback before merging.
+ ## Code Style - Line length: 135 characters diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index dfc189b..210af1f 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import re from typing import TYPE_CHECKING from memv.models import ( @@ -12,22 +11,13 @@ Message, SemanticKnowledge, ) -from memv.processing.temporal import backfill_temporal_fields, contains_relative_time +from memv.processing.temporal import backfill_temporal_fields if TYPE_CHECKING: from memv.memory._lifecycle import LifecycleManager logger = logging.getLogger(__name__) -# Compiled regexes for validation checks -_THIRD_PERSON_RE = re.compile(r"^[Uu]ser\b") -_FIRST_PERSON_RE = re.compile(r"(? int: return stored_count def _validate_extraction(self, item: ExtractedKnowledge) -> bool: - """Filter extractions that are low-confidence or not self-contained. - - Scope: only User-subject statements pass. Third-party facts ("Bob prefers - Postgres") are intentionally dropped — the knowledge base stores facts - about the user, not about people mentioned in conversation. 
- """ + """Filter low-confidence extractions.""" if item.confidence < 0.7: logger.debug("Rejected (low confidence %.2f): %s", item.confidence, item.statement[:60]) return False - stmt = item.statement - - if not _THIRD_PERSON_RE.match(stmt): - logger.debug("Rejected (not third-person): %s", stmt[:60]) - return False - - if _FIRST_PERSON_RE.search(stmt): - logger.debug("Rejected (first-person pronoun): %s", stmt[:60]) - return False - - if contains_relative_time(stmt): - logger.debug("Rejected (unresolved relative time): %s", stmt[:60]) - return False - - if _ASSISTANT_SOURCE_RE.search(stmt): - logger.debug("Rejected (assistant-sourced): %s", stmt[:60]) - return False - return True async def _is_duplicate_knowledge(self, embedding: list[float], user_id: str) -> tuple[bool, float]: diff --git a/src/memv/processing/prompts.py b/src/memv/processing/prompts.py index a02ecc2..3110e31 100644 --- a/src/memv/processing/prompts.py +++ b/src/memv/processing/prompts.py @@ -68,9 +68,14 @@ - Conversation events ("User asked about X", "User requested Y") - extract the FACT, not the action - Assistant-sourced knowledge ("was advised to", "was suggested to", "was recommended to") -**CRITICAL SOURCE RULE - READ CAREFULLY:** -- ONLY extract facts the USER explicitly stated in their own messages +**CRITICAL SOURCE RULES - READ CAREFULLY:** +- Extract facts the user explicitly stated in their own messages +- DO extract factual information communicated by the assistant (dates, appointments, confirmations, scheduled events) - NEVER extract assistant suggestions, recommendations, or code examples as user facts +- Treat assistant suggestions about user intent as speculative — never encode them as facts +- NEVER attribute intent/action/statement to user if it originated in a hypothetical/suggestion/conditional from the assistant +- Ignore assistant-led topics unless user acts on them +- Attribute preferences only to explicit user claims — never to questions or reactions - If assistant says 
"use Python" and user doesn't respond with "yes" or confirm - DO NOT extract "User uses Python" - If assistant provides code in language X but user says "I use Y" - extract Y, not X - The user ASKING about something is NOT the same as the user USING it @@ -97,7 +102,7 @@ **REQUIRE in output statements:** - Absolute dates when temporal info exists: "on [resolved date]", not "yesterday" - Specific names: "User's React project at Vstorm", not "the project" -- Third person with "User" as subject: "User prefers Python", not "I prefer Python" +- Third person: "User prefers Python", not "I prefer Python" **Coreference resolution — resolve BEFORE writing the statement:** - "my kids" → "User's children" @@ -237,10 +242,12 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic return f"""Extract HIGH-VALUE, PERSISTENT knowledge from this conversation. {timestamp_section} -**CRITICAL RULE: Extract ONLY from lines starting with ">>> USER:"** -- These are the user's actual words - the ONLY source of truth -- IGNORE all ASSISTANT lines completely -- Do NOT infer, expand, or modify what the user said +**SOURCE RULES:** +- Lines starting with ">>> USER:" are the user's actual words — the primary source of truth +- DO extract factual information communicated by the assistant (dates, appointments, confirmations) +- Do NOT extract assistant suggestions, recommendations, or hypotheticals as user facts +- Treat assistant suggestions about user intent as speculative — never encode them as facts +- Attribute preferences only to explicit user claims — never to assistant questions or reactions - Preserve the user's exact phrasing and technical terms @@ -318,8 +325,10 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere return f"""Extract valuable knowledge by comparing actual conversation with predicted content. 
{timestamp_section} -**CRITICAL: Extract ONLY from lines starting with ">>> USER:" - these are the user's actual words.** -**IGNORE all ASSISTANT lines - those are suggestions, not user facts.** +**SOURCE RULES:** +- Lines starting with ">>> USER:" are the user's actual words — the primary source of truth. +- DO extract factual info communicated by the assistant (dates, appointments, confirmations). +- Do NOT extract assistant suggestions, recommendations, or hypotheticals as user facts. {prediction} diff --git a/tests/test_atomization.py b/tests/test_atomization.py index 4627a64..dc15a90 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -40,66 +40,30 @@ def _make_memory(tmp_path, llm, embedder): # --------------------------------------------------------------------------- -# Validation filter: unit-level via e2e pipeline +# Confidence threshold # --------------------------------------------------------------------------- -async def test_rejects_first_person(tmp_path): - """Statement with 'I' / 'my' gets filtered out.""" +async def test_rejects_low_confidence(tmp_path): + """Low-confidence items rejected.""" llm = MockLLM() embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="User mentioned that my team uses React", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def test_rejects_non_third_person(tmp_path): - """Statement not starting with 'User' gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="He likes Python", knowledge_type="new", confidence=0.9)])], - ) - 
- memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def test_rejects_relative_time(tmp_path): - """Statement with unresolved 'yesterday' gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User started using FastAPI yesterday", knowledge_type="new", confidence=0.9)])], + [_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])], ) memory = _make_memory(tmp_path, llm, embedder) async with memory: - await memory.add_exchange("user1", "I started using FastAPI yesterday", "Nice!", timestamp=_ts()) + await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts()) count = await memory.process("user1") assert count == 0 -async def test_accepts_clean_statement(tmp_path): - """Properly atomized statement passes all checks.""" +async def test_accepts_high_confidence(tmp_path): + """Statement with confidence >= 0.7 passes.""" llm = MockLLM() embedder = MockEmbedder() @@ -120,76 +84,52 @@ async def test_accepts_clean_statement(tmp_path): assert count == 1 -async def test_rejects_assistant_sourced(tmp_path): - """Statement with 'was advised to' pattern gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "The assistant told me to try Emacs", "Sure!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def 
test_accepts_passive_without_infinitive(tmp_path): - """'was given a promotion' is NOT assistant-sourced — should pass.""" +async def test_accepts_non_user_subject(tmp_path): + """Statements not starting with 'User' are accepted.""" llm = MockLLM() embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="User was given a promotion", knowledge_type="new", confidence=0.9)])], + [_extraction([ExtractedKnowledge(statement="The appointment was moved to March 15", knowledge_type="new", confidence=0.9)])], ) memory = _make_memory(tmp_path, llm, embedder) async with memory: - await memory.add_exchange("user1", "I was given a promotion", "Congrats!", timestamp=_ts()) + await memory.add_exchange("user1", "When is my appointment?", "It was moved to March 15", timestamp=_ts()) count = await memory.process("user1") assert count == 1 -async def test_accepts_lowercase_user(tmp_path): - """LLM may output lowercase 'user' — should still pass third-person check.""" +async def test_confidence_boundary(tmp_path): + """Confidence exactly at 0.7 passes, below does not.""" llm = MockLLM() embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="user prefers Python", knowledge_type="new", confidence=0.9)])], + [ + _extraction( + [ + ExtractedKnowledge(statement="User works at Vstorm", knowledge_type="new", confidence=0.7), + ExtractedKnowledge(statement="User likes tea", knowledge_type="new", confidence=0.69), + ] + ) + ], ) memory = _make_memory(tmp_path, llm, embedder) async with memory: - await memory.add_exchange("user1", "I prefer Python", "Nice!", timestamp=_ts()) + await memory.add_exchange("user1", "I work at Vstorm and like tea", "Nice!", timestamp=_ts()) count = await memory.process("user1") assert count == 1 -async def test_confidence_filter_unchanged(tmp_path): - 
"""Low-confidence items still rejected (regression check).""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 +# --------------------------------------------------------------------------- +# Temporal backfill +# --------------------------------------------------------------------------- async def test_temporal_backfill_in_pipeline(tmp_path): diff --git a/tests/test_memory_e2e.py b/tests/test_memory_e2e.py index a8ad073..ed699cb 100644 --- a/tests/test_memory_e2e.py +++ b/tests/test_memory_e2e.py @@ -229,8 +229,8 @@ async def test_flush_forces_processing(tmp_path): assert count == 1 -async def test_atomization_filters_bad_statements(tmp_path): - """LLM returns mix of good and bad statements — only atomized ones survive.""" +async def test_confidence_filters_low_quality_statements(tmp_path): + """Only statements with confidence >= 0.7 survive the pipeline filter.""" llm = MockLLM() embedder = MockEmbedder() @@ -241,10 +241,8 @@ async def test_atomization_filters_bad_statements(tmp_path): ExtractionResponse( extracted=[ ExtractedKnowledge(statement="User prefers Vim", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="I like Vim", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="User started yesterday", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="He uses Neovim", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9), + ExtractedKnowledge(statement="User likes Emacs", knowledge_type="new", 
confidence=0.5), + ExtractedKnowledge(statement="User uses Neovim", knowledge_type="new", confidence=0.3), ] ) ], @@ -254,7 +252,6 @@ async def test_atomization_filters_bad_statements(tmp_path): async with memory: await memory.add_exchange("user1", "I prefer Vim", "Nice!", timestamp=_ts()) count = await memory.process("user1") - # Only "User prefers Vim" should survive all filters assert count == 1 result = await memory.retrieve("Vim", user_id="user1") From afc38b9a96571f1513096f906ea1445364ba6929 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 21:47:30 +0100 Subject: [PATCH 08/10] fix(extraction): address PR #14 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Change "will be REJECTED" to "are INVALID — resolve them or omit" (no code-level enforcement, prompt-only instruction) - Fix output format: "A concrete, self-contained fact from the conversation" instead of "USER explicitly stated" - Merge assistant-sourced exclusion into SOURCE RULES to remove tension between EXCLUSIONS list and source rules - Fix test statement: use atomized "User's dentist appointment was moved to March 15, 2024" instead of ambiguous "The appointment" --- src/memv/processing/prompts.py | 10 +++++----- tests/test_atomization.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/memv/processing/prompts.py b/src/memv/processing/prompts.py index 3110e31..68194fe 100644 --- a/src/memv/processing/prompts.py +++ b/src/memv/processing/prompts.py @@ -66,12 +66,12 @@ - Obvious or common knowledge - Speculative or uncertain claims - Conversation events ("User asked about X", "User requested Y") - extract the FACT, not the action -- Assistant-sourced knowledge ("was advised to", "was suggested to", "was recommended to") **CRITICAL SOURCE RULES - READ CAREFULLY:** - Extract facts the user explicitly stated in their own messages - DO extract factual information communicated by the 
assistant (dates, appointments, confirmations, scheduled events) -- NEVER extract assistant suggestions, recommendations, or code examples as user facts +- NEVER extract assistant suggestions, recommendations, or hypotheticals as user facts + ("was advised to", "was suggested to", "was recommended to") - Treat assistant suggestions about user intent as speculative — never encode them as facts - NEVER attribute intent/action/statement to user if it originated in a hypothetical/suggestion/conditional from the assistant - Ignore assistant-led topics unless user acts on them @@ -236,7 +236,7 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic {reference_timestamp} You MUST resolve ALL relative dates using this timestamp. -Statements with unresolved relative time ("yesterday", "last week") will be REJECTED. +Statements with unresolved relative time ("yesterday", "last week") are INVALID — resolve them or omit. """ return f"""Extract HIGH-VALUE, PERSISTENT knowledge from this conversation. @@ -320,7 +320,7 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere {reference_timestamp} You MUST resolve ALL relative dates using this timestamp. -Statements with unresolved relative time ("yesterday", "last week") will be REJECTED. +Statements with unresolved relative time ("yesterday", "last week") are INVALID — resolve them or omit. """ return f"""Extract valuable knowledge by comparing actual conversation with predicted content. 
@@ -380,7 +380,7 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere ## Output Format For each extracted item, specify: -- statement: A fact the USER explicitly stated (not assistant suggestions) +- statement: A concrete, self-contained fact from the conversation (see SOURCE RULES above) - knowledge_type: "new" if entirely new, "update" if refines existing, "contradiction" if conflicts - temporal_info: Human-readable description if mentioned ("since January 2024", "until next month") - valid_at: ISO 8601 datetime when fact became true, or null if unknown/always true (e.g., "2024-01-01T00:00:00Z") diff --git a/tests/test_atomization.py b/tests/test_atomization.py index dc15a90..125f100 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -92,7 +92,15 @@ async def test_accepts_non_user_subject(tmp_path): llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="The appointment was moved to March 15", knowledge_type="new", confidence=0.9)])], + [ + _extraction( + [ + ExtractedKnowledge( + statement="User's dentist appointment was moved to March 15, 2024", knowledge_type="new", confidence=0.9 + ) + ] + ) + ], ) memory = _make_memory(tmp_path, llm, embedder) @@ -187,11 +195,11 @@ def test_warm_extraction_contains_atomization_rules(self): def test_cold_start_contains_strong_timestamp_wording(self): prompt = cold_start_extraction_prompt("Test", [{"role": "user", "content": "hi"}], "2024-06-15T12:00:00Z") - assert "will be REJECTED" in prompt + assert "are INVALID" in prompt def test_warm_extraction_contains_strong_timestamp_wording(self): prompt = extraction_prompt_with_prediction("prediction", ">>> USER: hi", "2024-06-15T12:00:00Z") - assert "will be REJECTED" in prompt + assert "are INVALID" in prompt def test_atomization_rules_constant_exists(self): assert "Self-Contained Statement Rules" in ATOMIZATION_RULES From 
953044a8ec8cb4f1ab0bece1dcde26d44ba8dd75 Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 21:53:19 +0100 Subject: [PATCH 09/10] fix(test): address PR #14 review round 2 - Rename test_accepts_non_user_subject to test_accepts_assistant_communicated_fact (name/docstring now matches what the test actually validates) - Assert which item survived confidence boundary (Vstorm, not tea) --- tests/test_atomization.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_atomization.py b/tests/test_atomization.py index 125f100..c20f8f1 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -84,8 +84,8 @@ async def test_accepts_high_confidence(tmp_path): assert count == 1 -async def test_accepts_non_user_subject(tmp_path): - """Statements not starting with 'User' are accepted.""" +async def test_accepts_assistant_communicated_fact(tmp_path): + """Facts communicated by the assistant (dates, appointments) are accepted.""" llm = MockLLM() embedder = MockEmbedder() @@ -134,6 +134,9 @@ async def test_confidence_boundary(tmp_path): count = await memory.process("user1") assert count == 1 + result = await memory.retrieve("Vstorm", user_id="user1") + assert result.retrieved_knowledge[0].statement == "User works at Vstorm" + # --------------------------------------------------------------------------- # Temporal backfill From 5ddda7dd775031288d36fb4b2952f4a516d1896e Mon Sep 17 00:00:00 2001 From: bartosz roguski Date: Sat, 21 Feb 2026 22:05:07 +0100 Subject: [PATCH 10/10] fix(extraction): address PR #14 review round 3 - Align cold_start output format with warm extraction (both now say "A concrete, self-contained fact from the conversation") - Simplify backfill condition: temporal_info alone triggers backfill (function is already idempotent on non-None fields) - Add retrieve assertion to test_accepts_assistant_communicated_fact --- src/memv/memory/_pipeline.py | 2 +- src/memv/processing/prompts.py | 2 +- 
tests/test_atomization.py | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index 210af1f..256364d 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -158,7 +158,7 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int: # 6. Backfill temporal fields from temporal_info for item in extracted: - if item.temporal_info and (item.valid_at is None or item.invalid_at is None): + if item.temporal_info: item.valid_at, item.invalid_at = backfill_temporal_fields( item.temporal_info, item.valid_at, diff --git a/src/memv/processing/prompts.py b/src/memv/processing/prompts.py index 68194fe..4a609c4 100644 --- a/src/memv/processing/prompts.py +++ b/src/memv/processing/prompts.py @@ -301,7 +301,7 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic ## Output Format For each extracted item, specify: -- statement: A clean, declarative fact about the user (third-person: "User...", not "I...") +- statement: A concrete, self-contained fact from the conversation (see SOURCE RULES above) - knowledge_type: "new" - temporal_info: Human-readable description if mentioned ("since January 2024", "until next month") - valid_at: ISO 8601 datetime when fact became true, or null if unknown/always true (e.g., "2024-01-01T00:00:00Z") diff --git a/tests/test_atomization.py b/tests/test_atomization.py index c20f8f1..40d933d 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -109,6 +109,9 @@ async def test_accepts_assistant_communicated_fact(tmp_path): count = await memory.process("user1") assert count == 1 + result = await memory.retrieve("dentist appointment", user_id="user1") + assert "March 15, 2024" in result.retrieved_knowledge[0].statement + async def test_confidence_boundary(tmp_path): """Confidence exactly at 0.7 passes, below does not."""