diff --git a/CLAUDE.md b/CLAUDE.md index 8b8a0c5..4298d19 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -104,6 +104,12 @@ async with memory: **User isolation is mandatory**: All retrieval and storage operations require and filter by `user_id`. No cross-user queries possible by design. +**Extraction validation (`_pipeline.py`)**: `_validate_extraction` enforces a single check: confidence >= 0.7. All regex filters (third-person, first-person pronouns, relative time, assistant-sourced patterns) were removed because they only work for English. Quality enforcement is handled entirely at the prompt level via ATOMIZATION_RULES and EXCLUSIONS in `prompts.py`. + +**`temporal.py` regex gotchas**: Avoid broad word matches in `_RELATIVE_PATTERNS` (e.g. "later"/"earlier" match adjective usage). `_UNTIL_PATTERN` must not include bare "to" (matches non-temporal uses). `_SINCE_PATTERN` uses `began` not `began?` (the `?` would make the final "n" optional, so it would also match "bega"). + +**PR reviews**: The `claude` bot posts automated code reviews on PRs. Check with `gh pr view --comments` and address feedback before merging. + ## Code Style - Line length: 135 characters diff --git a/src/memv/memory/_pipeline.py b/src/memv/memory/_pipeline.py index dfc189b..256364d 100644 --- a/src/memv/memory/_pipeline.py +++ b/src/memv/memory/_pipeline.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import re from typing import TYPE_CHECKING from memv.models import ( @@ -12,22 +11,13 @@ Message, SemanticKnowledge, ) -from memv.processing.temporal import backfill_temporal_fields, contains_relative_time +from memv.processing.temporal import backfill_temporal_fields if TYPE_CHECKING: from memv.memory._lifecycle import LifecycleManager logger = logging.getLogger(__name__) -# Compiled regexes for validation checks -_THIRD_PERSON_RE = re.compile(r"^[Uu]ser\b") -_FIRST_PERSON_RE = re.compile(r"(? int: # 6.
Backfill temporal fields from temporal_info for item in extracted: - if item.temporal_info and (item.valid_at is None or item.invalid_at is None): + if item.temporal_info: item.valid_at, item.invalid_at = backfill_temporal_fields( item.temporal_info, item.valid_at, @@ -218,34 +208,11 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int: return stored_count def _validate_extraction(self, item: ExtractedKnowledge) -> bool: - """Filter extractions that are low-confidence or not self-contained. - - Scope: only User-subject statements pass. Third-party facts ("Bob prefers - Postgres") are intentionally dropped — the knowledge base stores facts - about the user, not about people mentioned in conversation. - """ + """Filter low-confidence extractions.""" if item.confidence < 0.7: logger.debug("Rejected (low confidence %.2f): %s", item.confidence, item.statement[:60]) return False - stmt = item.statement - - if not _THIRD_PERSON_RE.match(stmt): - logger.debug("Rejected (not third-person): %s", stmt[:60]) - return False - - if _FIRST_PERSON_RE.search(stmt): - logger.debug("Rejected (first-person pronoun): %s", stmt[:60]) - return False - - if contains_relative_time(stmt): - logger.debug("Rejected (unresolved relative time): %s", stmt[:60]) - return False - - if _ASSISTANT_SOURCE_RE.search(stmt): - logger.debug("Rejected (assistant-sourced): %s", stmt[:60]) - return False - return True async def _is_duplicate_knowledge(self, embedding: list[float], user_id: str) -> tuple[bool, float]: diff --git a/src/memv/processing/prompts.py b/src/memv/processing/prompts.py index a02ecc2..4a609c4 100644 --- a/src/memv/processing/prompts.py +++ b/src/memv/processing/prompts.py @@ -66,11 +66,16 @@ - Obvious or common knowledge - Speculative or uncertain claims - Conversation events ("User asked about X", "User requested Y") - extract the FACT, not the action -- Assistant-sourced knowledge ("was advised to", "was suggested to", "was recommended to") -**CRITICAL 
SOURCE RULE - READ CAREFULLY:** -- ONLY extract facts the USER explicitly stated in their own messages -- NEVER extract assistant suggestions, recommendations, or code examples as user facts +**CRITICAL SOURCE RULES - READ CAREFULLY:** +- Extract facts the user explicitly stated in their own messages +- DO extract factual information communicated by the assistant (dates, appointments, confirmations, scheduled events) +- NEVER extract assistant suggestions, recommendations, or hypotheticals as user facts + ("was advised to", "was suggested to", "was recommended to") +- Treat assistant suggestions about user intent as speculative — never encode them as facts +- NEVER attribute intent/action/statement to user if it originated in a hypothetical/suggestion/conditional from the assistant +- Ignore assistant-led topics unless user acts on them +- Attribute preferences only to explicit user claims — never to questions or reactions - If assistant says "use Python" and user doesn't respond with "yes" or confirm - DO NOT extract "User uses Python" - If assistant provides code in language X but user says "I use Y" - extract Y, not X - The user ASKING about something is NOT the same as the user USING it @@ -97,7 +102,7 @@ **REQUIRE in output statements:** - Absolute dates when temporal info exists: "on [resolved date]", not "yesterday" - Specific names: "User's React project at Vstorm", not "the project" -- Third person with "User" as subject: "User prefers Python", not "I prefer Python" +- Third person: "User prefers Python", not "I prefer Python" **Coreference resolution — resolve BEFORE writing the statement:** - "my kids" → "User's children" @@ -231,16 +236,18 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic {reference_timestamp} You MUST resolve ALL relative dates using this timestamp. -Statements with unresolved relative time ("yesterday", "last week") will be REJECTED. 
+Statements with unresolved relative time ("yesterday", "last week") are INVALID — resolve them or omit. """ return f"""Extract HIGH-VALUE, PERSISTENT knowledge from this conversation. {timestamp_section} -**CRITICAL RULE: Extract ONLY from lines starting with ">>> USER:"** -- These are the user's actual words - the ONLY source of truth -- IGNORE all ASSISTANT lines completely -- Do NOT infer, expand, or modify what the user said +**SOURCE RULES:** +- Lines starting with ">>> USER:" are the user's actual words — the primary source of truth +- DO extract factual information communicated by the assistant (dates, appointments, confirmations) +- Do NOT extract assistant suggestions, recommendations, or hypotheticals as user facts +- Treat assistant suggestions about user intent as speculative — never encode them as facts +- Attribute preferences only to explicit user claims — never to assistant questions or reactions - Preserve the user's exact phrasing and technical terms @@ -294,7 +301,7 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic ## Output Format For each extracted item, specify: -- statement: A clean, declarative fact about the user (third-person: "User...", not "I...") +- statement: A concrete, self-contained fact from the conversation (see SOURCE RULES above) - knowledge_type: "new" - temporal_info: Human-readable description if mentioned ("since January 2024", "until next month") - valid_at: ISO 8601 datetime when fact became true, or null if unknown/always true (e.g., "2024-01-01T00:00:00Z") @@ -313,13 +320,15 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere {reference_timestamp} You MUST resolve ALL relative dates using this timestamp. -Statements with unresolved relative time ("yesterday", "last week") will be REJECTED. +Statements with unresolved relative time ("yesterday", "last week") are INVALID — resolve them or omit. 
""" return f"""Extract valuable knowledge by comparing actual conversation with predicted content. {timestamp_section} -**CRITICAL: Extract ONLY from lines starting with ">>> USER:" - these are the user's actual words.** -**IGNORE all ASSISTANT lines - those are suggestions, not user facts.** +**SOURCE RULES:** +- Lines starting with ">>> USER:" are the user's actual words — the primary source of truth. +- DO extract factual info communicated by the assistant (dates, appointments, confirmations). +- Do NOT extract assistant suggestions, recommendations, or hypotheticals as user facts. {prediction} @@ -371,7 +380,7 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere ## Output Format For each extracted item, specify: -- statement: A fact the USER explicitly stated (not assistant suggestions) +- statement: A concrete, self-contained fact from the conversation (see SOURCE RULES above) - knowledge_type: "new" if entirely new, "update" if refines existing, "contradiction" if conflicts - temporal_info: Human-readable description if mentioned ("since January 2024", "until next month") - valid_at: ISO 8601 datetime when fact became true, or null if unknown/always true (e.g., "2024-01-01T00:00:00Z") diff --git a/tests/test_atomization.py b/tests/test_atomization.py index 4627a64..40d933d 100644 --- a/tests/test_atomization.py +++ b/tests/test_atomization.py @@ -40,66 +40,30 @@ def _make_memory(tmp_path, llm, embedder): # --------------------------------------------------------------------------- -# Validation filter: unit-level via e2e pipeline +# Confidence threshold # --------------------------------------------------------------------------- -async def test_rejects_first_person(tmp_path): - """Statement with 'I' / 'my' gets filtered out.""" +async def test_rejects_low_confidence(tmp_path): + """Low-confidence items rejected.""" llm = MockLLM() embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) llm.set_responses( 
"generate_structured", - [_extraction([ExtractedKnowledge(statement="User mentioned that my team uses React", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def test_rejects_non_third_person(tmp_path): - """Statement not starting with 'User' gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="He likes Python", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def test_rejects_relative_time(tmp_path): - """Statement with unresolved 'yesterday' gets filtered.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User started using FastAPI yesterday", knowledge_type="new", confidence=0.9)])], + [_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])], ) memory = _make_memory(tmp_path, llm, embedder) async with memory: - await memory.add_exchange("user1", "I started using FastAPI yesterday", "Nice!", timestamp=_ts()) + await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts()) count = await memory.process("user1") assert count == 0 -async def test_accepts_clean_statement(tmp_path): - """Properly atomized statement passes all checks.""" +async def test_accepts_high_confidence(tmp_path): + """Statement with confidence >= 0.7 passes.""" llm = MockLLM() embedder = 
MockEmbedder() @@ -120,76 +84,66 @@ async def test_accepts_clean_statement(tmp_path): assert count == 1 -async def test_rejects_assistant_sourced(tmp_path): - """Statement with 'was advised to' pattern gets filtered.""" +async def test_accepts_assistant_communicated_fact(tmp_path): + """Facts communicated by the assistant (dates, appointments) are accepted.""" llm = MockLLM() embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9)])], - ) - - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "The assistant told me to try Emacs", "Sure!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 - - -async def test_accepts_passive_without_infinitive(tmp_path): - """'was given a promotion' is NOT assistant-sourced — should pass.""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User was given a promotion", knowledge_type="new", confidence=0.9)])], + [ + _extraction( + [ + ExtractedKnowledge( + statement="User's dentist appointment was moved to March 15, 2024", knowledge_type="new", confidence=0.9 + ) + ] + ) + ], ) memory = _make_memory(tmp_path, llm, embedder) async with memory: - await memory.add_exchange("user1", "I was given a promotion", "Congrats!", timestamp=_ts()) + await memory.add_exchange("user1", "When is my appointment?", "It was moved to March 15", timestamp=_ts()) count = await memory.process("user1") assert count == 1 + result = await memory.retrieve("dentist appointment", user_id="user1") + assert "March 15, 2024" in result.retrieved_knowledge[0].statement + -async def test_accepts_lowercase_user(tmp_path): - """LLM may output lowercase 'user' — should still pass 
third-person check.""" +async def test_confidence_boundary(tmp_path): + """Confidence exactly at 0.7 passes, below does not.""" llm = MockLLM() embedder = MockEmbedder() llm.set_responses("generate", [_episode_json()]) llm.set_responses( "generate_structured", - [_extraction([ExtractedKnowledge(statement="user prefers Python", knowledge_type="new", confidence=0.9)])], + [ + _extraction( + [ + ExtractedKnowledge(statement="User works at Vstorm", knowledge_type="new", confidence=0.7), + ExtractedKnowledge(statement="User likes tea", knowledge_type="new", confidence=0.69), + ] + ) + ], ) memory = _make_memory(tmp_path, llm, embedder) async with memory: - await memory.add_exchange("user1", "I prefer Python", "Nice!", timestamp=_ts()) + await memory.add_exchange("user1", "I work at Vstorm and like tea", "Nice!", timestamp=_ts()) count = await memory.process("user1") assert count == 1 + result = await memory.retrieve("Vstorm", user_id="user1") + assert result.retrieved_knowledge[0].statement == "User works at Vstorm" -async def test_confidence_filter_unchanged(tmp_path): - """Low-confidence items still rejected (regression check).""" - llm = MockLLM() - embedder = MockEmbedder() - - llm.set_responses("generate", [_episode_json()]) - llm.set_responses( - "generate_structured", - [_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])], - ) - memory = _make_memory(tmp_path, llm, embedder) - async with memory: - await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts()) - count = await memory.process("user1") - assert count == 0 +# --------------------------------------------------------------------------- +# Temporal backfill +# --------------------------------------------------------------------------- async def test_temporal_backfill_in_pipeline(tmp_path): @@ -247,11 +201,11 @@ def test_warm_extraction_contains_atomization_rules(self): def test_cold_start_contains_strong_timestamp_wording(self): 
prompt = cold_start_extraction_prompt("Test", [{"role": "user", "content": "hi"}], "2024-06-15T12:00:00Z") - assert "will be REJECTED" in prompt + assert "are INVALID" in prompt def test_warm_extraction_contains_strong_timestamp_wording(self): prompt = extraction_prompt_with_prediction("prediction", ">>> USER: hi", "2024-06-15T12:00:00Z") - assert "will be REJECTED" in prompt + assert "are INVALID" in prompt def test_atomization_rules_constant_exists(self): assert "Self-Contained Statement Rules" in ATOMIZATION_RULES diff --git a/tests/test_memory_e2e.py b/tests/test_memory_e2e.py index a8ad073..ed699cb 100644 --- a/tests/test_memory_e2e.py +++ b/tests/test_memory_e2e.py @@ -229,8 +229,8 @@ async def test_flush_forces_processing(tmp_path): assert count == 1 -async def test_atomization_filters_bad_statements(tmp_path): - """LLM returns mix of good and bad statements — only atomized ones survive.""" +async def test_confidence_filters_low_quality_statements(tmp_path): + """Only statements with confidence >= 0.7 survive the pipeline filter.""" llm = MockLLM() embedder = MockEmbedder() @@ -241,10 +241,8 @@ async def test_atomization_filters_bad_statements(tmp_path): ExtractionResponse( extracted=[ ExtractedKnowledge(statement="User prefers Vim", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="I like Vim", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="User started yesterday", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="He uses Neovim", knowledge_type="new", confidence=0.9), - ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9), + ExtractedKnowledge(statement="User likes Emacs", knowledge_type="new", confidence=0.5), + ExtractedKnowledge(statement="User uses Neovim", knowledge_type="new", confidence=0.3), ] ) ], @@ -254,7 +252,6 @@ async def test_atomization_filters_bad_statements(tmp_path): async with memory: await memory.add_exchange("user1", "I 
prefer Vim", "Nice!", timestamp=_ts()) count = await memory.process("user1") - # Only "User prefers Vim" should survive all filters assert count == 1 result = await memory.retrieve("Vim", user_id="user1")