Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ async with memory:

**User isolation is mandatory**: All retrieval and storage operations require and filter by `user_id`. No cross-user queries possible by design.

**Extraction validation (`_pipeline.py`)**: `_validate_extraction` enforces a single check: confidence >= 0.7. All regex filters (third-person, first-person pronouns, relative time, assistant-sourced patterns) were removed because they only work for English. Quality enforcement is handled entirely at the prompt level via ATOMIZATION_RULES and EXCLUSIONS in `prompts.py`.

**`temporal.py` regex gotchas**: Avoid broad word matches in `_RELATIVE_PATTERNS` (e.g. "later"/"earlier" match adjective usage). `_UNTIL_PATTERN` must not include bare "to" (matches non-temporal uses). `_SINCE_PATTERN` uses `began` not `began?` (the `?` makes "a" optional).

**PR reviews**: The `claude` bot posts automated code reviews on PRs. Check with `gh pr view <N> --comments` and address feedback before merging.

## Code Style

- Line length: 135 characters
Expand Down
39 changes: 3 additions & 36 deletions src/memv/memory/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING

from memv.models import (
Expand All @@ -12,22 +11,13 @@
Message,
SemanticKnowledge,
)
from memv.processing.temporal import backfill_temporal_fields, contains_relative_time
from memv.processing.temporal import backfill_temporal_fields

if TYPE_CHECKING:
from memv.memory._lifecycle import LifecycleManager

logger = logging.getLogger(__name__)

# Compiled regexes for validation checks
_THIRD_PERSON_RE = re.compile(r"^[Uu]ser\b")
_FIRST_PERSON_RE = re.compile(r"(?<!/)\b(I(?!/)|[Mm][Yy]|[Mm][Ee]|[Ww][Ee]|[Oo][Uu][Rr])\b")
# Requires infinitive ("was advised to") to avoid rejecting legitimate facts like "User was given a promotion"
_ASSISTANT_SOURCE_RE = re.compile(
r"\b(?:was|were)\s+(?:advised|suggested|recommended|told|instructed|encouraged)\s+to\b",
re.IGNORECASE,
)


class Pipeline:
"""Handles message processing, episode generation, and knowledge extraction."""
Expand Down Expand Up @@ -168,7 +158,7 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int:

# 6. Backfill temporal fields from temporal_info
for item in extracted:
if item.temporal_info and (item.valid_at is None or item.invalid_at is None):
if item.temporal_info:
item.valid_at, item.invalid_at = backfill_temporal_fields(
item.temporal_info,
item.valid_at,
Expand Down Expand Up @@ -218,34 +208,11 @@ async def _process_episode(self, messages: list[Message], user_id: str) -> int:
return stored_count

def _validate_extraction(self, item: ExtractedKnowledge) -> bool:
"""Filter extractions that are low-confidence or not self-contained.

Scope: only User-subject statements pass. Third-party facts ("Bob prefers
Postgres") are intentionally dropped — the knowledge base stores facts
about the user, not about people mentioned in conversation.
"""
"""Filter low-confidence extractions."""
if item.confidence < 0.7:
logger.debug("Rejected (low confidence %.2f): %s", item.confidence, item.statement[:60])
return False

stmt = item.statement

if not _THIRD_PERSON_RE.match(stmt):
logger.debug("Rejected (not third-person): %s", stmt[:60])
return False

if _FIRST_PERSON_RE.search(stmt):
logger.debug("Rejected (first-person pronoun): %s", stmt[:60])
return False

if contains_relative_time(stmt):
logger.debug("Rejected (unresolved relative time): %s", stmt[:60])
return False

if _ASSISTANT_SOURCE_RE.search(stmt):
logger.debug("Rejected (assistant-sourced): %s", stmt[:60])
return False

return True

async def _is_duplicate_knowledge(self, embedding: list[float], user_id: str) -> tuple[bool, float]:
Expand Down
39 changes: 24 additions & 15 deletions src/memv/processing/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,16 @@
- Obvious or common knowledge
- Speculative or uncertain claims
- Conversation events ("User asked about X", "User requested Y") - extract the FACT, not the action
- Assistant-sourced knowledge ("was advised to", "was suggested to", "was recommended to")

**CRITICAL SOURCE RULE - READ CAREFULLY:**
- ONLY extract facts the USER explicitly stated in their own messages
- NEVER extract assistant suggestions, recommendations, or code examples as user facts
**CRITICAL SOURCE RULES - READ CAREFULLY:**
- Extract facts the user explicitly stated in their own messages
- DO extract factual information communicated by the assistant (dates, appointments, confirmations, scheduled events)
- NEVER extract assistant suggestions, recommendations, or hypotheticals as user facts
("was advised to", "was suggested to", "was recommended to")
- Treat assistant suggestions about user intent as speculative — never encode them as facts
- NEVER attribute intent/action/statement to user if it originated in a hypothetical/suggestion/conditional from the assistant
- Ignore assistant-led topics unless user acts on them
- Attribute preferences only to explicit user claims — never to questions or reactions
- If assistant says "use Python" and user doesn't respond with "yes" or confirm - DO NOT extract "User uses Python"
- If assistant provides code in language X but user says "I use Y" - extract Y, not X
- The user ASKING about something is NOT the same as the user USING it
Expand All @@ -97,7 +102,7 @@
**REQUIRE in output statements:**
- Absolute dates when temporal info exists: "on [resolved date]", not "yesterday"
- Specific names: "User's React project at Vstorm", not "the project"
- Third person with "User" as subject: "User prefers Python", not "I prefer Python"
- Third person: "User prefers Python", not "I prefer Python"

**Coreference resolution — resolve BEFORE writing the statement:**
- "my kids" → "User's children"
Expand Down Expand Up @@ -231,16 +236,18 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic
{reference_timestamp}
</reference_timestamp>
You MUST resolve ALL relative dates using this timestamp.
Statements with unresolved relative time ("yesterday", "last week") will be REJECTED.
Statements with unresolved relative time ("yesterday", "last week") are INVALID — resolve them or omit.
"""

return f"""Extract HIGH-VALUE, PERSISTENT knowledge from this conversation.
{timestamp_section}

**CRITICAL RULE: Extract ONLY from lines starting with ">>> USER:"**
- These are the user's actual words - the ONLY source of truth
- IGNORE all ASSISTANT lines completely
- Do NOT infer, expand, or modify what the user said
**SOURCE RULES:**
- Lines starting with ">>> USER:" are the user's actual words — the primary source of truth
- DO extract factual information communicated by the assistant (dates, appointments, confirmations)
- Do NOT extract assistant suggestions, recommendations, or hypotheticals as user facts
- Treat assistant suggestions about user intent as speculative — never encode them as facts
- Attribute preferences only to explicit user claims — never to assistant questions or reactions
- Preserve the user's exact phrasing and technical terms

<episode_context>
Expand Down Expand Up @@ -294,7 +301,7 @@ def cold_start_extraction_prompt(episode_title: str, original_messages: list[dic
## Output Format

For each extracted item, specify:
- statement: A clean, declarative fact about the user (third-person: "User...", not "I...")
- statement: A concrete, self-contained fact from the conversation (see SOURCE RULES above)
- knowledge_type: "new"
- temporal_info: Human-readable description if mentioned ("since January 2024", "until next month")
- valid_at: ISO 8601 datetime when fact became true, or null if unknown/always true (e.g., "2024-01-01T00:00:00Z")
Expand All @@ -313,13 +320,15 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere
{reference_timestamp}
</reference_timestamp>
You MUST resolve ALL relative dates using this timestamp.
Statements with unresolved relative time ("yesterday", "last week") will be REJECTED.
Statements with unresolved relative time ("yesterday", "last week") are INVALID — resolve them or omit.
"""

return f"""Extract valuable knowledge by comparing actual conversation with predicted content.
{timestamp_section}
**CRITICAL: Extract ONLY from lines starting with ">>> USER:" - these are the user's actual words.**
**IGNORE all ASSISTANT lines - those are suggestions, not user facts.**
**SOURCE RULES:**
- Lines starting with ">>> USER:" are the user's actual words — the primary source of truth.
- DO extract factual info communicated by the assistant (dates, appointments, confirmations).
- Do NOT extract assistant suggestions, recommendations, or hypotheticals as user facts.

<prediction>
{prediction}
Expand Down Expand Up @@ -371,7 +380,7 @@ def extraction_prompt_with_prediction(prediction: str, conversation: str, refere
## Output Format

For each extracted item, specify:
- statement: A fact the USER explicitly stated (not assistant suggestions)
- statement: A concrete, self-contained fact from the conversation (see SOURCE RULES above)
- knowledge_type: "new" if entirely new, "update" if refines existing, "contradiction" if conflicts
- temporal_info: Human-readable description if mentioned ("since January 2024", "until next month")
- valid_at: ISO 8601 datetime when fact became true, or null if unknown/always true (e.g., "2024-01-01T00:00:00Z")
Expand Down
126 changes: 40 additions & 86 deletions tests/test_atomization.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,66 +40,30 @@ def _make_memory(tmp_path, llm, embedder):


# ---------------------------------------------------------------------------
# Validation filter: unit-level via e2e pipeline
# Confidence threshold
# ---------------------------------------------------------------------------


async def test_rejects_first_person(tmp_path):
"""Statement with 'I' / 'my' gets filtered out."""
async def test_rejects_low_confidence(tmp_path):
"""Low-confidence items rejected."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="User mentioned that my team uses React", knowledge_type="new", confidence=0.9)])],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts())
count = await memory.process("user1")
assert count == 0


async def test_rejects_non_third_person(tmp_path):
"""Statement not starting with 'User' gets filtered."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="He likes Python", knowledge_type="new", confidence=0.9)])],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "I like Python", "Cool!", timestamp=_ts())
count = await memory.process("user1")
assert count == 0


async def test_rejects_relative_time(tmp_path):
"""Statement with unresolved 'yesterday' gets filtered."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="User started using FastAPI yesterday", knowledge_type="new", confidence=0.9)])],
[_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "I started using FastAPI yesterday", "Nice!", timestamp=_ts())
await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts())
count = await memory.process("user1")
assert count == 0


async def test_accepts_clean_statement(tmp_path):
"""Properly atomized statement passes all checks."""
async def test_accepts_high_confidence(tmp_path):
"""Statement with confidence >= 0.7 passes."""
llm = MockLLM()
embedder = MockEmbedder()

Expand All @@ -120,76 +84,66 @@ async def test_accepts_clean_statement(tmp_path):
assert count == 1


async def test_rejects_assistant_sourced(tmp_path):
"""Statement with 'was advised to' pattern gets filtered."""
async def test_accepts_assistant_communicated_fact(tmp_path):
"""Facts communicated by the assistant (dates, appointments) are accepted."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="User was advised to try Emacs", knowledge_type="new", confidence=0.9)])],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "The assistant told me to try Emacs", "Sure!", timestamp=_ts())
count = await memory.process("user1")
assert count == 0


async def test_accepts_passive_without_infinitive(tmp_path):
"""'was given a promotion' is NOT assistant-sourced — should pass."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="User was given a promotion", knowledge_type="new", confidence=0.9)])],
[
_extraction(
[
ExtractedKnowledge(
statement="User's dentist appointment was moved to March 15, 2024", knowledge_type="new", confidence=0.9
)
]
)
],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "I was given a promotion", "Congrats!", timestamp=_ts())
await memory.add_exchange("user1", "When is my appointment?", "It was moved to March 15", timestamp=_ts())
count = await memory.process("user1")
assert count == 1

result = await memory.retrieve("dentist appointment", user_id="user1")
assert "March 15, 2024" in result.retrieved_knowledge[0].statement


async def test_accepts_lowercase_user(tmp_path):
"""LLM may output lowercase 'user' — should still pass third-person check."""
async def test_confidence_boundary(tmp_path):
"""Confidence exactly at 0.7 passes, below does not."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="user prefers Python", knowledge_type="new", confidence=0.9)])],
[
_extraction(
[
ExtractedKnowledge(statement="User works at Vstorm", knowledge_type="new", confidence=0.7),
ExtractedKnowledge(statement="User likes tea", knowledge_type="new", confidence=0.69),
]
)
],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "I prefer Python", "Nice!", timestamp=_ts())
await memory.add_exchange("user1", "I work at Vstorm and like tea", "Nice!", timestamp=_ts())
count = await memory.process("user1")
assert count == 1

result = await memory.retrieve("Vstorm", user_id="user1")
assert result.retrieved_knowledge[0].statement == "User works at Vstorm"

async def test_confidence_filter_unchanged(tmp_path):
"""Low-confidence items still rejected (regression check)."""
llm = MockLLM()
embedder = MockEmbedder()

llm.set_responses("generate", [_episode_json()])
llm.set_responses(
"generate_structured",
[_extraction([ExtractedKnowledge(statement="User might like Rust", knowledge_type="new", confidence=0.3)])],
)

memory = _make_memory(tmp_path, llm, embedder)
async with memory:
await memory.add_exchange("user1", "Maybe Rust?", "Interesting!", timestamp=_ts())
count = await memory.process("user1")
assert count == 0
# ---------------------------------------------------------------------------
# Temporal backfill
# ---------------------------------------------------------------------------


async def test_temporal_backfill_in_pipeline(tmp_path):
Expand Down Expand Up @@ -247,11 +201,11 @@ def test_warm_extraction_contains_atomization_rules(self):

def test_cold_start_contains_strong_timestamp_wording(self):
prompt = cold_start_extraction_prompt("Test", [{"role": "user", "content": "hi"}], "2024-06-15T12:00:00Z")
assert "will be REJECTED" in prompt
assert "are INVALID" in prompt

def test_warm_extraction_contains_strong_timestamp_wording(self):
prompt = extraction_prompt_with_prediction("prediction", ">>> USER: hi", "2024-06-15T12:00:00Z")
assert "will be REJECTED" in prompt
assert "are INVALID" in prompt

def test_atomization_rules_constant_exists(self):
assert "Self-Contained Statement Rules" in ATOMIZATION_RULES
Loading