Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion demo/utils/simple_memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,6 @@ async def search(
query: Query text
top_k: Number of results to return (default: 3)
mode: Retrieval mode (default: "rrf")
- "rrf": RRF fusion (recommended)
- "keyword": Keyword retrieval (BM25)
- "vector": Vector retrieval
- "hybrid": Keyword + Vector + Rerank
Expand Down
1 change: 0 additions & 1 deletion docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ services:
- "27017:27017"
volumes:
- mongodb_data:/data/db
- ./docker/mongodb/init:/docker-entrypoint-initdb.d
networks:
- memsys-network
healthcheck:
Expand Down
3 changes: 3 additions & 0 deletions docs/STARTER_KIT.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ Welcome to the EverMemOS Competition! This starter kit will help you get up and
git clone https://github.com/EverMind-AI/EverMemOS.git
cd EverMemOS

# Copy environment template and configure (replace with your API keys)
cp env.template .env

# Start all services with Docker
docker compose up -d

Expand Down
2 changes: 1 addition & 1 deletion docs/dev_docs/getting_started.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ uv run python src/bootstrap.py evaluation/locomo_evaluation/stage1_memcells_extr

# Run other evaluation stages
uv run python src/bootstrap.py evaluation/locomo_evaluation/stage2_index_building.py
uv run python src/bootstrap.py evaluation/locomo_evaluation/stage3_memory_retrivel.py
uv run python src/bootstrap.py evaluation/locomo_evaluation/stage3_memory_retrieval.py
```

#### 2. Run Demo Scripts
Expand Down
4 changes: 2 additions & 2 deletions evaluation/src/adapters/evermemos/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ locomo_evaluation/
│ └── answer_prompts.py # Answer generation
├── stage1_memcells_extraction.py # Stage 1: Extract MemCells
├── stage2_index_building.py # Stage 2: Build indexes
├── stage3_memory_retrivel.py # Stage 3: Retrieve memories
├── stage3_memory_retrieval.py # Stage 3: Retrieve memories
├── stage4_response.py # Stage 4: Generate responses
├── stage5_eval.py # Stage 5: Evaluate results
└── tools/ # Utility tools
Expand Down Expand Up @@ -73,7 +73,7 @@ python evaluation/locomo_evaluation/stage1_memcells_extraction.py
python evaluation/locomo_evaluation/stage2_index_building.py

# Stage 3: Retrieve memories
python evaluation/locomo_evaluation/stage3_memory_retrivel.py
python evaluation/locomo_evaluation/stage3_memory_retrieval.py

# Stage 4: Generate responses
python evaluation/locomo_evaluation/stage4_response.py
Expand Down
4 changes: 4 additions & 0 deletions evaluation/src/adapters/evermemos/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,7 @@ class ExperimentConfig:

max_retries: int = 5
max_concurrent_requests: int = 10

# Conversation IDs for index building (needed for --from-conv/--to-conv slicing)
# This maps sequential indices (0, 1, 2...) to actual conversation IDs
conversation_ids: list = []
30 changes: 26 additions & 4 deletions evaluation/src/adapters/evermemos/stage2_index_building.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,19 @@ def build_bm25_index(

print(f"Reading data from: {data_dir}")

# Get conversation IDs for proper file naming
# If conversation_ids is provided, use them; otherwise fall back to sequential indices
conversation_ids = getattr(config, 'conversation_ids', [])

for i in range(config.num_conv):
file_path = data_dir / f"memcell_list_conv_{i}.json"
# Use conversation_id if available, otherwise use sequential index
if conversation_ids and i < len(conversation_ids):
# Extract numeric ID from conversation_id (e.g., "locomo_234" -> "234")
conv_id = conversation_ids[i].split("_")[-1] if "_" in conversation_ids[i] else conversation_ids[i]
else:
conv_id = str(i)

file_path = data_dir / f"memcell_list_conv_{conv_id}.json"
if not file_path.exists():
print(f"Warning: File not found, skipping: {file_path}")
continue
Expand Down Expand Up @@ -161,7 +172,7 @@ def build_bm25_index(
# --- Saving the Index ---
index_data = {"bm25": bm25, "docs": original_docs}

output_path = bm25_save_dir / f"bm25_index_conv_{i}.pkl"
output_path = bm25_save_dir / f"bm25_index_conv_{conv_id}.pkl"
print(f"Saving index to: {output_path}")
with open(output_path, "wb") as f:
pickle.dump(index_data, f)
Expand Down Expand Up @@ -190,8 +201,19 @@ async def build_emb_index(config: ExperimentConfig, data_dir: Path, emb_save_dir

import time # For performance statistics

# Get conversation IDs for proper file naming
# If conversation_ids is provided, use them; otherwise fall back to sequential indices
conversation_ids = getattr(config, 'conversation_ids', [])

for i in range(config.num_conv):
file_path = data_dir / f"memcell_list_conv_{i}.json"
# Use conversation_id if available, otherwise use sequential index
if conversation_ids and i < len(conversation_ids):
# Extract numeric ID from conversation_id (e.g., "locomo_234" -> "234")
conv_id = conversation_ids[i].split("_")[-1] if "_" in conversation_ids[i] else conversation_ids[i]
else:
conv_id = str(i)

file_path = data_dir / f"memcell_list_conv_{conv_id}.json"
if not file_path.exists():
print(f"Warning: File not found, skipping: {file_path}")
continue
Expand Down Expand Up @@ -365,7 +387,7 @@ async def process_batch_with_retry(
# },
# ...
# ]
output_path = emb_save_dir / f"embedding_index_conv_{i}.pkl"
output_path = emb_save_dir / f"embedding_index_conv_{conv_id}.pkl"
emb_save_dir.mkdir(parents=True, exist_ok=True)
print(f"Saving embeddings to: {output_path}")
with open(output_path, "wb") as f:
Expand Down
76 changes: 64 additions & 12 deletions evaluation/src/adapters/evermemos_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from evaluation.src.adapters.evermemos import (
stage1_memcells_extraction,
stage2_index_building,
stage3_memory_retrivel,
stage3_memory_retrieval,
stage4_response,
)

Expand Down Expand Up @@ -102,7 +102,8 @@ def _extract_conv_index(conversation_id: str) -> str:
return conversation_id

def _check_missing_indexes(
self, index_dir: Path, num_conv: int, index_type: str = "bm25"
self, index_dir: Path, num_conv: int, index_type: str = "bm25",
conversation_ids: List[str] = None
) -> List[int]:
"""
Check for missing index files.
Expand All @@ -111,17 +112,24 @@ def _check_missing_indexes(
index_dir: Index directory
num_conv: Total number of conversations
index_type: Index type ("bm25" or "embedding")
conversation_ids: List of conversation IDs for proper file naming

Returns:
List of conversation indices with missing indexes
"""
missing_indexes = []

for i in range(num_conv):
# Use extracted numeric ID for file naming if conversation_ids provided
if conversation_ids and i < len(conversation_ids):
conv_id = self._extract_conv_index(conversation_ids[i])
else:
conv_id = str(i)

if index_type == "bm25":
index_file = index_dir / f"bm25_index_conv_{i}.pkl"
index_file = index_dir / f"bm25_index_conv_{conv_id}.pkl"
else: # embedding
index_file = index_dir / f"embedding_index_conv_{i}.pkl"
index_file = index_dir / f"embedding_index_conv_{conv_id}.pkl"

if not index_file.exists():
missing_indexes.append(i)
Expand Down Expand Up @@ -363,10 +371,14 @@ async def run_with_completion(conv_id, task):
# Call stage2 implementation to build indexes
exp_config = self._convert_config_to_experiment_config()
exp_config.num_conv = len(conversations) # Set conversation count
# Pass conversation IDs for proper index file naming (supports --from-conv/--to-conv slicing)
conversation_ids_list = [conv.conversation_id for conv in conversations]
exp_config.conversation_ids = conversation_ids_list

# Smart skip logic: check existing index files
bm25_need_build = self._check_missing_indexes(
index_dir=bm25_index_dir, num_conv=len(conversations), index_type="bm25"
index_dir=bm25_index_dir, num_conv=len(conversations), index_type="bm25",
conversation_ids=conversation_ids_list
)

emb_need_build = []
Expand All @@ -376,6 +388,7 @@ async def run_with_completion(conv_id, task):
index_dir=emb_index_dir,
num_conv=len(conversations),
index_type="embedding",
conversation_ids=conversation_ids_list
)

# Statistics
Expand Down Expand Up @@ -424,12 +437,29 @@ async def run_with_completion(conv_id, task):

# ========== Plan A: Return index metadata (lazy loading) ==========
# Don't load indexes into memory, only return paths and metadata

# Build mapping from conversation_id to extracted numeric ID
# This is needed because when using --from-conv/--to-conv slicing:
# - Index files are saved with extracted numeric IDs (e.g., "234", "235"...)
# - But conversation_ids still contain original IDs (e.g., "locomo_exp_user_234")
# - We need to map conversation_id -> extracted numeric ID (not sequential index!)
conv_id_to_index = {
conv.conversation_id: self._extract_conv_index(conv.conversation_id)
for idx, conv in enumerate(conversations)
}

# Save mapping to a JSON file for persistence across stages
mapping_file = output_dir / "conversation_index_mapping.json"
with open(mapping_file, "w") as f:
json.dump(conv_id_to_index, f, indent=2)

index_metadata = {
"type": "lazy_load", # Mark as lazy loading
"memcells_dir": str(memcells_dir),
"bm25_index_dir": str(bm25_index_dir),
"emb_index_dir": str(emb_index_dir),
"conversation_ids": [conv.conversation_id for conv in conversations],
"conv_id_to_index": conv_id_to_index, # Add mapping for search stage
"use_hybrid_search": use_hybrid,
"total_conversations": len(conversations),
}
Expand All @@ -454,16 +484,29 @@ async def search(
Search stage: Retrieve relevant MemCells.

Lazy loading: Load indexes from files on demand (memory-friendly).

Fix for --from-conv/--to-conv slicing:
- When building indexes, files are named with the extracted numeric ID (e.g., "234", "235"...)
- The conversation_id still contains the original ID (e.g., "locomo_234")
- Use the mapping (conv_id_to_index) to find the correct numeric ID
"""
# Lazy loading - read indexes from files
bm25_index_dir = Path(index["bm25_index_dir"])
emb_index_dir = Path(index["emb_index_dir"])

# Extract numeric index from conversation_id to find index files
# Example: conversation_id = "locomo_0" -> conv_index = "0"
conv_index = self._extract_conv_index(conversation_id)
# Get the extracted numeric ID from the mapping
# This mapping was created in add() stage and maps conversation_id -> extracted numeric ID
conv_id_to_index = index.get("conv_id_to_index", {})

if conversation_id in conv_id_to_index:
# Use the mapping to get the extracted numeric ID
conv_index = conv_id_to_index[conversation_id]
else:
# Fallback: extract index from conversation_id (legacy behavior)
# This handles cases where the mapping is not available (e.g., old index files)
conv_index = self._extract_conv_index(conversation_id)

# Load BM25 index on demand (using numeric index)
# Load BM25 index on demand (using the extracted numeric ID)
bm25_file = bm25_index_dir / f"bm25_index_conv_{conv_index}.pkl"
if not bm25_file.exists():
return SearchResult(
Expand Down Expand Up @@ -497,7 +540,7 @@ async def search(

if retrieval_mode == "agentic":
# Agentic retrieval
top_results, metadata = await stage3_memory_retrivel.agentic_retrieval(
top_results, metadata = await stage3_memory_retrieval.agentic_retrieval(
query=query,
config=exp_config,
llm_provider=self.llm_provider,
Expand All @@ -508,7 +551,7 @@ async def search(
)
elif retrieval_mode == "lightweight":
# Lightweight retrieval
top_results, metadata = await stage3_memory_retrivel.lightweight_retrieval(
top_results, metadata = await stage3_memory_retrieval.lightweight_retrieval(
query=query,
emb_index=emb_index,
bm25=bm25,
Expand All @@ -517,7 +560,7 @@ async def search(
)
else:
# Default to hybrid retrieval
top_results = await stage3_memory_retrivel.hybrid_search_with_rrf(
top_results = await stage3_memory_retrieval.hybrid_search_with_rrf(
query=query,
emb_index=emb_index,
bm25=bm25,
Expand Down Expand Up @@ -682,12 +725,21 @@ def build_lazy_index(
Returns:
Index metadata dict
"""
# Build mapping from conversation_id to extracted numeric ID
# This is needed for --from-conv/--to-conv slicing support
# Index files are named with extracted numeric IDs (e.g., "234", not sequential 0)
conv_id_to_index = {
conv.conversation_id: self._extract_conv_index(conv.conversation_id)
for idx, conv in enumerate(conversations)
}

return {
"type": "lazy_load",
"memcells_dir": str(output_dir / "memcells"),
"bm25_index_dir": str(output_dir / "bm25_index"),
"emb_index_dir": str(output_dir / "vectors"),
"conversation_ids": [conv.conversation_id for conv in conversations],
"conv_id_to_index": conv_id_to_index, # Add mapping for search stage
"use_hybrid_search": True,
"total_conversations": len(conversations),
}
Loading