Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/TOOLS.md
Original file line number Diff line number Diff line change
Expand Up @@ -432,9 +432,9 @@ Start high, go lower:

## Limitations

1. **Oversized notes:** Notes >32k tokens (~120k chars) currently skipped
- Planned: Automatic chunking for these notes
- Workaround: Manually split large notes
1. **Oversized notes:** Notes >32k tokens (~120k chars) are automatically chunked
- Split into 2000-char chunks with context preserved via voyage-context-3
- Dynamic batch sizing adapts to chunk density

2. **Empty notes:** Skipped during indexing
- Warning logged with file path
Expand Down
78 changes: 62 additions & 16 deletions src/embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ def _redact_sensitive(message: str) -> str:
return message


def _is_token_limit_error(e: Exception) -> bool:
"""Check if an exception is a deterministic token limit error (not retryable)."""
error_lower = str(e).lower()
return ("token" in error_lower and "context window" in error_lower) or (
"too many tokens" in error_lower
)


class VoyageEmbedder:
"""
Voyage Context-3 embedding client with caching and rate limiting.
Expand Down Expand Up @@ -165,37 +173,67 @@ def embed_with_chunks(

# If under limit, embed whole
if estimated_tokens < 30000:
embedding = self.embed(text, input_type=input_type)
return ([embedding], 1)
try:
embedding = self.embed(text, input_type=input_type)
return ([embedding], 1)
except EmbeddingError as e:
if _is_token_limit_error(e):
logger.warning(
f"Whole-note embed failed (est. {estimated_tokens:.0f} tokens), "
f"falling back to chunked embedding"
)
# Fall through to chunking below
else:
raise

# Split into chunks
chunks = self.chunk_text(text, chunk_size=chunk_size, overlap=0)
logger.info(f"Large note: splitting into {len(chunks)} chunks")

# Embed chunks in batches (Voyage limit: ~60 chunks = 30k tokens per contextualized call)
# Calculate safe batch size based on actual chunk sizes
total_chars = sum(len(c) for c in chunks)
avg_chars_per_chunk = total_chars / len(chunks)
# Conservative: assume 3 chars/token for dense content safety margin
estimated_tokens_per_chunk = avg_chars_per_chunk / 3
batch_size = max(1, int(28000 / estimated_tokens_per_chunk))
logger.info(f"Dynamic batch size: {batch_size} (avg {avg_chars_per_chunk:.0f} chars/chunk)")

# Embed chunks in batches
all_embeddings = []
batch_size = 60 # ~30k tokens per batch

try:
for i in range(0, len(chunks), batch_size):
i = 0
while i < len(chunks):
chunk_batch = chunks[i : i + batch_size]

# Rate limit
self._rate_limit_sync()

# Embed this batch of chunks with context (with retry)
result = self._call_api_with_retry(
self.client.contextualized_embed,
inputs=[chunk_batch], # One document's chunks
model=self.model,
input_type=input_type,
)
try:
# Embed this batch of chunks with context
result = self._call_api_with_retry(
self.client.contextualized_embed,
inputs=[chunk_batch], # One document's chunks
model=self.model,
input_type=input_type,
)

# Extract embeddings
batch_embeddings = result.results[0].embeddings
all_embeddings.extend(batch_embeddings)

# Extract embeddings
batch_embeddings = result.results[0].embeddings
all_embeddings.extend(batch_embeddings)
logger.debug(f"Embedded chunks {i + 1}-{i + len(chunk_batch)} of {len(chunks)}")
i += batch_size

logger.debug(f"Embedded chunks {i + 1}-{i + len(chunk_batch)} of {len(chunks)}")
except EmbeddingError as e:
if _is_token_limit_error(e) and batch_size > 1:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Batch retry checks batch_size > 1 instead of len(chunk_batch) > 1, causing wasteful identical API calls

When the remaining chunks at position i are fewer than batch_size, reducing batch_size does not change the actual chunk_batch sent to the API. The loop at src/embedder.py:229 checks batch_size > 1 to decide whether to retry with a smaller batch, but the actual batch sent is chunks[i : i + batch_size] (src/embedder.py:207), which is bounded by the remaining chunks. This means the code can make multiple identical failing API calls (each with a rate-limit sleep at src/embedder.py:210) before batch_size shrinks below the actual remaining chunk count.

Example scenario: 5 remaining chunks with batch_size=42
  • batch_size=42 → chunk_batch = 5 chunks → API fails (token limit)
  • batch_size=21 → chunk_batch = 5 chunks → same request, fails again
  • batch_size=10 → chunk_batch = 5 chunks → same request, fails again
  • batch_size=5 → chunk_batch = 5 chunks → same request, fails again
  • batch_size=2 → chunk_batch = 2 chunks → different request, may succeed

That's 4 wasted API calls with rate-limit delays.

Suggested change
if _is_token_limit_error(e) and batch_size > 1:
if _is_token_limit_error(e) and len(chunk_batch) > 1:
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

# Halve batch size and retry this batch
batch_size = max(1, batch_size // 2)
logger.warning(
f"Batch too large for token limit, reducing to {batch_size} chunks"
)
continue # Retry same position with smaller batch
raise

logger.success(f"Embedded {len(all_embeddings)} chunks with context preserved")
return (all_embeddings, len(chunks))
Expand Down Expand Up @@ -264,6 +302,14 @@ def _call_api_with_retry(self, api_func, *args, **kwargs):
last_error = e
error_msg = _redact_sensitive(str(e))

# Token limit errors are deterministic, don't retry
if _is_token_limit_error(e):
logger.error(f"Token limit error (not retryable): {error_msg}")
raise EmbeddingError(
f"Token limit exceeded: {error_msg}",
cause=e,
) from e

# Check if it's a rate limit error (429)
if "429" in str(e) or "rate" in str(e).lower():
# Exponential backoff: 2^attempt seconds (1, 2, 4, ...)
Expand Down
2 changes: 1 addition & 1 deletion src/vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
PostgreSQL+pgvector Vector Store for Obsidian Graph MCP Server.

Adapted from oachatbot's PostgreSQL store, simplified for Obsidian notes:
- Stores whole notes (not chunked documents)
- Stores whole notes and chunked documents (automatic chunking for large notes)
- Uses 'path' as identifier (not document_id)
- No site_id or publish_date (Obsidian-specific)
- Adds connection_count materialization for graph queries
Expand Down
Loading