diff --git a/docs/TOOLS.md b/docs/TOOLS.md index 3613f63..049cad6 100644 --- a/docs/TOOLS.md +++ b/docs/TOOLS.md @@ -432,9 +432,9 @@ Start high, go lower: ## Limitations -1. **Oversized notes:** Notes >32k tokens (~120k chars) currently skipped - - Planned: Automatic chunking for these notes - - Workaround: Manually split large notes +1. **Oversized notes:** Notes >32k tokens (~120k chars) are automatically chunked + - Split into 2000-char chunks with context preserved via voyage-context-3 + - Dynamic batch sizing adapts to chunk density 2. **Empty notes:** Skipped during indexing - Warning logged with file path diff --git a/src/embedder.py b/src/embedder.py index 543f42a..873493f 100644 --- a/src/embedder.py +++ b/src/embedder.py @@ -42,6 +42,14 @@ def _redact_sensitive(message: str) -> str: return message +def _is_token_limit_error(e: Exception) -> bool: + """Check if an exception is a deterministic token limit error (not retryable).""" + error_lower = str(e).lower() + return ("token" in error_lower and "context window" in error_lower) or ( + "too many tokens" in error_lower + ) + + class VoyageEmbedder: """ Voyage Context-3 embedding client with caching and rate limiting. @@ -165,37 +173,67 @@ def embed_with_chunks( # If under limit, embed whole if estimated_tokens < 30000: - embedding = self.embed(text, input_type=input_type) - return ([embedding], 1) + try: + embedding = self.embed(text, input_type=input_type) + return ([embedding], 1) + except EmbeddingError as e: + if _is_token_limit_error(e): + logger.warning( + f"Whole-note embed failed (est. {estimated_tokens:.0f} tokens), " + f"falling back to chunked embedding" + ) + # Fall through to chunking below + else: + raise # Split into chunks chunks = self.chunk_text(text, chunk_size=chunk_size, overlap=0) logger.info(f"Large note: splitting into {len(chunks)} chunks") - # Embed chunks in batches (Voyage limit: ~60 chunks = 30k tokens per contextualized call) + # Calculate safe batch size based on actual chunk sizes + total_chars = sum(len(c) for c in chunks) + avg_chars_per_chunk = total_chars / len(chunks) + # Conservative: assume 3 chars/token for dense content safety margin + estimated_tokens_per_chunk = avg_chars_per_chunk / 3 + batch_size = max(1, int(28000 / estimated_tokens_per_chunk)) + logger.info(f"Dynamic batch size: {batch_size} (avg {avg_chars_per_chunk:.0f} chars/chunk)") + + # Embed chunks in batches all_embeddings = [] - batch_size = 60 # ~30k tokens per batch try: - for i in range(0, len(chunks), batch_size): + i = 0 + while i < len(chunks): chunk_batch = chunks[i : i + batch_size] # Rate limit self._rate_limit_sync() - # Embed this batch of chunks with context (with retry) - result = self._call_api_with_retry( - self.client.contextualized_embed, - inputs=[chunk_batch], # One document's chunks - model=self.model, - input_type=input_type, - ) + try: + # Embed this batch of chunks with context + result = self._call_api_with_retry( + self.client.contextualized_embed, + inputs=[chunk_batch], # One document's chunks + model=self.model, + input_type=input_type, + ) + + # Extract embeddings + batch_embeddings = result.results[0].embeddings + all_embeddings.extend(batch_embeddings) - # Extract embeddings - batch_embeddings = result.results[0].embeddings - all_embeddings.extend(batch_embeddings) + logger.debug(f"Embedded chunks {i + 1}-{i + len(chunk_batch)} of {len(chunks)}") + i += batch_size - logger.debug(f"Embedded chunks {i + 1}-{i + len(chunk_batch)} of {len(chunks)}") + except EmbeddingError as e: + if _is_token_limit_error(e) and batch_size > 1: + # Halve batch size and retry this batch + batch_size = max(1, batch_size // 2) + logger.warning( + f"Batch too large for token limit, reducing to {batch_size} chunks" + ) + continue # Retry same position with smaller batch + raise logger.success(f"Embedded {len(all_embeddings)} chunks with context preserved") return (all_embeddings, len(chunks)) @@ -264,6 +302,14 @@ def _call_api_with_retry(self, api_func, *args, **kwargs): last_error = e error_msg = _redact_sensitive(str(e)) + # Token limit errors are deterministic, don't retry + if _is_token_limit_error(e): + logger.error(f"Token limit error (not retryable): {error_msg}") + raise EmbeddingError( + f"Token limit exceeded: {error_msg}", + cause=e, + ) from e + # Check if it's a rate limit error (429) if "429" in str(e) or "rate" in str(e).lower(): # Exponential backoff: 2^attempt seconds (1, 2, 4, ...) diff --git a/src/vector_store.py b/src/vector_store.py index 0154c0f..2dcc976 100644 --- a/src/vector_store.py +++ b/src/vector_store.py @@ -2,7 +2,7 @@ PostgreSQL+pgvector Vector Store for Obsidian Graph MCP Server. Adapted from oachatbot's PostgreSQL store, simplified for Obsidian notes: -- Stores whole notes (not chunked documents) +- Stores whole notes and chunked documents (automatic chunking for large notes) - Uses 'path' as identifier (not document_id) - No site_id or publish_date (Obsidian-specific) - Adds connection_count materialization for graph queries