From 402585200b68b03ec67191f6529b6b3b0f2ebd58 Mon Sep 17 00:00:00 2001 From: mildred522 <1037950115@qq.com> Date: Sun, 22 Mar 2026 12:22:48 +0800 Subject: [PATCH 1/3] feat(retrieval): chunk long text file embeddings --- docs/en/guides/01-configuration.md | 20 +++ docs/zh/guides/01-configuration.md | 19 +++ openviking/retrieve/hierarchical_retriever.py | 64 +++++++- openviking/utils/embedding_utils.py | 113 +++++++++++-- .../utils/config/open_viking_config.py | 23 ++- tests/misc/test_file_chunk_vectorization.py | 89 +++++++++++ .../test_openviking_config_file_chunking.py | 52 ++++++ ...t_hierarchical_retriever_chunk_collapse.py | 148 ++++++++++++++++++ 8 files changed, 511 insertions(+), 17 deletions(-) create mode 100644 tests/misc/test_file_chunk_vectorization.py create mode 100644 tests/misc/test_openviking_config_file_chunking.py create mode 100644 tests/retrieve/test_hierarchical_retriever_chunk_collapse.py diff --git a/docs/en/guides/01-configuration.md b/docs/en/guides/01-configuration.md index 48003e28..1b956a17 100644 --- a/docs/en/guides/01-configuration.md +++ b/docs/en/guides/01-configuration.md @@ -500,6 +500,24 @@ Reranking model for search result refinement. If rerank is not configured, search uses vector similarity only. +### retrieval indexing + +Long text files are chunked during file-level vectorization so a single oversized file does not +collapse into one coarse L2 embedding only. Chunk hits are collapsed back to the base file URI at +retrieval time. + +```json +{ + "file_chunk_chars": 4000, + "file_chunk_overlap": 400 +} +``` + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `file_chunk_chars` | int | Maximum characters per chunk when vectorizing long text files | `4000` | +| `file_chunk_overlap` | int | Overlapping characters between adjacent file chunks | `400` | + ### storage Storage configuration for context data, including file storage (AGFS) and vector database storage (VectorDB). 
@@ -837,6 +855,8 @@ For details on the lock mechanism, see [Path Locks and Crash Recovery](../concep "code": { "code_summary_mode": "ast" }, + "file_chunk_chars": 4000, + "file_chunk_overlap": 400, "server": { "host": "0.0.0.0", "port": 1933, diff --git a/docs/zh/guides/01-configuration.md b/docs/zh/guides/01-configuration.md index a5c3507d..8b0fa773 100644 --- a/docs/zh/guides/01-configuration.md +++ b/docs/zh/guides/01-configuration.md @@ -475,6 +475,23 @@ AST 提取支持:Python、JavaScript/TypeScript、Rust、Go、Java、C/C++。 如果未配置 Rerank,搜索仅使用向量相似度。 +### 检索索引 + +长文本文件在文件级向量化阶段会被切分为多个 chunk,避免一个超长文件只生成单个粗粒度的 +L2 向量。检索阶段会再把这些 chunk 命中折叠回基础文件 URI,对外仍返回文件级结果。 + +```json +{ + "file_chunk_chars": 4000, + "file_chunk_overlap": 400 +} +``` + +| 参数 | 类型 | 说明 | 默认值 | +|------|------|------|--------| +| `file_chunk_chars` | int | 长文本文件向量化时每个 chunk 的最大字符数 | `4000` | +| `file_chunk_overlap` | int | 相邻文件 chunk 之间的重叠字符数 | `400` | + ### storage 用于存储上下文数据 ,包括文件存储(AGFS)和向量库存储(VectorDB)。 @@ -814,6 +831,8 @@ HTTP 客户端(`SyncHTTPClient` / `AsyncHTTPClient`)和 CLI 工具连接远 "code": { "code_summary_mode": "ast" }, + "file_chunk_chars": 4000, + "file_chunk_overlap": 400, "server": { "host": "string", "port": 1933, diff --git a/openviking/retrieve/hierarchical_retriever.py b/openviking/retrieve/hierarchical_retriever.py index b2d2885a..3e52aaff 100644 --- a/openviking/retrieve/hierarchical_retriever.py +++ b/openviking/retrieve/hierarchical_retriever.py @@ -8,8 +8,9 @@ """ import heapq -import math import logging +import math +import re import time from datetime import datetime from typing import Any, Dict, List, Optional, Tuple @@ -51,6 +52,7 @@ class HierarchicalRetriever: GLOBAL_SEARCH_TOPK = 5 # Global retrieval count HOTNESS_ALPHA = 0.2 # Weight for hotness score in final ranking (0 = disabled) LEVEL_URI_SUFFIX = {0: ".abstract.md", 1: ".overview.md"} + CHUNK_URI_PATTERN = re.compile(r"#chunk_\d+$") def __init__( self, @@ -177,6 +179,7 @@ async def retrieve( # 从 global_results 中提取 level 2 的文件作为初始候选者 
    @classmethod
    def _base_uri_for_chunk(cls, uri: str) -> str:
        """Strip chunk suffix from vector-only file chunk URIs.

        Chunk vectors are stored under ``<file_uri>#chunk_NNNN`` (see
        CHUNK_URI_PATTERN); this returns the base file URI unchanged for
        non-chunk URIs and for empty/None-ish input.
        """
        if not uri:
            return uri
        # Pattern is anchored at end-of-string, so only a trailing
        # "#chunk_<digits>" suffix is removed.
        return cls.CHUNK_URI_PATTERN.sub("", uri)

    @classmethod
    def _collapse_chunk_candidates(cls, candidates: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Collapse chunk-level file hits back to a single file-level candidate.

        Candidates whose URI carries a ``#chunk_NNNN`` suffix are merged with
        (and into) the base-file candidate keyed by the stripped URI. The
        highest-scoring candidate per base URI wins (ties keep the earliest);
        missing metadata (abstract / context_type / category) is backfilled
        from the losing candidates, and a non-chunk (file-level) candidate's
        non-empty abstract always takes precedence over a chunk's.

        Input dicts are copied, not mutated. The result is sorted by score
        descending. Score is read from ``_final_score`` falling back to
        ``_score`` — presumably set upstream by scoring/reranking; not
        visible in this block.
        """
        collapsed: Dict[str, Dict[str, Any]] = {}

        for candidate in candidates:
            # Work on a shallow copy so callers' dicts stay untouched.
            candidate_copy = dict(candidate)
            original_uri = candidate_copy.get("uri", "")
            base_uri = cls._base_uri_for_chunk(original_uri)
            is_chunk = base_uri != original_uri
            # Rewrite the URI so the user-facing result is always file-level.
            candidate_copy["uri"] = base_uri

            previous = collapsed.get(base_uri)
            candidate_score = candidate_copy.get("_final_score", candidate_copy.get("_score", 0.0))

            if previous is None:
                collapsed[base_uri] = candidate_copy
            else:
                previous_score = previous.get("_final_score", previous.get("_score", 0.0))
                if candidate_score > previous_score:
                    # New winner: keep its score/payload, but preserve any
                    # metadata the displaced candidate had that the winner lacks.
                    preserved_abstract = previous.get("abstract", "")
                    preserved_context_type = previous.get("context_type")
                    preserved_category = previous.get("category")
                    collapsed[base_uri] = candidate_copy
                    # A winning chunk typically has an empty abstract; restore
                    # the file-level abstract gathered earlier in that case.
                    if preserved_abstract and is_chunk and not candidate_copy.get("abstract"):
                        collapsed[base_uri]["abstract"] = preserved_abstract
                    if preserved_context_type and not candidate_copy.get("context_type"):
                        collapsed[base_uri]["context_type"] = preserved_context_type
                    if preserved_category and not candidate_copy.get("category"):
                        collapsed[base_uri]["category"] = preserved_category
                else:
                    # Loser: only contribute metadata the kept entry is missing.
                    if not previous.get("abstract") and candidate_copy.get("abstract"):
                        previous["abstract"] = candidate_copy["abstract"]
                    if not previous.get("context_type") and candidate_copy.get("context_type"):
                        previous["context_type"] = candidate_copy["context_type"]
                    if not previous.get("category") and candidate_copy.get("category"):
                        previous["category"] = candidate_copy["category"]

            collapsed_candidate = collapsed[base_uri]
            # File-level abstracts are authoritative: overwrite whatever the
            # collapsed entry currently holds, regardless of who won on score.
            if not is_chunk and candidate_copy.get("abstract"):
                collapsed_candidate["abstract"] = candidate_copy["abstract"]

        return sorted(
            collapsed.values(),
            key=lambda item: item.get("_final_score", item.get("_score", 0.0)),
            reverse=True,
        )
EmbeddingMsgConverter from openviking.storage.viking_fs import get_viking_fs from openviking_cli.utils import VikingURI, get_logger +from openviking_cli.utils.config import get_openviking_config logger = get_logger(__name__) +def _chunk_text(text: str, chunk_chars: int, chunk_overlap: int) -> List[str]: + """Split text into overlapping chunks for long-file vectorization.""" + if not text: + return [] + if len(text) <= chunk_chars: + return [text] + + step = max(chunk_chars - chunk_overlap, 1) + chunks = [] + start = 0 + + while start < len(text): + end = min(start + chunk_chars, len(text)) + chunks.append(text[start:end]) + if end >= len(text): + break + start += step + + return chunks + + +async def _enqueue_context( + context: Context, + embedding_queue: Any, + semantic_msg_id: Optional[str] = None, +) -> bool: + """Convert a Context into an embedding message and enqueue it.""" + embedding_msg = EmbeddingMsgConverter.from_context(context) + if not embedding_msg: + return False + embedding_msg.semantic_msg_id = semantic_msg_id + await embedding_queue.enqueue(embedding_msg) + return True + + async def _decrement_embedding_tracker(semantic_msg_id: Optional[str], count: int) -> None: if not semantic_msg_id or count <= 0: return @@ -146,9 +182,10 @@ async def vectorize_directory_meta( return queue_manager = get_queue_manager() - embedding_queue = queue_manager.get_queue(queue_manager.EMBEDDING) + embedding_queue = cast(Any, queue_manager.get_queue(queue_manager.EMBEDDING)) - parent_uri = VikingURI(uri).parent.uri + parent = VikingURI(uri).parent + parent_uri = parent.uri if parent else "" owner_space = _owner_space_for_uri(uri, ctx) # Vectorize L0: .abstract.md (abstract) @@ -230,11 +267,14 @@ async def vectorize_file( return queue_manager = get_queue_manager() - embedding_queue = queue_manager.get_queue(queue_manager.EMBEDDING) - viking_fs = get_viking_fs() + embedding_queue = cast(Any, queue_manager.get_queue(queue_manager.EMBEDDING)) + viking_fs = cast(Any, 
get_viking_fs()) + config = get_openviking_config() file_name = summary_dict.get("name") or os.path.basename(file_path) summary = summary_dict.get("summary", "") + owner_space = _owner_space_for_uri(file_path, ctx) + created_at = datetime.now() context = Context( uri=file_path, @@ -242,10 +282,10 @@ async def vectorize_file( is_leaf=True, abstract=summary, context_type=context_type, - created_at=datetime.now(), + created_at=created_at, user=ctx.user, account_id=ctx.account_id, - owner_space=_owner_space_for_uri(file_path, ctx), + owner_space=owner_space, ) content_type = get_resource_content_type(file_name) @@ -271,7 +311,56 @@ async def vectorize_file( content = await viking_fs.read_file(file_path, ctx=ctx) if isinstance(content, bytes): content = content.decode("utf-8", errors="replace") - context.set_vectorize(Vectorize(text=content)) + chunks = _chunk_text(content, config.file_chunk_chars, config.file_chunk_overlap) + if len(chunks) <= 1: + context.set_vectorize(Vectorize(text=content)) + else: + if summary: + context.set_vectorize(Vectorize(text=summary)) + if not await _enqueue_context( + context, + embedding_queue, + semantic_msg_id=semantic_msg_id, + ): + return + enqueued = True + logger.debug( + "Enqueued canonical summary vector for chunked file: %s", file_path + ) + + for index, chunk_text in enumerate(chunks): + chunk_context = Context( + uri=f"{file_path}#chunk_{index:04d}", + parent_uri=parent_uri, + is_leaf=True, + abstract=summary, + context_type=context_type, + created_at=created_at, + user=ctx.user, + account_id=ctx.account_id, + owner_space=owner_space, + meta={ + "chunk_index": index, + "chunk_count": len(chunks), + "source_uri": file_path, + }, + level=ContextLevel.DETAIL, + ) + chunk_context.set_vectorize(Vectorize(text=chunk_text)) + if not await _enqueue_context( + chunk_context, + embedding_queue, + semantic_msg_id=semantic_msg_id, + ): + continue + enqueued = True + + logger.debug( + "Enqueued %d chunk vectors for long text file: %s", + 
len(chunks), + file_path, + ) + return except Exception as e: logger.warning( f"Failed to read file content for {file_path}, falling back to summary: {e}" @@ -290,12 +379,8 @@ async def vectorize_file( logger.debug(f"Skipping file {file_path} (no text content or summary)") return - embedding_msg = EmbeddingMsgConverter.from_context(context) - if not embedding_msg: + if not await _enqueue_context(context, embedding_queue, semantic_msg_id=semantic_msg_id): return - - embedding_msg.semantic_msg_id = semantic_msg_id - await embedding_queue.enqueue(embedding_msg) enqueued = True logger.debug(f"Enqueued file for vectorization: {file_path}") @@ -316,7 +401,7 @@ async def index_resource( 1. Reads .abstract.md and .overview.md and vectorizes them. 2. Scans files in the directory and vectorizes them. """ - viking_fs = get_viking_fs() + viking_fs = cast(Any, get_viking_fs()) # 1. Index Directory Metadata abstract_uri = f"{uri}/.abstract.md" diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py index b93889f7..aef493b3 100644 --- a/openviking_cli/utils/config/open_viking_config.py +++ b/openviking_cli/utils/config/open_viking_config.py @@ -5,7 +5,7 @@ from threading import Lock from typing import Any, Dict, Optional -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator from openviking_cli.session.user_id import UserIdentifier @@ -127,6 +127,16 @@ class OpenVikingConfig(BaseModel): default=3600, description="Interval (seconds) to check for expired memories" ) + file_chunk_chars: int = Field( + default=4000, + description="Maximum characters per chunk when vectorizing long text files", + ) + + file_chunk_overlap: int = Field( + default=400, + description="Overlapping characters between adjacent file chunks", + ) + language_fallback: str = Field( default="en", description=( @@ -139,6 +149,17 @@ class OpenVikingConfig(BaseModel): model_config = {"arbitrary_types_allowed": True, 
"extra": "forbid"} + @model_validator(mode="after") + def validate_chunk_settings(self) -> "OpenVikingConfig": + """Validate file chunking settings used during file vectorization.""" + if self.file_chunk_chars <= 0: + raise ValueError("file_chunk_chars must be positive") + if self.file_chunk_overlap < 0: + raise ValueError("file_chunk_overlap must be non-negative") + if self.file_chunk_overlap >= self.file_chunk_chars: + raise ValueError("file_chunk_overlap must be smaller than file_chunk_chars") + return self + @classmethod def from_dict(cls, config: Dict[str, Any]) -> "OpenVikingConfig": """Create configuration from dictionary.""" diff --git a/tests/misc/test_file_chunk_vectorization.py b/tests/misc/test_file_chunk_vectorization.py new file mode 100644 index 00000000..7337f202 --- /dev/null +++ b/tests/misc/test_file_chunk_vectorization.py @@ -0,0 +1,89 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for long-file chunked vectorization.""" + +from types import SimpleNamespace +from typing import Any + +import pytest + +from openviking.server.identity import RequestContext, Role +from openviking.utils.embedding_utils import vectorize_file +from openviking_cli.session.user_id import UserIdentifier + + +class DummyEmbeddingQueue: + """Collect embedding messages sent by vectorize_file().""" + + def __init__(self) -> None: + self.messages: list[Any] = [] + + async def enqueue(self, msg) -> None: + self.messages.append(msg) + + +class DummyQueueManager: + """Minimal queue manager stub for embedding tests.""" + + EMBEDDING = "embedding" + + def __init__(self) -> None: + self.queue = DummyEmbeddingQueue() + + def get_queue(self, name): + assert name == self.EMBEDDING + return self.queue + + +class DummyVikingFS: + """Minimal filesystem stub returning a fixed file body.""" + + def __init__(self, content: str) -> None: + self.content = content + + async def read_file(self, file_path: str, ctx=None) -> 
str: + return self.content + + +@pytest.mark.asyncio +async def test_vectorize_file_chunks_long_text_and_preserves_base_summary(monkeypatch): + queue_manager = DummyQueueManager() + viking_fs = DummyVikingFS("ABCDEFGHIJ1234567890KLMNOPQRST") + config = SimpleNamespace(file_chunk_chars=10, file_chunk_overlap=2) + ctx = RequestContext(user=UserIdentifier("acc1", "user1", "agent1"), role=Role.USER) + + monkeypatch.setattr("openviking.utils.embedding_utils.get_queue_manager", lambda: queue_manager) + monkeypatch.setattr("openviking.utils.embedding_utils.get_viking_fs", lambda: viking_fs) + monkeypatch.setattr("openviking.utils.embedding_utils.get_openviking_config", lambda: config) + + await vectorize_file( + file_path="viking://resources/demo/long.md", + summary_dict={"name": "long.md", "summary": "long file summary"}, + parent_uri="viking://resources/demo", + ctx=ctx, + ) + + messages = queue_manager.queue.messages + assert len(messages) == 5 + + assert messages[0].context_data["uri"] == "viking://resources/demo/long.md" + assert messages[0].message == "long file summary" + + chunk_uris = [msg.context_data["uri"] for msg in messages[1:]] + assert chunk_uris == [ + "viking://resources/demo/long.md#chunk_0000", + "viking://resources/demo/long.md#chunk_0001", + "viking://resources/demo/long.md#chunk_0002", + "viking://resources/demo/long.md#chunk_0003", + ] + assert [msg.message for msg in messages[1:]] == [ + "ABCDEFGHIJ", + "IJ12345678", + "7890KLMNOP", + "OPQRST", + ] + assert all(msg.context_data["parent_uri"] == "viking://resources/demo" for msg in messages[1:]) + assert messages[1].context_data["meta"]["chunk_index"] == 0 + assert messages[1].context_data["meta"]["chunk_count"] == 4 + assert messages[1].context_data["meta"]["source_uri"] == "viking://resources/demo/long.md" diff --git a/tests/misc/test_openviking_config_file_chunking.py b/tests/misc/test_openviking_config_file_chunking.py new file mode 100644 index 00000000..0c7955f7 --- /dev/null +++ 
b/tests/misc/test_openviking_config_file_chunking.py @@ -0,0 +1,52 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for file chunking settings in OpenVikingConfig.""" + +import pytest + +from openviking_cli.utils.config.open_viking_config import OpenVikingConfig + + +def _minimal_config_dict() -> dict: + return { + "embedding": { + "dense": { + "provider": "openai", + "api_key": "test-key", + "model": "text-embedding-3-small", + "dimension": 1536, + } + } + } + + +def test_openviking_config_file_chunk_defaults(): + config = OpenVikingConfig.from_dict(_minimal_config_dict()) + + assert config.file_chunk_chars == 4000 + assert config.file_chunk_overlap == 400 + + +def test_openviking_config_accepts_custom_file_chunk_settings(): + config = OpenVikingConfig.from_dict( + { + **_minimal_config_dict(), + "file_chunk_chars": 2048, + "file_chunk_overlap": 256, + } + ) + + assert config.file_chunk_chars == 2048 + assert config.file_chunk_overlap == 256 + + +def test_openviking_config_rejects_invalid_file_chunk_overlap(): + with pytest.raises(ValueError, match="file_chunk_overlap must be smaller than file_chunk_chars"): + OpenVikingConfig.from_dict( + { + **_minimal_config_dict(), + "file_chunk_chars": 512, + "file_chunk_overlap": 512, + } + ) diff --git a/tests/retrieve/test_hierarchical_retriever_chunk_collapse.py b/tests/retrieve/test_hierarchical_retriever_chunk_collapse.py new file mode 100644 index 00000000..5ff9f817 --- /dev/null +++ b/tests/retrieve/test_hierarchical_retriever_chunk_collapse.py @@ -0,0 +1,148 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for chunk result collapse in hierarchical retrieval.""" + +from typing import Any, cast + +import pytest + +from openviking.retrieve.hierarchical_retriever import HierarchicalRetriever +from openviking.server.identity import RequestContext, Role +from openviking_cli.retrieve.types import ContextType, TypedQuery +from openviking_cli.session.user_id import UserIdentifier + + +class DummyStorage: + """Minimal storage stub for retrieval collapse tests.""" + + def __init__(self) -> None: + self.collection_name = "context" + + async def collection_exists_bound(self) -> bool: + return True + + async def search_global_roots_in_tenant( + self, + ctx, + query_vector=None, + sparse_query_vector=None, + context_type=None, + target_directories=None, + extra_filter=None, + limit: int = 10, + ): + return [] + + async def search_children_in_tenant( + self, + ctx, + parent_uri: str, + query_vector=None, + sparse_query_vector=None, + context_type=None, + target_directories=None, + extra_filter=None, + limit: int = 10, + ): + if parent_uri != "viking://resources/demo": + return [] + return [ + { + "uri": "viking://resources/demo/guide.md#chunk_0000", + "context_type": "resource", + "level": 2, + "abstract": "", + "_score": 0.95, + }, + { + "uri": "viking://resources/demo/guide.md#chunk_0001", + "context_type": "resource", + "level": 2, + "abstract": "", + "_score": 0.94, + }, + { + "uri": "viking://resources/demo/guide.md", + "context_type": "resource", + "level": 2, + "abstract": "Guide summary", + "_score": 0.75, + }, + { + "uri": "viking://resources/demo/faq.md", + "context_type": "resource", + "level": 2, + "abstract": "FAQ summary", + "_score": 0.80, + }, + ] + + +@pytest.mark.asyncio +async def test_convert_to_matched_contexts_collapses_chunk_hits(monkeypatch): + retriever = HierarchicalRetriever( + storage=cast(Any, DummyStorage()), embedder=None, rerank_config=None + ) + ctx = RequestContext(user=UserIdentifier("acc1", 
"user1", "agent1"), role=Role.USER) + + monkeypatch.setattr("openviking.retrieve.hierarchical_retriever.get_viking_fs", lambda: None) + + results = await retriever._convert_to_matched_contexts( + [ + { + "uri": "viking://resources/demo/guide.md#chunk_0000", + "context_type": "resource", + "level": 2, + "abstract": "", + "_final_score": 0.95, + }, + { + "uri": "viking://resources/demo/guide.md", + "context_type": "resource", + "level": 2, + "abstract": "Guide summary", + "_final_score": 0.75, + }, + { + "uri": "viking://resources/demo/faq.md", + "context_type": "resource", + "level": 2, + "abstract": "FAQ summary", + "_final_score": 0.80, + }, + ], + ctx=ctx, + ) + + assert [result.uri for result in results] == [ + "viking://resources/demo/guide.md", + "viking://resources/demo/faq.md", + ] + assert results[0].abstract == "Guide summary" + + +@pytest.mark.asyncio +async def test_retrieve_returns_enough_unique_results_after_chunk_collapse(monkeypatch): + retriever = HierarchicalRetriever( + storage=cast(Any, DummyStorage()), embedder=None, rerank_config=None + ) + ctx = RequestContext(user=UserIdentifier("acc1", "user1", "agent1"), role=Role.USER) + + monkeypatch.setattr("openviking.retrieve.hierarchical_retriever.get_viking_fs", lambda: None) + + result = await retriever.retrieve( + TypedQuery( + query="guide", + context_type=ContextType.RESOURCE, + intent="", + target_directories=["viking://resources/demo"], + ), + ctx=ctx, + limit=2, + ) + + assert [matched.uri for matched in result.matched_contexts] == [ + "viking://resources/demo/guide.md", + "viking://resources/demo/faq.md", + ] From e527a9eb7e91d281360c06e9b16187666a291ca5 Mon Sep 17 00:00:00 2001 From: mildred522 <1037950115@qq.com> Date: Sun, 22 Mar 2026 12:54:14 +0800 Subject: [PATCH 2/3] style(retrieval): apply ruff formatting --- openviking/utils/embedding_utils.py | 4 +++- tests/misc/test_openviking_config_file_chunking.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git 
a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index e8319c42..d6d362cc 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -311,7 +311,9 @@ async def vectorize_file( content = await viking_fs.read_file(file_path, ctx=ctx) if isinstance(content, bytes): content = content.decode("utf-8", errors="replace") - chunks = _chunk_text(content, config.file_chunk_chars, config.file_chunk_overlap) + chunks = _chunk_text( + content, config.file_chunk_chars, config.file_chunk_overlap + ) if len(chunks) <= 1: context.set_vectorize(Vectorize(text=content)) else: diff --git a/tests/misc/test_openviking_config_file_chunking.py b/tests/misc/test_openviking_config_file_chunking.py index 0c7955f7..88a73264 100644 --- a/tests/misc/test_openviking_config_file_chunking.py +++ b/tests/misc/test_openviking_config_file_chunking.py @@ -42,7 +42,9 @@ def test_openviking_config_accepts_custom_file_chunk_settings(): def test_openviking_config_rejects_invalid_file_chunk_overlap(): - with pytest.raises(ValueError, match="file_chunk_overlap must be smaller than file_chunk_chars"): + with pytest.raises( + ValueError, match="file_chunk_overlap must be smaller than file_chunk_chars" + ): OpenVikingConfig.from_dict( { **_minimal_config_dict(), From e8b72e2b627d10c24ec5dbb0718c867be0520b8e Mon Sep 17 00:00:00 2001 From: mildred522 <1037950115@qq.com> Date: Sun, 22 Mar 2026 13:15:51 +0800 Subject: [PATCH 3/3] chore: merge upstream main and fix formatter drift --- openviking_cli/utils/config/open_viking_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openviking_cli/utils/config/open_viking_config.py b/openviking_cli/utils/config/open_viking_config.py index 14fb7601..b13585a5 100644 --- a/openviking_cli/utils/config/open_viking_config.py +++ b/openviking_cli/utils/config/open_viking_config.py @@ -103,7 +103,8 @@ class OpenVikingConfig(BaseModel): ) feishu: FeishuConfig = Field( - default_factory=lambda: 
FeishuConfig(), description="Feishu/Lark document parsing configuration" + default_factory=lambda: FeishuConfig(), + description="Feishu/Lark document parsing configuration", ) semantic: SemanticConfig = Field(