From a92b5f7e238665e20e46866fb872c30cf1c88b72 Mon Sep 17 00:00:00 2001 From: ningfeemic-dev Date: Sun, 22 Mar 2026 11:28:03 +0800 Subject: [PATCH 1/2] feat: make file vectorization strategy configurable --- openviking/utils/embedding_utils.py | 20 +++- .../utils/config/embedding_config.py | 13 +++ .../unit/test_embedding_vectorize_strategy.py | 41 +++++++ tests/unit/test_vectorize_file_strategy.py | 109 ++++++++++++++++++ 4 files changed, 180 insertions(+), 3 deletions(-) create mode 100644 tests/unit/test_embedding_vectorize_strategy.py create mode 100644 tests/unit/test_vectorize_file_strategy.py diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index d0b9bca3..868786e7 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -16,6 +16,7 @@ from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter from openviking.storage.viking_fs import get_viking_fs from openviking_cli.utils import VikingURI, get_logger +from openviking_cli.utils.config import get_openviking_config logger = get_logger(__name__) @@ -220,7 +221,8 @@ async def vectorize_file( Creates Context object for the file and enqueues it. If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario). - Otherwise reads raw file content for TEXT files, falls back to summary on failure. + Otherwise reads raw file content for TEXT files, but now respects embedding strategy config + to avoid oversize embedding failures on long text files. """ enqueued = False @@ -249,6 +251,15 @@ async def vectorize_file( ) content_type = get_resource_content_type(file_name) + embedding_cfg = get_openviking_config().embedding + text_source = getattr(embedding_cfg, "text_source", "summary_first") + max_text_chars = int(getattr(embedding_cfg, "max_text_chars", 1000) or 1000) + + def _truncate_text(value: str) -> str: + if len(value) <= max_text_chars: + return value + return value[:max_text_chars] + "\n...(truncated for embedding)" + if content_type is None: # Unsupported file type: fall back to summary if available if summary: @@ -263,14 +274,17 @@ async def vectorize_file( return elif content_type == ResourceContentType.TEXT: if use_summary and summary: - # Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding + # Explicit code scenario override: use pre-generated summary for embedding + context.set_vectorize(Vectorize(text=summary)) + elif summary and text_source in {"summary_first", "summary_only"}: context.set_vectorize(Vectorize(text=summary)) else: - # Default: read raw file content + # Default: read raw file content, but apply configured truncation guard try: content = await viking_fs.read_file(file_path, ctx=ctx) if isinstance(content, bytes): content = content.decode("utf-8", errors="replace") + content = _truncate_text(content) context.set_vectorize(Vectorize(text=content)) except Exception as e: logger.warning( diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index 14435438..dcf5f2ca 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -220,6 +220,15 @@ class EmbeddingConfig(BaseModel): max_concurrent: int = Field( default=10, description="Maximum number of concurrent embedding requests" ) + text_source: str = Field( + default="summary_first", + description="Text source for file vectorization: summary_first|summary_only|content_only", + ) + max_text_chars: int = Field( + default=1000, + ge=100, + description="Maximum characters sent to embeddings when raw text fallback is used", + ) model_config = {"extra": "forbid"} @@ -230,6 +239,10 @@ def validate_config(self): raise ValueError( "At least one embedding configuration (dense, sparse, or hybrid) is required" ) + if self.text_source not in {"summary_first", "summary_only", "content_only"}: + raise ValueError( + "embedding.text_source must be one of: summary_first, summary_only, content_only" + ) return self def _create_embedder( diff --git a/tests/unit/test_embedding_vectorize_strategy.py b/tests/unit/test_embedding_vectorize_strategy.py new file mode 100644 index 00000000..f9a481ce --- /dev/null +++ b/tests/unit/test_embedding_vectorize_strategy.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from openviking_cli.utils.config.embedding_config import EmbeddingConfig, EmbeddingModelConfig + + +def _cfg(**kwargs): + return EmbeddingConfig( + dense=EmbeddingModelConfig( + provider="openai", + model="text-embedding-3-small", + api_base="http://localhost:8080/v1", + dimension=1536, + ), + **kwargs, + ) + + +def test_embedding_text_source_validation_accepts_supported_values(): + for value in ["summary_first", "summary_only", "content_only"]: + cfg = _cfg(text_source=value) + assert cfg.text_source == value + + +@pytest.mark.parametrize("bad_value", ["summary", "content", "auto", ""]) +def test_embedding_text_source_validation_rejects_invalid_values(bad_value): + with pytest.raises(ValueError, match="embedding.text_source"): + _cfg(text_source=bad_value) + + +def test_embedding_max_text_chars_validation_accepts_reasonable_value(): + cfg = _cfg(max_text_chars=1000) + assert cfg.max_text_chars == 1000 + + +def test_embedding_max_text_chars_validation_rejects_too_small_value(): + with pytest.raises(ValueError): + _cfg(max_text_chars=10) diff --git a/tests/unit/test_vectorize_file_strategy.py b/tests/unit/test_vectorize_file_strategy.py new file mode 100644 index 00000000..a2bbd51a --- /dev/null +++ b/tests/unit/test_vectorize_file_strategy.py @@ -0,0 +1,109 @@ +import types + +import pytest + +from openviking.core.context import Context +from openviking.utils import embedding_utils + + +class DummyQueue: + def __init__(self): + self.items = [] + + async def enqueue(self, msg): + self.items.append(msg) + + +class DummyQueueManager: + EMBEDDING = "embedding" + + def __init__(self, queue): + self._queue = queue + + def get_queue(self, _name): + return self._queue + + +class DummyFS: + def __init__(self, content): + self.content = content + + async def read_file(self, _path, ctx=None): + return self.content + + +class DummyUser: + account_id = "default" + + def user_space_name(self): + return "user/default" + + def agent_space_name(self): + return "agent/default" + + +class DummyReq: + def __init__(self): + self.user = DummyUser() + self.account_id = "default" + + +@pytest.mark.asyncio +async def test_vectorize_file_uses_summary_first(monkeypatch): + queue = DummyQueue() + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS("X" * 5000)) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="summary_first", max_text_chars=1000) + ), + ) + monkeypatch.setattr( + embedding_utils.EmbeddingMsgConverter, + "from_context", + lambda context: context, + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/test.md", + summary_dict={"name": "test.md", "summary": "short summary"}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + assert isinstance(queue.items[0], Context) + assert queue.items[0].get_vectorization_text() == "short summary" + + +@pytest.mark.asyncio +async def test_vectorize_file_truncates_content_when_content_only(monkeypatch): + queue = DummyQueue() + monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue)) + monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS("A" * 1500)) + monkeypatch.setattr( + embedding_utils, + "get_openviking_config", + lambda: types.SimpleNamespace( + embedding=types.SimpleNamespace(text_source="content_only", max_text_chars=1000) + ), + ) + monkeypatch.setattr( + embedding_utils.EmbeddingMsgConverter, + "from_context", + lambda context: context, + ) + + await embedding_utils.vectorize_file( + file_path="viking://user/default/resources/test.md", + summary_dict={"name": "test.md", "summary": "short summary"}, + parent_uri="viking://user/default/resources", + ctx=DummyReq(), + ) + + assert len(queue.items) == 1 + text = queue.items[0].get_vectorization_text() + assert text.startswith("A" * 1000) + assert text.endswith("...(truncated for embedding)") From d9769d04d263716db5fe2a7161ad821592c53325 Mon Sep 17 00:00:00 2001 From: ningfeemic-dev Date: Sun, 22 Mar 2026 13:09:05 +0800 Subject: [PATCH 2/2] refactor: align vectorization config naming and strategy --- openviking/utils/embedding_utils.py | 21 ++++++++----------- .../utils/config/embedding_config.py | 2 +- .../unit/test_embedding_vectorize_strategy.py | 10 ++++----- tests/unit/test_vectorize_file_strategy.py | 4 ++-- 4 files changed, 17 insertions(+), 20 deletions(-) diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py index 868786e7..68e4b060 100644 --- a/openviking/utils/embedding_utils.py +++ b/openviking/utils/embedding_utils.py @@ -220,9 +220,8 @@ async def vectorize_file( Vectorize a single file. Creates Context object for the file and enqueues it. - If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario). - Otherwise reads raw file content for TEXT files, but now respects embedding strategy config - to avoid oversize embedding failures on long text files. + The effective vectorization strategy is resolved once from either the explicit + `use_summary` flag (code path override) or the embedding config. """ enqueued = False @@ -252,13 +251,14 @@ async def vectorize_file( content_type = get_resource_content_type(file_name) embedding_cfg = get_openviking_config().embedding - text_source = getattr(embedding_cfg, "text_source", "summary_first") - max_text_chars = int(getattr(embedding_cfg, "max_text_chars", 1000) or 1000) + configured_text_source = getattr(embedding_cfg, "text_source", "summary_first") + effective_text_source = "summary_only" if use_summary else configured_text_source + max_input_chars = int(getattr(embedding_cfg, "max_input_chars", 1000) or 1000) def _truncate_text(value: str) -> str: - if len(value) <= max_text_chars: + if len(value) <= max_input_chars: return value - return value[:max_text_chars] + "\n...(truncated for embedding)" + return value[:max_input_chars] + "\n...(truncated for embedding)" if content_type is None: # Unsupported file type: fall back to summary if available @@ -273,13 +273,10 @@ def _truncate_text(value: str) -> str: ) return elif content_type == ResourceContentType.TEXT: - if use_summary and summary: - # Explicit code scenario override: use pre-generated summary for embedding - context.set_vectorize(Vectorize(text=summary)) - elif summary and text_source in {"summary_first", "summary_only"}: + if summary and effective_text_source in {"summary_first", "summary_only"}: context.set_vectorize(Vectorize(text=summary)) else: - # Default: read raw file content, but apply configured truncation guard + # Read raw file content and apply configured truncation guard. try: content = await viking_fs.read_file(file_path, ctx=ctx) if isinstance(content, bytes): diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py index dcf5f2ca..db80386b 100644 --- a/openviking_cli/utils/config/embedding_config.py +++ b/openviking_cli/utils/config/embedding_config.py @@ -224,7 +224,7 @@ class EmbeddingConfig(BaseModel): default="summary_first", description="Text source for file vectorization: summary_first|summary_only|content_only", ) - max_text_chars: int = Field( + max_input_chars: int = Field( default=1000, ge=100, description="Maximum characters sent to embeddings when raw text fallback is used", diff --git a/tests/unit/test_embedding_vectorize_strategy.py b/tests/unit/test_embedding_vectorize_strategy.py index f9a481ce..53aca072 100644 --- a/tests/unit/test_embedding_vectorize_strategy.py +++ b/tests/unit/test_embedding_vectorize_strategy.py @@ -31,11 +31,11 @@ def test_embedding_text_source_validation_rejects_invalid_values(bad_value): _cfg(text_source=bad_value) -def test_embedding_max_text_chars_validation_accepts_reasonable_value(): - cfg = _cfg(max_text_chars=1000) - assert cfg.max_text_chars == 1000 +def test_embedding_max_input_chars_validation_accepts_reasonable_value(): + cfg = _cfg(max_input_chars=1000) + assert cfg.max_input_chars == 1000 -def test_embedding_max_text_chars_validation_rejects_too_small_value(): +def test_embedding_max_input_chars_validation_rejects_too_small_value(): with pytest.raises(ValueError): - _cfg(max_text_chars=10) + _cfg(max_input_chars=10) diff --git a/tests/unit/test_vectorize_file_strategy.py b/tests/unit/test_vectorize_file_strategy.py index a2bbd51a..5be9a387 100644 --- a/tests/unit/test_vectorize_file_strategy.py +++ b/tests/unit/test_vectorize_file_strategy.py @@ -57,7 +57,7 @@ async def test_vectorize_file_uses_summary_first(monkeypatch): embedding_utils, "get_openviking_config", lambda: types.SimpleNamespace( - embedding=types.SimpleNamespace(text_source="summary_first", max_text_chars=1000) + embedding=types.SimpleNamespace(text_source="summary_first", max_input_chars=1000) ), ) monkeypatch.setattr( @@ -87,7 +87,7 @@ async def test_vectorize_file_truncates_content_when_content_only(monkeypatch): embedding_utils, "get_openviking_config", lambda: types.SimpleNamespace( - embedding=types.SimpleNamespace(text_source="content_only", max_text_chars=1000) + embedding=types.SimpleNamespace(text_source="content_only", max_input_chars=1000) ), ) monkeypatch.setattr(