diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py
index d0b9bca3..68e4b060 100644
--- a/openviking/utils/embedding_utils.py
+++ b/openviking/utils/embedding_utils.py
@@ -16,6 +16,7 @@
 from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter
 from openviking.storage.viking_fs import get_viking_fs
 from openviking_cli.utils import VikingURI, get_logger
+from openviking_cli.utils.config import get_openviking_config
 
 logger = get_logger(__name__)
 
@@ -219,8 +220,11 @@ async def vectorize_file(
     Vectorize a single file.
 
     Creates Context object for the file and enqueues it.
-    If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario).
-    Otherwise reads raw file content for TEXT files, falls back to summary on failure.
+    The effective vectorization strategy is resolved once from either the explicit
+    `use_summary` flag (code path override, treated as "summary_only") or
+    `embedding.text_source` in the config. For TEXT files, "summary_first" and
+    "summary_only" embed the summary when one is available; in every other case the
+    raw file content is read and capped at `embedding.max_input_chars` characters.
     """
     enqueued = False
 
@@ -249,6 +253,20 @@ async def vectorize_file(
         )
 
         content_type = get_resource_content_type(file_name)
+        embedding_cfg = get_openviking_config().embedding
+        configured_text_source = getattr(embedding_cfg, "text_source", "summary_first")
+        effective_text_source = "summary_only" if use_summary else configured_text_source
+        max_input_chars = int(getattr(embedding_cfg, "max_input_chars", 1000) or 1000)
+        truncation_marker = "\n...(truncated for embedding)"
+
+        def _truncate_text(value: str) -> str:
+            """Cap `value` at `max_input_chars` characters, marker included."""
+            if len(value) <= max_input_chars:
+                return value
+            # Reserve room for the marker so the limit is a true upper bound.
+            keep = max(max_input_chars - len(truncation_marker), 0)
+            return (value[:keep] + truncation_marker)[:max_input_chars]
+
         if content_type is None:
             # Unsupported file type: fall back to summary if available
             if summary:
@@ -262,15 +280,15 @@ async def vectorize_file(
                 )
                 return
         elif content_type == ResourceContentType.TEXT:
-            if use_summary and summary:
-                # Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding
+            if summary and effective_text_source in {"summary_first", "summary_only"}:
                 context.set_vectorize(Vectorize(text=summary))
             else:
-                # Default: read raw file content
+                # Read raw file content and apply configured truncation guard.
                 try:
                     content = await viking_fs.read_file(file_path, ctx=ctx)
                     if isinstance(content, bytes):
                         content = content.decode("utf-8", errors="replace")
+                    content = _truncate_text(content)
                     context.set_vectorize(Vectorize(text=content))
                 except Exception as e:
                     logger.warning(
diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py
index 14435438..db80386b 100644
--- a/openviking_cli/utils/config/embedding_config.py
+++ b/openviking_cli/utils/config/embedding_config.py
@@ -220,6 +220,15 @@ class EmbeddingConfig(BaseModel):
     max_concurrent: int = Field(
         default=10, description="Maximum number of concurrent embedding requests"
     )
+    text_source: str = Field(
+        default="summary_first",
+        description="Text source for file vectorization: summary_first|summary_only|content_only",
+    )
+    max_input_chars: int = Field(
+        default=1000,
+        ge=100,
+        description="Maximum characters sent to embeddings when raw text fallback is used",
+    )
 
     model_config = {"extra": "forbid"}
 
@@ -230,6 +239,10 @@ def validate_config(self):
             raise ValueError(
                 "At least one embedding configuration (dense, sparse, or hybrid) is required"
             )
+        if self.text_source not in {"summary_first", "summary_only", "content_only"}:
+            raise ValueError(
+                "embedding.text_source must be one of: summary_first, summary_only, content_only"
+            )
         return self
 
     def _create_embedder(
diff --git a/tests/unit/test_embedding_vectorize_strategy.py b/tests/unit/test_embedding_vectorize_strategy.py
new file mode 100644
index 00000000..53aca072
--- /dev/null
+++ b/tests/unit/test_embedding_vectorize_strategy.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+
+from openviking_cli.utils.config.embedding_config import EmbeddingConfig, EmbeddingModelConfig
+
+
+def _cfg(**kwargs):
+    return EmbeddingConfig(
+        dense=EmbeddingModelConfig(
+            provider="openai",
+            model="text-embedding-3-small",
+            api_base="http://localhost:8080/v1",
+            dimension=1536,
+        ),
+        **kwargs,
+    )
+
+
+def test_embedding_text_source_validation_accepts_supported_values():
+    for value in ["summary_first", "summary_only", "content_only"]:
+        cfg = _cfg(text_source=value)
+        assert cfg.text_source == value
+
+
+@pytest.mark.parametrize("bad_value", ["summary", "content", "auto", ""])
+def test_embedding_text_source_validation_rejects_invalid_values(bad_value):
+    with pytest.raises(ValueError, match="embedding.text_source"):
+        _cfg(text_source=bad_value)
+
+
+def test_embedding_max_input_chars_validation_accepts_reasonable_value():
+    cfg = _cfg(max_input_chars=1000)
+    assert cfg.max_input_chars == 1000
+
+
+def test_embedding_max_input_chars_validation_rejects_too_small_value():
+    with pytest.raises(ValueError):
+        _cfg(max_input_chars=10)
diff --git a/tests/unit/test_vectorize_file_strategy.py b/tests/unit/test_vectorize_file_strategy.py
new file mode 100644
index 00000000..5be9a387
--- /dev/null
+++ b/tests/unit/test_vectorize_file_strategy.py
@@ -0,0 +1,111 @@
+import types
+
+import pytest
+
+from openviking.core.context import Context
+from openviking.utils import embedding_utils
+
+
+class DummyQueue:
+    def __init__(self):
+        self.items = []
+
+    async def enqueue(self, msg):
+        self.items.append(msg)
+
+
+class DummyQueueManager:
+    EMBEDDING = "embedding"
+
+    def __init__(self, queue):
+        self._queue = queue
+
+    def get_queue(self, _name):
+        return self._queue
+
+
+class DummyFS:
+    def __init__(self, content):
+        self.content = content
+
+    async def read_file(self, _path, ctx=None):
+        return self.content
+
+
+class DummyUser:
+    account_id = "default"
+
+    def user_space_name(self):
+        return "user/default"
+
+    def agent_space_name(self):
+        return "agent/default"
+
+
+class DummyReq:
+    def __init__(self):
+        self.user = DummyUser()
+        self.account_id = "default"
+
+
+@pytest.mark.asyncio
+async def test_vectorize_file_uses_summary_first(monkeypatch):
+    queue = DummyQueue()
+    monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue))
+    monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS("X" * 5000))
+    monkeypatch.setattr(
+        embedding_utils,
+        "get_openviking_config",
+        lambda: types.SimpleNamespace(
+            embedding=types.SimpleNamespace(text_source="summary_first", max_input_chars=1000)
+        ),
+    )
+    monkeypatch.setattr(
+        embedding_utils.EmbeddingMsgConverter,
+        "from_context",
+        lambda context: context,
+    )
+
+    await embedding_utils.vectorize_file(
+        file_path="viking://user/default/resources/test.md",
+        summary_dict={"name": "test.md", "summary": "short summary"},
+        parent_uri="viking://user/default/resources",
+        ctx=DummyReq(),
+    )
+
+    assert len(queue.items) == 1
+    assert isinstance(queue.items[0], Context)
+    assert queue.items[0].get_vectorization_text() == "short summary"
+
+
+@pytest.mark.asyncio
+async def test_vectorize_file_truncates_content_when_content_only(monkeypatch):
+    queue = DummyQueue()
+    monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue))
+    monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS("A" * 1500))
+    monkeypatch.setattr(
+        embedding_utils,
+        "get_openviking_config",
+        lambda: types.SimpleNamespace(
+            embedding=types.SimpleNamespace(text_source="content_only", max_input_chars=1000)
+        ),
+    )
+    monkeypatch.setattr(
+        embedding_utils.EmbeddingMsgConverter,
+        "from_context",
+        lambda context: context,
+    )
+
+    await embedding_utils.vectorize_file(
+        file_path="viking://user/default/resources/test.md",
+        summary_dict={"name": "test.md", "summary": "short summary"},
+        parent_uri="viking://user/default/resources",
+        ctx=DummyReq(),
+    )
+
+    assert len(queue.items) == 1
+    text = queue.items[0].get_vectorization_text()
+    marker = "\n...(truncated for embedding)"
+    # The marker counts toward the limit, so the payload never exceeds max_input_chars.
+    assert len(text) == 1000
+    assert text == "A" * (1000 - len(marker)) + marker