volcengine · ningfeemic-dev · Mar 22, 2026 · Mar 22, 2026
diff --git a/openviking/utils/embedding_utils.py b/openviking/utils/embedding_utils.py
@@ -16,6 +16,7 @@
 from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter
 from openviking.storage.viking_fs import get_viking_fs
 from openviking_cli.utils import VikingURI, get_logger
+from openviking_cli.utils.config import get_openviking_config
 
 logger = get_logger(__name__)
 
@@ -219,8 +220,8 @@ async def vectorize_file(
     Vectorize a single file.
 
     Creates Context object for the file and enqueues it.
-    If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario).
-    Otherwise reads raw file content for TEXT files, falls back to summary on failure.
+    The text source is resolved once per call: `use_summary=True` forces "summary_only";
+    otherwise the embedding config's `text_source` (default "summary_first") applies.
     """
     enqueued = False
 
@@ -249,6 +250,16 @@ async def vectorize_file(
         )
 
         content_type = get_resource_content_type(file_name)
+        embedding_cfg = get_openviking_config().embedding
+        configured_text_source = getattr(embedding_cfg, "text_source", "summary_first")
+        effective_text_source = "summary_only" if use_summary else configured_text_source
+        max_input_chars = int(getattr(embedding_cfg, "max_input_chars", 1000) or 1000)
+
+        def _truncate_text(value: str) -> str:  # keep first max_input_chars chars + marker
+            if len(value) <= max_input_chars:
+                return value  # short enough: returned unchanged, no marker
+            return value[:max_input_chars] + "\n...(truncated for embedding)"  # marker is extra
+
         if content_type is None:
             # Unsupported file type: fall back to summary if available
             if summary:
@@ -262,15 +273,15 @@ async def vectorize_file(
                 )
                 return
         elif content_type == ResourceContentType.TEXT:
-            if use_summary and summary:
-                # Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding
+            if summary and effective_text_source in {"summary_first", "summary_only"}:
                 context.set_vectorize(Vectorize(text=summary))
             else:
-                # Default: read raw file content
+                # Read raw file content and apply configured truncation guard.
                 try:
                     content = await viking_fs.read_file(file_path, ctx=ctx)
                     if isinstance(content, bytes):
                         content = content.decode("utf-8", errors="replace")
+                    content = _truncate_text(content)
                     context.set_vectorize(Vectorize(text=content))
                 except Exception as e:
                     logger.warning(

diff --git a/openviking_cli/utils/config/embedding_config.py b/openviking_cli/utils/config/embedding_config.py
@@ -220,6 +220,15 @@ class EmbeddingConfig(BaseModel):
     max_concurrent: int = Field(
         default=10, description="Maximum number of concurrent embedding requests"
     )
+    text_source: str = Field(
+        default="summary_first",
+        description="Text source for file vectorization: summary_first|summary_only|content_only",
+    )
+    max_input_chars: int = Field(
+        default=1000,
+        ge=100,
+        description="Maximum characters sent to embeddings when raw text fallback is used",
+    )
 
     model_config = {"extra": "forbid"}
 
@@ -230,6 +239,10 @@ def validate_config(self):
             raise ValueError(
                 "At least one embedding configuration (dense, sparse, or hybrid) is required"
             )
+        if self.text_source not in {"summary_first", "summary_only", "content_only"}:
+            raise ValueError(
+                "embedding.text_source must be one of: summary_first, summary_only, content_only"
+            )
         return self
 
     def _create_embedder(

diff --git a/tests/unit/test_embedding_vectorize_strategy.py b/tests/unit/test_embedding_vectorize_strategy.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Validation tests for the embedding `text_source` and `max_input_chars` settings."""
+
+import pytest
+
+from openviking_cli.utils.config.embedding_config import EmbeddingConfig, EmbeddingModelConfig
+
+
+def _cfg(**kwargs):
+    """Build an EmbeddingConfig with a fixed dense model plus the given field overrides."""
+    return EmbeddingConfig(
+        dense=EmbeddingModelConfig(
+            provider="openai",
+            model="text-embedding-3-small",
+            api_base="http://localhost:8080/v1",
+            dimension=1536,
+        ),
+        **kwargs,
+    )
+
+
+# Parametrized so a failure names the offending value rather than stopping at the first.
+@pytest.mark.parametrize("value", ["summary_first", "summary_only", "content_only"])
+def test_embedding_text_source_validation_accepts_supported_values(value):
+    """Each supported text_source value is accepted and stored as given."""
+    assert _cfg(text_source=value).text_source == value
+
+
+@pytest.mark.parametrize("bad_value", ["summary", "content", "auto", ""])
+def test_embedding_text_source_validation_rejects_invalid_values(bad_value):
+    """Values outside the supported set are rejected with the text_source error."""
+    with pytest.raises(ValueError, match=r"embedding\.text_source"):
+        _cfg(text_source=bad_value)
+
+
+# 100 is the lower bound declared on the field (`ge=100`), so it must still be accepted.
+@pytest.mark.parametrize("value", [100, 1000])
+def test_embedding_max_input_chars_validation_accepts_reasonable_value(value):
+    """Values at or above the lower bound are accepted and stored as given."""
+    assert _cfg(max_input_chars=value).max_input_chars == value
+
+
+@pytest.mark.parametrize("bad_value", [10, 99])
+def test_embedding_max_input_chars_validation_rejects_too_small_value(bad_value):
+    """Values below the lower bound are rejected."""
+    # Match on the field name so an unrelated validation error cannot satisfy the test.
+    with pytest.raises(ValueError, match="max_input_chars"):
+        _cfg(max_input_chars=bad_value)
diff --git a/tests/unit/test_vectorize_file_strategy.py b/tests/unit/test_vectorize_file_strategy.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests for how vectorize_file chooses and truncates the text it enqueues."""
+
+import types
+
+import pytest
+
+from openviking.core.context import Context
+from openviking.utils import embedding_utils
+
+# Marker that vectorize_file appends to raw content it has cut short.
+TRUNCATION_MARKER = "\n...(truncated for embedding)"
+
+
+class DummyQueue:
+    """In-memory queue that records every enqueued message."""
+
+    def __init__(self):
+        self.items = []
+
+    async def enqueue(self, msg):
+        self.items.append(msg)
+
+
+class DummyQueueManager:
+    """Queue manager stand-in that returns the same queue for any name."""
+
+    EMBEDDING = "embedding"
+
+    def __init__(self, queue):
+        self._queue = queue
+
+    def get_queue(self, _name):
+        return self._queue
+
+
+class DummyFS:
+    """File-system stand-in whose read_file always returns the configured content."""
+
+    def __init__(self, content):
+        self.content = content
+
+    async def read_file(self, _path, ctx=None):
+        return self.content
+
+
+class DummyUser:
+    """User stand-in with a fixed account id and fixed space names."""
+
+    account_id = "default"
+
+    def user_space_name(self):
+        return "user/default"
+
+    def agent_space_name(self):
+        return "agent/default"
+
+
+class DummyReq:
+    """Request-context stand-in carrying a DummyUser and an account id."""
+
+    def __init__(self):
+        self.user = DummyUser()
+        self.account_id = "default"
+
+
+def _install_fakes(monkeypatch, *, content, text_source, max_input_chars=1000):
+    """Patch embedding_utils with the fakes above and return the capturing queue."""
+    queue = DummyQueue()
+    monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue))
+    monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS(content))
+    embedding_cfg = types.SimpleNamespace(text_source=text_source, max_input_chars=max_input_chars)
+    monkeypatch.setattr(
+        embedding_utils,
+        "get_openviking_config",
+        lambda: types.SimpleNamespace(embedding=embedding_cfg),
+    )
+    # Hand the Context through unconverted so the tests can inspect it directly.
+    monkeypatch.setattr(
+        embedding_utils.EmbeddingMsgConverter,
+        "from_context",
+        lambda context: context,
+    )
+    return queue
+
+
+async def _vectorize_sample_file():
+    """Run vectorize_file on a fixed markdown URI whose summary is `short summary`."""
+    await embedding_utils.vectorize_file(
+        file_path="viking://user/default/resources/test.md",
+        summary_dict={"name": "test.md", "summary": "short summary"},
+        parent_uri="viking://user/default/resources",
+        ctx=DummyReq(),
+    )
+
+
+@pytest.mark.asyncio
+async def test_vectorize_file_uses_summary_first(monkeypatch):
+    """With summary_first and a summary present, the summary is what gets enqueued."""
+    queue = _install_fakes(monkeypatch, content="X" * 5000, text_source="summary_first")
+
+    await _vectorize_sample_file()
+
+    assert len(queue.items) == 1
+    assert isinstance(queue.items[0], Context)
+    assert queue.items[0].get_vectorization_text() == "short summary"
+
+
+@pytest.mark.asyncio
+async def test_vectorize_file_truncates_content_when_content_only(monkeypatch):
+    """With content_only, over-long content is cut to max_input_chars plus the marker."""
+    queue = _install_fakes(monkeypatch, content="A" * 1500, text_source="content_only")
+
+    await _vectorize_sample_file()
+
+    assert len(queue.items) == 1
+    # Exact match: a prefix/suffix check would still pass if the cut happened too late.
+    assert queue.items[0].get_vectorization_text() == "A" * 1000 + TRUNCATION_MARKER
+
+
+@pytest.mark.asyncio
+async def test_vectorize_file_keeps_content_within_limit_when_content_only(monkeypatch):
+    """Content of exactly max_input_chars characters is enqueued unchanged."""
+    queue = _install_fakes(monkeypatch, content="A" * 1000, text_source="content_only")
+
+    await _vectorize_sample_file()
+
+    assert len(queue.items) == 1
+    assert queue.items[0].get_vectorization_text() == "A" * 1000