Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions openviking/utils/embedding_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from openviking.storage.queuefs.embedding_msg_converter import EmbeddingMsgConverter
from openviking.storage.viking_fs import get_viking_fs
from openviking_cli.utils import VikingURI, get_logger
from openviking_cli.utils.config import get_openviking_config

logger = get_logger(__name__)

Expand Down Expand Up @@ -219,8 +220,8 @@ async def vectorize_file(
Vectorize a single file.

Creates Context object for the file and enqueues it.
If use_summary=True and summary is available, uses summary for TEXT files (e.g. code scenario).
Otherwise reads raw file content for TEXT files, falls back to summary on failure.
The effective vectorization strategy is resolved once from either the explicit
`use_summary` flag (code path override) or the embedding config.
"""
enqueued = False

Expand Down Expand Up @@ -249,6 +250,16 @@ async def vectorize_file(
)

content_type = get_resource_content_type(file_name)
embedding_cfg = get_openviking_config().embedding
configured_text_source = getattr(embedding_cfg, "text_source", "summary_first")
effective_text_source = "summary_only" if use_summary else configured_text_source
max_input_chars = int(getattr(embedding_cfg, "max_input_chars", 1000) or 1000)

def _truncate_text(value: str) -> str:
    """Clip ``value`` to ``max_input_chars`` characters and mark it as clipped.

    Text at or under the limit is returned unchanged. Longer text is cut to
    the first ``max_input_chars`` characters and a truncation marker is
    appended, so a clipped result is longer than the limit by the marker.
    """
    exceeds_limit = len(value) > max_input_chars
    if not exceeds_limit:
        return value
    clipped = value[:max_input_chars]
    return clipped + "\n...(truncated for embedding)"

if content_type is None:
# Unsupported file type: fall back to summary if available
if summary:
Expand All @@ -262,15 +273,15 @@ async def vectorize_file(
)
return
elif content_type == ResourceContentType.TEXT:
if use_summary and summary:
# Code scenario: use pre-generated summary (e.g. AST skeleton) for embedding
if summary and effective_text_source in {"summary_first", "summary_only"}:
context.set_vectorize(Vectorize(text=summary))
else:
# Default: read raw file content
# Read raw file content and apply configured truncation guard.
try:
content = await viking_fs.read_file(file_path, ctx=ctx)
if isinstance(content, bytes):
content = content.decode("utf-8", errors="replace")
content = _truncate_text(content)
context.set_vectorize(Vectorize(text=content))
except Exception as e:
logger.warning(
Expand Down
13 changes: 13 additions & 0 deletions openviking_cli/utils/config/embedding_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,15 @@ class EmbeddingConfig(BaseModel):
max_concurrent: int = Field(
default=10, description="Maximum number of concurrent embedding requests"
)
text_source: str = Field(
default="summary_first",
description="Text source for file vectorization: summary_first|summary_only|content_only",
)
max_input_chars: int = Field(
default=1000,
ge=100,
description="Maximum characters sent to embeddings when raw text fallback is used",
)

model_config = {"extra": "forbid"}

Expand All @@ -230,6 +239,10 @@ def validate_config(self):
raise ValueError(
"At least one embedding configuration (dense, sparse, or hybrid) is required"
)
if self.text_source not in {"summary_first", "summary_only", "content_only"}:
raise ValueError(
"embedding.text_source must be one of: summary_first, summary_only, content_only"
)
return self

def _create_embedder(
Expand Down
41 changes: 41 additions & 0 deletions tests/unit/test_embedding_vectorize_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: Apache-2.0

import pytest

from openviking_cli.utils.config.embedding_config import EmbeddingConfig, EmbeddingModelConfig


def _cfg(**kwargs):
    """Build an ``EmbeddingConfig`` with a fixed dense model plus overrides.

    All keyword arguments are forwarded unchanged to ``EmbeddingConfig`` so
    each test only has to spell out the field it is exercising.
    """
    dense_model = EmbeddingModelConfig(
        provider="openai",
        model="text-embedding-3-small",
        api_base="http://localhost:8080/v1",
        dimension=1536,
    )
    return EmbeddingConfig(dense=dense_model, **kwargs)


def test_embedding_text_source_validation_accepts_supported_values():
    """Every supported ``text_source`` value is accepted and stored unchanged."""
    supported = ["summary_first", "summary_only", "content_only"]
    for candidate in supported:
        assert _cfg(text_source=candidate).text_source == candidate


@pytest.mark.parametrize("bad_value", ["summary", "content", "auto", ""])
def test_embedding_text_source_validation_rejects_invalid_values(bad_value):
    """Unsupported ``text_source`` values are rejected with a field-specific error.

    Covers near-miss spellings ("summary", "content"), an unrelated word
    ("auto") and the empty string.
    """
    # ``match`` is applied as a regular expression, so the dot is escaped to
    # require the literal "embedding.text_source" prefix of the error message.
    with pytest.raises(ValueError, match=r"embedding\.text_source"):
        _cfg(text_source=bad_value)


def test_embedding_max_input_chars_validation_accepts_reasonable_value():
    """A ``max_input_chars`` of 1000 is accepted and stored unchanged."""
    built = _cfg(max_input_chars=1000)
    stored_limit = built.max_input_chars
    assert stored_limit == 1000


def test_embedding_max_input_chars_validation_rejects_too_small_value():
    """A ``max_input_chars`` value of 10 is rejected at construction time."""
    # Pin the failing field name: a bare ``pytest.raises(ValueError)`` would
    # also pass if construction failed for an unrelated reason.
    with pytest.raises(ValueError, match="max_input_chars"):
        _cfg(max_input_chars=10)
109 changes: 109 additions & 0 deletions tests/unit/test_vectorize_file_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import types

import pytest

from openviking.core.context import Context
from openviking.utils import embedding_utils


class DummyQueue:
    """In-memory queue stand-in that records every enqueued message."""

    def __init__(self) -> None:
        # Messages in arrival order; tests inspect this list directly.
        self.items: list = list()

    async def enqueue(self, msg) -> None:
        """Store ``msg`` at the end of ``items`` instead of dispatching it."""
        self.items.append(msg)


class DummyQueueManager:
    """Queue-manager stub that hands back one fixed queue for any name."""

    # Queue-name constant; presumably mirrors the real manager's attribute —
    # confirm against the production queue manager.
    EMBEDDING = "embedding"

    def __init__(self, queue) -> None:
        # The single queue returned by every ``get_queue`` call.
        self._queue = queue

    def get_queue(self, _name):
        """Return the queue supplied at construction; ``_name`` is ignored."""
        fixed_queue = self._queue
        return fixed_queue


class DummyFS:
    """Filesystem stub whose ``read_file`` always yields the same content."""

    def __init__(self, content) -> None:
        # Payload returned verbatim by every read, whatever path is requested.
        self.content = content

    async def read_file(self, _path, ctx=None):
        """Return the stored content; ``_path`` and ``ctx`` are ignored."""
        payload = self.content
        return payload


class DummyUser:
    """User stub exposing a fixed account id and fixed space names."""

    # Shared by all instances; the tests never vary the account.
    account_id = "default"

    def user_space_name(self) -> str:
        """Return the fixed user space name."""
        space = "user/default"
        return space

    def agent_space_name(self) -> str:
        """Return the fixed agent space name."""
        space = "agent/default"
        return space


class DummyReq:
    """Request-context stub passed as ``ctx`` to ``vectorize_file`` in these tests."""

    def __init__(self) -> None:
        # A fresh DummyUser per request, plus the account id the tests use.
        self.user, self.account_id = DummyUser(), "default"


@pytest.mark.asyncio
async def test_vectorize_file_uses_summary_first(monkeypatch):
    """With ``summary_first`` configured, the supplied summary is what gets vectorized.

    The stub filesystem holds 5000 characters of content, so the enqueued text
    equalling the short summary shows the summary was chosen over the file body.
    """
    sink = DummyQueue()

    def _fake_config():
        # Fresh namespace on every call, exposing only the two embedding
        # settings this test configures.
        embedding = types.SimpleNamespace(text_source="summary_first", max_input_chars=1000)
        return types.SimpleNamespace(embedding=embedding)

    module_stubs = (
        ("get_queue_manager", lambda: DummyQueueManager(sink)),
        ("get_viking_fs", lambda: DummyFS("X" * 5000)),
        ("get_openviking_config", _fake_config),
    )
    for attr_name, stub in module_stubs:
        monkeypatch.setattr(embedding_utils, attr_name, stub)

    # Skip message conversion so the queue receives the Context itself.
    monkeypatch.setattr(
        embedding_utils.EmbeddingMsgConverter,
        "from_context",
        lambda context: context,
    )

    call_kwargs = {
        "file_path": "viking://user/default/resources/test.md",
        "summary_dict": {"name": "test.md", "summary": "short summary"},
        "parent_uri": "viking://user/default/resources",
        "ctx": DummyReq(),
    }
    await embedding_utils.vectorize_file(**call_kwargs)

    assert len(sink.items) == 1
    enqueued = sink.items[0]
    assert isinstance(enqueued, Context)
    assert enqueued.get_vectorization_text() == "short summary"


@pytest.mark.asyncio
async def test_vectorize_file_truncates_content_when_content_only(monkeypatch):
    """With ``content_only`` configured, over-long file content is truncated.

    The stub filesystem returns 1500 characters against a 1000-character
    limit. A summary is also supplied; the assertion on the enqueued text
    shows that the file content, not the summary, was vectorized.
    """
    queue = DummyQueue()
    monkeypatch.setattr(embedding_utils, "get_queue_manager", lambda: DummyQueueManager(queue))
    monkeypatch.setattr(embedding_utils, "get_viking_fs", lambda: DummyFS("A" * 1500))
    monkeypatch.setattr(
        embedding_utils,
        "get_openviking_config",
        lambda: types.SimpleNamespace(
            embedding=types.SimpleNamespace(text_source="content_only", max_input_chars=1000)
        ),
    )
    # Skip message conversion so the queue receives the Context itself.
    monkeypatch.setattr(
        embedding_utils.EmbeddingMsgConverter,
        "from_context",
        lambda context: context,
    )

    await embedding_utils.vectorize_file(
        file_path="viking://user/default/resources/test.md",
        summary_dict={"name": "test.md", "summary": "short summary"},
        parent_uri="viking://user/default/resources",
        ctx=DummyReq(),
    )

    assert len(queue.items) == 1
    text = queue.items[0].get_vectorization_text()
    # Exact comparison: prefix/suffix checks alone would still pass if the
    # marker were appended to the full 1500-character content.
    assert text == "A" * 1000 + "\n...(truncated for embedding)"
Loading