From f8e67780517c47f2aa81969625821775ec5af6b2 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Thu, 27 Mar 2025 15:39:15 +0000 Subject: [PATCH 01/11] feat(cli): try to implement chunking with line-ranges metadata. --- src/vectorcode/chunking.py | 162 +++++++++++++++++++++++++++++-------- tests/test_chunking.py | 102 ++++++++++++++--------- 2 files changed, 192 insertions(+), 72 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index ec0a1346..afc6ffc4 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -1,6 +1,7 @@ import os import re from abc import abstractmethod +from dataclasses import dataclass from functools import cache from io import TextIOWrapper from typing import Generator, Optional @@ -8,12 +9,26 @@ from pygments.lexer import Lexer from pygments.lexers import guess_lexer_for_filename from pygments.util import ClassNotFound -from tree_sitter import Node +from tree_sitter import Node, Point from tree_sitter_language_pack import get_parser from vectorcode.cli_utils import Config +@dataclass +class Chunk: + """ + rows are 1-indexed, cols are 0-indexed. 
+ """ + + text: str + start: Point + end: Point + + def __str__(self): + return self.text + + class ChunkerBase: def __init__(self, config: Optional[Config] = None) -> None: if config is None: @@ -24,7 +39,7 @@ def __init__(self, config: Optional[Config] = None) -> None: self.config = config @abstractmethod - def chunk(self, data) -> Generator[str, None, None]: + def chunk(self, data) -> Generator[Chunk, None, None]: raise NotImplementedError @@ -34,16 +49,25 @@ def __init__(self, config: Optional[Config] = None) -> None: config = Config() super().__init__(config) - def chunk(self, data: str) -> Generator[str, None, None]: + def chunk(self, data: str): if self.config.chunk_size < 0: - yield data + yield Chunk( + text=data, + start=Point(row=1, column=0), + end=Point(row=1, column=len(data)), + ) else: step_size = max( 1, int(self.config.chunk_size * (1 - self.config.overlap_ratio)) ) i = 0 while i < len(data): - yield data[i : i + self.config.chunk_size] + chunk_text = data[i : i + self.config.chunk_size] + yield Chunk( + text=chunk_text, + start=Point(row=1, column=i), + end=Point(row=1, column=len(chunk_text) - 1), + ) if i + self.config.chunk_size >= len(data): break i += step_size @@ -55,24 +79,41 @@ def __init__(self, config: Optional[Config] = None) -> None: config = Config() super().__init__(config) - def chunk(self, data: TextIOWrapper) -> Generator[str, None, None]: - if self.config.chunk_size < 0: - yield "".join(data.readlines()) - else: - step_size = max( - 1, int(self.config.chunk_size * (1 - self.config.overlap_ratio)) - ) - # the output of this method should be identical to that of StringChunker.chunk - output = data.read(self.config.chunk_size) - yield output - if len(output) < self.config.chunk_size: - return - while True: - new_chars = data.read(step_size) - output = output[step_size:] + new_chars - yield output - if len(new_chars) < step_size: - return + def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]: + lines = 
data.readlines() + if len(lines) == 0: + return + if ( + self.config.chunk_size < 0 + or sum(len(i) for i in lines) < self.config.chunk_size + ): + text = "".join(lines) + yield Chunk(text, Point(1, 0), Point(1, len(text) - 1)) + return + text_buffer = "" + start_pos = Point(1, 0) + + def seek(point: Point, count: int): + while point.column + count > len(lines[point.row - 1]): + count -= len(lines[point.row - 1]) - point.column + point.row += 1 + point.column = 0 + return point + + for ln in range(1, len(lines) + 1): + line = lines[ln - 1] + if len(text_buffer + line) > self.config.chunk_size: + consumed = line[: self.config.chunk_size - len(text_buffer)] + yield Chunk( + text_buffer + consumed, start_pos, Point(ln, len(consumed) - 1) + ) + text_buffer = "" + if len(consumed) < len(line): + start_pos = Point(ln, len(consumed)) + else: + start_pos = Point(ln + 1, 0) + else: + text_buffer += line class TreeSitterChunker(ChunkerBase): @@ -81,22 +122,77 @@ def __init__(self, config: Optional[Config] = None): config = Config() super().__init__(config) - def __chunk_node(self, node: Node, text: str) -> Generator[str, None, None]: + def __chunk_node(self, node: Node, text: str) -> Generator[Chunk, None, None]: current_chunk = "" + + current_start = None + for child in node.children: - child_length = child.end_byte - child.start_byte + child_text = text[child.start_byte : child.end_byte] + child_length = len(child_text) + if child_length > self.config.chunk_size: + # Yield current chunk if exists if current_chunk: - yield current_chunk + assert current_start is not None + yield Chunk( + text=current_chunk, + start=current_start, + end=Point( + row=current_start.row + current_chunk.count("\n"), + column=len(current_chunk.split("\n")[-1]) - 1 + if "\n" in current_chunk + else current_start.column + len(current_chunk) - 1, + ), + ) current_chunk = "" + current_start = None + + # Recursively chunk the large child node yield from self.__chunk_node(child, text) - elif 
len(current_chunk) + child_length > self.config.chunk_size: - yield current_chunk - current_chunk = text[child.start_byte : child.end_byte] + + elif not current_chunk: + # Start new chunk + current_chunk = child_text + current_start = Point( + row=child.start_point.row + 1, column=child.start_point.column + ) + + elif len(current_chunk) + child_length <= self.config.chunk_size: + # Add to current chunk + current_chunk += child_text + else: - current_chunk += text[child.start_byte : child.end_byte] + # Yield current chunk and start new one + assert current_start is not None + yield Chunk( + text=current_chunk, + start=current_start, + end=Point( + row=current_start.row + current_chunk.count("\n"), + column=len(current_chunk.split("\n")[-1]) - 1 + if "\n" in current_chunk + else current_start.column + len(current_chunk) - 1, + ), + ) + current_chunk = child_text + current_start = Point( + row=child.start_point.row + 1, column=child.start_point.column + ) + + # Yield remaining chunk if current_chunk: - yield current_chunk + assert current_start is not None + yield Chunk( + text=current_chunk, + start=current_start, + end=Point( + row=current_start.row + current_chunk.count("\n"), + column=len(current_chunk.split("\n")[-1]) - 1 + if "\n" in current_chunk + else current_start.column + len(current_chunk) - 1, + ), + ) @cache def __guess_type(self, path: str, content: str) -> Optional[Lexer]: @@ -119,7 +215,7 @@ def __build_pattern(self, language: str): return f"(?:{'|'.join(patterns)})" return "" - def chunk(self, data: str) -> Generator[str, None, None]: + def chunk(self, data: str) -> Generator[Chunk, None, None]: """ data: path to the file """ @@ -155,7 +251,7 @@ def chunk(self, data: str) -> Generator[str, None, None]: if pattern_str: re_pattern = re.compile(pattern_str) for chunk in chunks_gen: - if re_pattern.match(chunk) is None: + if re_pattern.match(chunk.text) is None: yield chunk else: yield from chunks_gen diff --git a/tests/test_chunking.py 
b/tests/test_chunking.py index 285ac30c..9263e59a 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -12,24 +12,30 @@ from vectorcode.cli_utils import Config -class TestChunking: +class TestStringChunker: file_chunker = FileChunker() def test_string_chunker(self): string_chunker = StringChunker(Config(chunk_size=-1, overlap_ratio=0.5)) - assert list(string_chunker.chunk("hello world")) == ["hello world"] + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ + "hello world" + ] string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0.5)) - assert list(string_chunker.chunk("hello world")) == [ + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ "hello", "llo w", "o wor", "world", ] string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0)) - assert list(string_chunker.chunk("hello world")) == ["hello", " worl", "d"] + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ + "hello", + " worl", + "d", + ] string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0.8)) - assert list(string_chunker.chunk("hello world")) == [ + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ "hello", "ello ", "llo w", @@ -39,31 +45,49 @@ def test_string_chunker(self): "world", ] + +class TestFileChunker: def test_file_chunker(self): - """ - Use StringChunker output as ground truth to test chunking. - """ - file_path = __file__ - ratio = 0.5 - chunk_size = 100 - - with open(file_path) as fin: - string_chunker = StringChunker( - Config(chunk_size=chunk_size, overlap_ratio=ratio) - ) - string_chunks = list(string_chunker.chunk(fin.read())) - - with open(file_path) as fin: - file_chunker = FileChunker( - Config(chunk_size=chunk_size, overlap_ratio=ratio) - ) - file_chunks = list(file_chunker.chunk(fin)) - - assert len(string_chunks) == len(file_chunks), ( - f"Number of chunks do not match. 
{len(string_chunks)} != {len(file_chunks)}" - ) - for string_chunk, file_chunk in zip(string_chunks, file_chunks): - assert string_chunk == file_chunk + test_content = "hello world" + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: + tmp_file.write(test_content) + tmp_file_name = tmp_file.name + + # Test negative chunk size (return whole file) + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=-1, overlap_ratio=0.5)) + assert list(str(i) for i in chunker.chunk(f)) == ["hello world"] + + # Test basic chunking with overlap + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=5, overlap_ratio=0.5)) + assert list(str(i) for i in chunker.chunk(f)) == [ + "hello", + "llo w", + "o wor", + "world", + ] + + # Test no overlap + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=5, overlap_ratio=0)) + assert list(str(i) for i in chunker.chunk(f)) == ["hello", " worl", "d"] + + # Test high overlap ratio + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=5, overlap_ratio=0.8)) + assert list(str(i) for i in chunker.chunk(f)) == [ + "hello", + "ello ", + "llo w", + "lo wo", + "o wor", + " worl", + "world", + ] + + os.remove(tmp_file_name) def test_no_config(): @@ -96,7 +120,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['def foo():\n return "foo"', 'def bar():\n return "bar"'] os.remove(test_file) @@ -118,7 +142,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['def bar():\n return "bar"'] os.remove(test_file) @@ -140,7 +164,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in 
chunker.chunk(test_file)) assert chunks == [] os.remove(test_file) @@ -160,7 +184,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['def bar():\n return "bar"'] os.remove(test_file) @@ -178,7 +202,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['functionbar()return "bar"end'] os.remove(test_file) @@ -199,7 +223,7 @@ def test_treesitter_chunker_lua(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['functionfoo()return "foo"end', 'functionbar()return "bar"end'] os.remove(test_file) @@ -221,7 +245,7 @@ def add_numbers(a, b) tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert len(chunks) > 0 os.remove(test_file) @@ -243,7 +267,7 @@ def add_numbers(a, b) tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert len(chunks) == 1 os.remove(test_file) @@ -268,8 +292,8 @@ def test_treesitter_chunker_fallback(): tmp_file.write(test_content) test_file = tmp_file.name - tree_sitter_chunks = list(tree_sitter_chunker.chunk(test_file)) - string_chunks = list(string_chunker.chunk(test_content)) + tree_sitter_chunks = list(str(i) for i in tree_sitter_chunker.chunk(test_file)) + string_chunks = list(str(i) for i in string_chunker.chunk(test_content)) assert tree_sitter_chunks == string_chunks From a2f8240b1bfd2024aa953a355f70d0e913997007 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 28 Mar 2025 13:33:39 +0000 Subject: [PATCH 02/11] feat(cli): implemented line ranges and 
fixed broken tests. --- src/vectorcode/chunking.py | 62 ++++++++++++-------- src/vectorcode/subcommands/query/__init__.py | 2 +- src/vectorcode/subcommands/vectorise.py | 17 ++++-- tests/test_chunking.py | 4 +- 4 files changed, 54 insertions(+), 31 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index afc6ffc4..dfe2d9e6 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -90,30 +90,44 @@ def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]: text = "".join(lines) yield Chunk(text, Point(1, 0), Point(1, len(text) - 1)) return - text_buffer = "" - start_pos = Point(1, 0) - - def seek(point: Point, count: int): - while point.column + count > len(lines[point.row - 1]): - count -= len(lines[point.row - 1]) - point.column - point.row += 1 - point.column = 0 - return point - - for ln in range(1, len(lines) + 1): - line = lines[ln - 1] - if len(text_buffer + line) > self.config.chunk_size: - consumed = line[: self.config.chunk_size - len(text_buffer)] - yield Chunk( - text_buffer + consumed, start_pos, Point(ln, len(consumed) - 1) - ) - text_buffer = "" - if len(consumed) < len(line): - start_pos = Point(ln, len(consumed)) - else: - start_pos = Point(ln + 1, 0) - else: - text_buffer += line + + text = "".join(lines) + step_size = max( + 1, int(self.config.chunk_size * (1 - self.config.overlap_ratio)) + ) + + # Convert lines to absolute positions + line_offsets = [0] + for line in lines: + line_offsets.append(line_offsets[-1] + len(line)) + + i = 0 + while i < len(text): + chunk_text = text[i : i + self.config.chunk_size] + + # Find start position + start_line = ( + next(ln for ln, offset in enumerate(line_offsets) if offset > i) - 1 + ) + start_col = i - line_offsets[start_line] + + # Find end position + end_pos = i + len(chunk_text) + end_line = ( + next(ln for ln, offset in enumerate(line_offsets) if offset >= end_pos) + - 1 + ) + end_col = end_pos - line_offsets[end_line] - 1 + + yield Chunk( + 
chunk_text, + Point(start_line + 1, start_col), + Point(end_line + 1, end_col), + ) + + if i + self.config.chunk_size >= len(text): + break + i += step_size class TreeSitterChunker(ChunkerBase): diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 4c626f7a..e6223ef2 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -22,7 +22,7 @@ async def get_query_result_files( if configs.query: chunker = StringChunker(configs) for q in configs.query: - query_chunks.extend(chunker.chunk(q)) + query_chunks.extend(str(i) for i in chunker.chunk(q)) configs.query_exclude = [ expand_path(i, True) diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index cff789c4..ecedc65d 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -13,7 +13,7 @@ from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum -from vectorcode.chunking import TreeSitterChunker +from vectorcode.chunking import Chunk, TreeSitterChunker from vectorcode.cli_utils import Config, expand_globs, expand_path from vectorcode.common import get_client, get_collection, verify_ef @@ -54,18 +54,27 @@ async def chunked_add( try: async with semaphore: - chunks = list(TreeSitterChunker(configs).chunk(full_path_str)) + chunks: list[Chunk | str] = list( + TreeSitterChunker(configs).chunk(full_path_str) + ) if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""): # empty file return chunks.append(str(os.path.relpath(full_path_str, configs.project_root))) + metas = [] + for chunk in chunks: + meta = {"path": full_path_str} + if isinstance(chunk, Chunk): + meta["start"] = {"row": chunk.start.row, "col": chunk.start.column} + meta["end"] = {"row": chunk.end.row, "col": chunk.end.column} + metas.append(meta) async with collection_lock: for idx in range(0, len(chunks), 
max_batch_size): inserted_chunks = chunks[idx : idx + max_batch_size] await collection.add( ids=[get_uuid() for _ in inserted_chunks], - documents=inserted_chunks, - metadatas=[{"path": full_path_str} for _ in inserted_chunks], + documents=[str(i) for i in inserted_chunks], + metadatas=metas, ) except UnicodeDecodeError: # probably binary. skip it. diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 9263e59a..40278b69 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -48,10 +48,10 @@ def test_string_chunker(self): class TestFileChunker: def test_file_chunker(self): - test_content = "hello world" + test_content = ["hello ", "world"] with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: - tmp_file.write(test_content) + tmp_file.writelines(test_content) tmp_file_name = tmp_file.name # Test negative chunk size (return whole file) From 3ebb051ab0c6d745a6622a24db796bd307a23e85 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 28 Mar 2025 14:18:13 +0000 Subject: [PATCH 03/11] feat(cli): add `Chunk` support in `chunked_add` --- src/vectorcode/chunking.py | 2 +- src/vectorcode/subcommands/vectorise.py | 4 +- tests/subcommands/test_vectorise.py | 4 +- tests/test_chunking.py | 74 +++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index dfe2d9e6..fdc8f8aa 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -66,7 +66,7 @@ def chunk(self, data: str): yield Chunk( text=chunk_text, start=Point(row=1, column=i), - end=Point(row=1, column=len(chunk_text) - 1), + end=Point(row=1, column=i + len(chunk_text) - 1), ) if i + self.config.chunk_size >= len(data): break diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index ecedc65d..d636edae 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -63,7 +63,7 @@ async def chunked_add( 
chunks.append(str(os.path.relpath(full_path_str, configs.project_root))) metas = [] for chunk in chunks: - meta = {"path": full_path_str} + meta: dict[str, str | dict[str, int]] = {"path": full_path_str} if isinstance(chunk, Chunk): meta["start"] = {"row": chunk.start.row, "col": chunk.start.column} meta["end"] = {"row": chunk.end.row, "col": chunk.end.column} @@ -76,7 +76,7 @@ async def chunked_add( documents=[str(i) for i in inserted_chunks], metadatas=metas, ) - except UnicodeDecodeError: + except UnicodeDecodeError: # pragma: nocover # probably binary. skip it. return diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index eee353d0..bcb47e4b 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -9,7 +9,9 @@ import pathspec import pytest from chromadb.api.models.AsyncCollection import AsyncCollection +from tree_sitter import Point +from vectorcode.chunking import Chunk from vectorcode.cli_utils import Config from vectorcode.subcommands.vectorise import ( chunked_add, @@ -47,7 +49,7 @@ async def test_chunked_add(): semaphore = asyncio.Semaphore(1) with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk: - mock_chunk.return_value = ["chunk1", "chunk2"] + mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"] await chunked_add( file_path, collection, diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 40278b69..ab7836b5 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -2,8 +2,10 @@ import tempfile import pytest +from tree_sitter import Point from vectorcode.chunking import ( + Chunk, ChunkerBase, FileChunker, StringChunker, @@ -27,12 +29,18 @@ def test_string_chunker(self): "o wor", "world", ] + assert list(string_chunker.chunk("hello world"))[0] == Chunk( + "hello", Point(1, 0), Point(1, 4) + ) + string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0)) assert list(str(i) for i in 
string_chunker.chunk("hello world")) == [ "hello", " worl", "d", ] + chunks = list(string_chunker.chunk("hello world")) + assert chunks[1] == Chunk(" worl", Point(1, 5), Point(1, 9)) string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0.8)) assert list(str(i) for i in string_chunker.chunk("hello world")) == [ @@ -89,6 +97,32 @@ def test_file_chunker(self): os.remove(tmp_file_name) + def test_file_chunker_positions(self): + test_content = ["first line\n", "second line\n", "third line"] + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: + tmp_file.writelines(test_content) + tmp_file_name = tmp_file.name + + # Test chunk positions + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=10, overlap_ratio=0)) + chunks = list(chunker.chunk(f)) + + assert chunks[0].text == "first line" + assert chunks[0].start == Point(1, 0) + assert chunks[0].end == Point(1, 9) + + assert chunks[1].text == "\nsecond li" + assert chunks[1].start == Point(1, 10) + assert chunks[1].end == Point(2, 8) + + assert chunks[2].text == "ne\nthird l" + assert chunks[2].start == Point(2, 9) + assert chunks[2].end == Point(3, 6) + + os.remove(tmp_file_name) + def test_no_config(): assert StringChunker().config == Config() @@ -298,3 +332,43 @@ def test_treesitter_chunker_fallback(): assert tree_sitter_chunks == string_chunks os.remove(test_file) + + +def test_treesitter_chunker_positions(): + """Test that TreeSitterChunker produces correct start/end positions for chunks.""" + chunker = TreeSitterChunker(Config(chunk_size=15)) + + test_content = """\ +def foo(): + return 1 + \\ + 2 + +@decorator +def bar(): + return "bar" +""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".py") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + chunks = list(chunker.chunk(test_file)) + + # Verify chunks and their positions + assert len(chunks) >= 2 # Should have at least 2 chunks + + # First chunk should contain 
the function definition start + assert "deffoo():" in chunks[0].text + assert chunks[0].start == Point(1, 0) + + # Last chunk should contain the final return statement + assert 'return "bar"' in chunks[-1].text + assert chunks[-1].end.row == 7 + assert chunks[-1].end.column in (14, 15) # Allow 1-column difference + + # Verify positions are contiguous + for i in range(len(chunks) - 1): + assert chunks[i].end.row <= chunks[i + 1].start.row + if chunks[i].end.row == chunks[i + 1].start.row: + assert chunks[i].end.column <= chunks[i + 1].start.column + + os.remove(test_file) From 778423a64de00e5293dd968b67b05f75a57333ab Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 28 Mar 2025 15:24:04 +0000 Subject: [PATCH 04/11] feat(cli): make `CrossEncoderReranker` work with `chunk` --- src/vectorcode/cli_utils.py | 1 + src/vectorcode/subcommands/query/__init__.py | 33 ++++++++++++++------ src/vectorcode/subcommands/query/reranker.py | 13 +++++--- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index e22bd016..bd660962 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -23,6 +23,7 @@ class QueryInclude(StrEnum): path = "path" document = "document" + chunk = "chunk" def to_header(self) -> str: """ diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index e6223ef2..9baa9cbd 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -7,7 +7,7 @@ from chromadb.errors import InvalidCollectionException, InvalidDimensionException from vectorcode.chunking import StringChunker -from vectorcode.cli_utils import Config, expand_globs, expand_path +from vectorcode.cli_utils import Config, QueryInclude, expand_globs, expand_path from vectorcode.common import ( get_client, get_collection, @@ -33,12 +33,14 @@ async def get_query_result_files( print("Empty collection!", file=sys.stderr) 
return [] try: - num_query = await collection.count() - if configs.query_multiplier > 0: - num_query = min( - int(configs.n_result * configs.query_multiplier), - await collection.count(), - ) + num_query = configs.n_result + if QueryInclude.chunk not in configs.include: + num_query = await collection.count() + if configs.query_multiplier > 0: + num_query = min( + int(configs.n_result * configs.query_multiplier), + await collection.count(), + ) if len(configs.query_exclude): filtered_files = {"path": {"$nin": configs.query_exclude}} else: @@ -71,6 +73,15 @@ async def get_query_result_files( async def query(configs: Config) -> int: + if ( + QueryInclude.chunk in configs.include + and QueryInclude.document in configs.include + ): + print( + "Having both chunk and document in the output is not supported!", + file=sys.stderr, + ) + return 1 client = await get_client(configs) try: collection = await get_collection(client, configs, False) @@ -101,14 +112,16 @@ async def query(configs: Config) -> int: for path in await get_query_result_files(collection, configs): if os.path.isfile(path): - with open(path) as fin: - document = fin.read() if configs.use_absolute_path: output_path = os.path.abspath(path) else: output_path = os.path.relpath(path, configs.project_root) + full_result = {"path": output_path} + if QueryInclude.document in configs.include: + with open(path) as fin: + document = fin.read() + full_result["document"] = document - full_result = {"path": output_path, "document": document} structured_result.append( {str(key): full_result[str(key)] for key in configs.include} ) diff --git a/src/vectorcode/subcommands/query/reranker.py b/src/vectorcode/subcommands/query/reranker.py index 9c2837c7..fcb3564b 100644 --- a/src/vectorcode/subcommands/query/reranker.py +++ b/src/vectorcode/subcommands/query/reranker.py @@ -6,11 +6,12 @@ import numpy from chromadb.api.types import QueryResult -from vectorcode.cli_utils import Config +from vectorcode.cli_utils import Config, 
QueryInclude class RerankerBase: def __init__(self, configs: Config, **kwargs: Any): + self.configs = configs self.n_result = configs.n_result @abstractmethod @@ -62,15 +63,19 @@ def rerank(self, results: QueryResult) -> list[str]: assert results["documents"] is not None documents: DefaultDict[str, list[float]] = defaultdict(list) for query_chunk_idx in range(len(self.query_chunks)): + chunk_ids = results["ids"][query_chunk_idx] chunk_metas = results["metadatas"][query_chunk_idx] chunk_docs = results["documents"][query_chunk_idx] ranks = self.model.rank( self.query_chunks[query_chunk_idx], chunk_docs, apply_softmax=True ) for rank in ranks: - documents[chunk_metas[rank["corpus_id"]]["path"]].append( - float(rank["score"]) - ) + if QueryInclude.chunk in self.configs.include: + documents[chunk_ids[rank["corpus_id"]]].append(float(rank["score"])) + else: + documents[chunk_metas[rank["corpus_id"]]["path"]].append( + float(rank["score"]) + ) top_k = int(numpy.mean(tuple(len(i) for i in documents.values()))) for key in documents.keys(): From d34e06c5fe88ed406d115951d1060fbea31b7241 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Sun, 30 Mar 2025 18:54:06 +0100 Subject: [PATCH 05/11] feat(cli): finished CLI mode (mostly). 
--- src/vectorcode/chunking.py | 9 +- src/vectorcode/lsp_main.py | 19 +--- src/vectorcode/subcommands/chunks.py | 2 +- src/vectorcode/subcommands/query/__init__.py | 94 ++++++++++++++------ src/vectorcode/subcommands/vectorise.py | 5 +- tests/subcommands/query/test_query.py | 12 ++- tests/test_lsp.py | 9 +- 7 files changed, 97 insertions(+), 53 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index fdc8f8aa..65327bf9 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -235,10 +235,11 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]: """ assert os.path.isfile(data) with open(data) as fin: - content = fin.read() - if self.config.chunk_size < 0: - yield content - return + lines = fin.readlines() + content = "".join(lines) + if self.config.chunk_size < 0 and content: + yield Chunk(content, Point(1, 0), Point(len(lines), len(lines[-1]) - 1)) + return parser = None language = None lexer = self.__guess_type(data, content) diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py index 3ad30c4b..465db61f 100644 --- a/src/vectorcode/lsp_main.py +++ b/src/vectorcode/lsp_main.py @@ -24,7 +24,7 @@ ) from vectorcode.common import get_client, get_collection, try_server from vectorcode.subcommands.ls import get_collection_list -from vectorcode.subcommands.query import get_query_result_files +from vectorcode.subcommands.query import build_query_results cached_project_configs: dict[str, Config] = {} DEFAULT_PROJECT_ROOT: str | None = None @@ -108,20 +108,9 @@ async def execute_command(ls: LanguageServer, args: list[str]): ) final_results = [] try: - for path in await get_query_result_files( - collection=collection, - configs=final_configs, - ): - if os.path.isfile(path): - with open(path) as fin: - output_path = path - if not final_configs.use_absolute_path: - output_path = os.path.relpath( - path, final_configs.project_root - ) - final_results.append( - {"path": output_path, "document": fin.read()} - 
) + final_results.extend( + await build_query_results(collection, final_configs) + ) finally: ls.progress.end( progress_token, diff --git a/src/vectorcode/subcommands/chunks.py b/src/vectorcode/subcommands/chunks.py index e01a90a2..2ee0e0c4 100644 --- a/src/vectorcode/subcommands/chunks.py +++ b/src/vectorcode/subcommands/chunks.py @@ -9,5 +9,5 @@ async def chunks(configs: Config) -> int: result = [] for file_path in configs.files: result.append(list(chunker.chunk(str(file_path)))) - print(json.dumps(result)) + print(json.dumps(str(result))) return 0 diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 9baa9cbd..e775aec8 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -2,6 +2,7 @@ import os import sys +from chromadb import GetResult from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException @@ -33,18 +34,20 @@ async def get_query_result_files( print("Empty collection!", file=sys.stderr) return [] try: + if len(configs.query_exclude): + filtered_files: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} + else: + filtered_files = {} num_query = configs.n_result - if QueryInclude.chunk not in configs.include: + if QueryInclude.chunk in configs.include: + filtered_files["start"] = {"$gte": 0} + else: num_query = await collection.count() if configs.query_multiplier > 0: num_query = min( int(configs.n_result * configs.query_multiplier), await collection.count(), ) - if len(configs.query_exclude): - filtered_files = {"path": {"$nin": configs.query_exclude}} - else: - filtered_files = None results = await collection.query( query_texts=query_chunks, n_results=num_query, @@ -72,6 +75,64 @@ async def get_query_result_files( return aggregated_results +async def build_query_results( + collection: AsyncCollection, configs: 
Config +) -> list[dict[str, str | int]]: + structured_result = [] + for identifier in await get_query_result_files(collection, configs): + if os.path.isfile(identifier): + if configs.use_absolute_path: + output_path = os.path.abspath(identifier) + else: + output_path = os.path.relpath(identifier, configs.project_root) + full_result = {"path": output_path} + with open(identifier) as fin: + document = fin.read() + full_result["document"] = document + + structured_result.append( + {str(key): full_result[str(key)] for key in configs.include} + ) + elif QueryInclude.chunk in configs.include: + chunk: GetResult = await collection.get( + identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents] + ) + meta = chunk.get( + "metadatas", + ) + if meta is not None and len(meta) != 0: + full_result: dict[str, str | int] = { + "chunk": str(chunk.get("documents", [""])[0]) + } + if meta[0].get("start") is not None and meta[0].get("end") is not None: + path = str(meta[0].get("path")) + with open(path) as fin: + start: int = meta[0]["start"] + end: int = meta[0]["end"] + full_result["chunk"] = "".join(fin.readlines()[start : end + 1]) + full_result["start_line"] = start + full_result["end_line"] = end + full_result["path"] = str( + meta[0]["path"] + if configs.use_absolute_path + else os.path.relpath(meta[0]["path"], str(configs.project_root)) + ) + + structured_result.append(full_result) + else: + print( + "This collection doesn't support chunk-mode output because it lacks the necessary metadata. Please re-vectorise it.", + file=sys.stderr, + ) + + else: + print( + f"{identifier} is no longer a valid file! 
Please re-run vectorcode vectorise to refresh the database.", + file=sys.stderr, + ) + return structured_result + + async def query(configs: Config) -> int: if ( QueryInclude.chunk in configs.include @@ -108,28 +169,7 @@ async def query(configs: Config) -> int: if not configs.pipe: print("Starting querying...") - structured_result = [] - - for path in await get_query_result_files(collection, configs): - if os.path.isfile(path): - if configs.use_absolute_path: - output_path = os.path.abspath(path) - else: - output_path = os.path.relpath(path, configs.project_root) - full_result = {"path": output_path} - if QueryInclude.document in configs.include: - with open(path) as fin: - document = fin.read() - full_result["document"] = document - - structured_result.append( - {str(key): full_result[str(key)] for key in configs.include} - ) - else: - print( - f"{path} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.", - file=sys.stderr, - ) + structured_result = await build_query_results(collection, configs) if configs.pipe: print(json.dumps(structured_result)) diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index d636edae..05061007 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -65,8 +65,9 @@ async def chunked_add( for chunk in chunks: meta: dict[str, str | dict[str, int]] = {"path": full_path_str} if isinstance(chunk, Chunk): - meta["start"] = {"row": chunk.start.row, "col": chunk.start.column} - meta["end"] = {"row": chunk.end.row, "col": chunk.end.column} + meta["start"] = chunk.start.row + meta["end"] = chunk.end.row + metas.append(meta) async with collection_lock: for idx in range(0, len(chunks), max_batch_size): diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 080dce42..9d54d623 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -5,7 +5,7 @@ 
from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException -from vectorcode.cli_utils import Config, QueryInclude +from vectorcode.cli_utils import CliAction, Config, QueryInclude from vectorcode.subcommands.query import get_query_result_files, query @@ -71,7 +71,7 @@ async def test_get_query_result_files(mock_collection, mock_config): assert IncludeEnum.metadatas in kwargs["include"] assert IncludeEnum.distances in kwargs["include"] assert IncludeEnum.documents in kwargs["include"] - assert kwargs["where"] is None # Since query_exclude is empty + assert not kwargs["where"] # Since query_exclude is empty # Check reranker was used correctly MockReranker.assert_called_once_with(mock_config) @@ -444,3 +444,11 @@ async def test_query_invalid_ef(mock_config): # Verify the function returns error code assert result == 1 + + +@pytest.mark.asyncio +async def test_query_invalid_include(): + faulty_config = Config( + action=CliAction.query, include=[QueryInclude.chunk, QueryInclude.document] + ) + assert await query(faulty_config) != 0 diff --git a/tests/test_lsp.py b/tests/test_lsp.py index 8b6ddd26..f315c4b1 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -4,7 +4,7 @@ from pygls.server import LanguageServer from vectorcode import __version__ -from vectorcode.cli_utils import CliAction, Config +from vectorcode.cli_utils import CliAction, Config, QueryInclude from vectorcode.lsp_main import ( execute_command, lsp_start, @@ -23,13 +23,18 @@ def mock_language_server(): @pytest.fixture def mock_config(): - config = MagicMock(spec=Config) + # config = MagicMock(spec=Config) + config = Config() config.host = "localhost" config.port = 8000 config.action = CliAction.query config.project_root = "/test/project" config.use_absolute_path = True config.pipe = False + config.overlap_ratio = 0.2 + config.query_exclude = [] + config.include = [QueryInclude.path] + config.query_multipler = 10 return config From 
d13d5412b7e8552b4155e1c77ef6ba9aeecdbe7b Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 09:57:12 +0100 Subject: [PATCH 06/11] test(cli): fix broken test --- tests/test_lsp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lsp.py b/tests/test_lsp.py index f315c4b1..823a0f75 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -88,7 +88,7 @@ async def test_execute_command_query(mock_language_server, mock_config): patch("vectorcode.lsp_main.get_client", new_callable=AsyncMock), patch("vectorcode.lsp_main.get_collection", new_callable=AsyncMock), patch( - "vectorcode.lsp_main.get_query_result_files", new_callable=AsyncMock + "vectorcode.lsp_main.build_query_results", new_callable=AsyncMock ) as mock_get_query_result_files, patch("os.path.isfile", return_value=True), patch("vectorcode.lsp_main.try_server", return_value=True), From e88214032d8bdb62bb8519abf2eca7d0ee72573a Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 12:59:19 +0100 Subject: [PATCH 07/11] feat(cli): implement chunk mode support for NaiveReranker --- src/vectorcode/subcommands/query/reranker.py | 10 +++- tests/subcommands/query/test_reranker.py | 58 +++++++++++++++++++- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/subcommands/query/reranker.py b/src/vectorcode/subcommands/query/reranker.py index fcb3564b..335ff3f2 100644 --- a/src/vectorcode/subcommands/query/reranker.py +++ b/src/vectorcode/subcommands/query/reranker.py @@ -28,16 +28,20 @@ def rerank(self, results: QueryResult) -> list[str]: assert results["distances"] is not None documents: DefaultDict[str, list[float]] = defaultdict(list) for query_chunk_idx in range(len(results["ids"])): + chunk_ids = results["ids"][query_chunk_idx] chunk_metas = results["metadatas"][query_chunk_idx] chunk_distances = results["distances"][query_chunk_idx] # NOTE: distances, smaller is better. 
paths = [str(meta["path"]) for meta in chunk_metas] assert len(paths) == len(chunk_distances) - for distance, path in zip(chunk_distances, paths): - if path is None: + for distance, identifier in zip( + chunk_distances, + chunk_ids if QueryInclude.chunk in self.configs.include else paths, + ): + if identifier is None: # so that vectorcode doesn't break on old collections. continue - documents[path].append(distance) + documents[identifier].append(distance) top_k = int(numpy.mean(tuple(len(i) for i in documents.values()))) for key in documents.keys(): diff --git a/tests/subcommands/query/test_reranker.py b/tests/subcommands/query/test_reranker.py index 49dbfb1d..f715527a 100644 --- a/tests/subcommands/query/test_reranker.py +++ b/tests/subcommands/query/test_reranker.py @@ -2,7 +2,7 @@ import pytest -from vectorcode.cli_utils import Config +from vectorcode.cli_utils import Config, QueryInclude from vectorcode.subcommands.query.reranker import ( CrossEncoderReranker, NaiveReranker, @@ -153,3 +153,59 @@ def test_naive_reranker_document_selection_logic(config): assert len(result) > 0 # Common files should be present assert "file2.py" in result or "file3.py" in result + + +def test_naive_reranker_with_chunk_ids(config): + """Test NaiveReranker returns chunk IDs when QueryInclude.chunk is set""" + config.include.append( + QueryInclude.chunk + ) # Assuming QueryInclude.chunk would be "chunk" + query_result = { + "ids": [["id1", "id2"], ["id3", "id1"]], + "distances": [[0.1, 0.2], [0.3, 0.4]], + "metadatas": [ + [{"path": "file1.py"}, {"path": "file2.py"}], + [{"path": "file3.py"}, {"path": "file1.py"}], + ], + } + reranker = NaiveReranker(config) + result = reranker.rerank(query_result) + + assert isinstance(result, list) + assert len(result) <= config.n_result + assert all(isinstance(id, str) for id in result) + assert all(id.startswith("id") for id in result) # Verify IDs not paths + + +@patch("sentence_transformers.CrossEncoder") +def 
test_cross_encoder_reranker_with_chunk_ids( + mock_cross_encoder, config, query_chunks +): + """Test CrossEncoderReranker returns chunk IDs when QueryInclude.chunk is set""" + mock_model = MagicMock() + mock_cross_encoder.return_value = mock_model + mock_model.rank.return_value = [ + {"corpus_id": 0, "score": 0.9}, + {"corpus_id": 1, "score": 0.7}, + ] + + config.include = {"chunk"} # Use comma instead of append + reranker = CrossEncoderReranker( + config, query_chunks, "cross-encoder/ms-marco-MiniLM-L-6-v2" + ) + + # Match query_chunks length with results + result = reranker.rerank( + { + "ids": [["id1", "id2"], ["id3", "id4"]], # Two query chunks + "metadatas": [ + [{"path": "file1.py"}, {"path": "file2.py"}], + [{"path": "file3.py"}, {"path": "file4.py"}], + ], + "documents": [["doc1", "doc2"], ["doc3", "doc4"]], + } + ) + + assert isinstance(result, list) + assert all(isinstance(id, str) for id in result) + assert all(id in ["id1", "id2", "id3", "id4"] for id in result) From 47c28baf0ceb3014cdff6eff328c51a7862888a1 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:16:02 +0100 Subject: [PATCH 08/11] test(cli): test for `build_query_results` --- tests/subcommands/query/test_query.py | 98 +++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 4 deletions(-) diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 9d54d623..170bbad5 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -1,12 +1,17 @@ -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, mock_open, patch import pytest +from chromadb import GetResult from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException from vectorcode.cli_utils import CliAction, Config, QueryInclude -from vectorcode.subcommands.query import 
get_query_result_files, query +from vectorcode.subcommands.query import ( + build_query_results, + get_query_result_files, + query, +) @pytest.fixture @@ -17,8 +22,16 @@ def mock_collection(): "ids": [["id1", "id2", "id3"], ["id4", "id5", "id6"]], "distances": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], "metadatas": [ - [{"path": "file1.py"}, {"path": "file2.py"}, {"path": "file3.py"}], - [{"path": "file2.py"}, {"path": "file4.py"}, {"path": "file3.py"}], + [ + {"path": "file1.py", "start": 1, "end": 1}, + {"path": "file2.py", "start": 1, "end": 1}, + {"path": "file3.py", "start": 1, "end": 1}, + ], + [ + {"path": "file2.py", "start": 1, "end": 1}, + {"path": "file4.py", "start": 1, "end": 1}, + {"path": "file3.py", "start": 1, "end": 1}, + ], ], "documents": [ ["content1", "content2", "content3"], @@ -83,6 +96,83 @@ async def test_get_query_result_files(mock_collection, mock_config): assert result == ["file1.py", "file2.py", "file3.py"] +@pytest.mark.asyncio +async def test_get_query_result_files_include_chunk(mock_collection, mock_config): + """Test get_query_result_files when QueryInclude.chunk is included.""" + mock_config.include = [QueryInclude.chunk] # Include chunk + + with patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker: + mock_reranker_instance = MagicMock() + mock_reranker_instance.rerank.return_value = ["chunk1"] + MockReranker.return_value = mock_reranker_instance + + await get_query_result_files(mock_collection, mock_config) + + # Check query call includes where clause for chunks + mock_collection.query.assert_called_once() + _, kwargs = mock_collection.query.call_args + # Line 43: Check the 'if' condition branch + assert kwargs["where"] == {"start": {"$gte": 0}} + assert kwargs["n_results"] == 3 # n_result should be used directly + + +@pytest.mark.asyncio +async def test_build_query_results_chunk_mode_success(mock_collection, mock_config): + """Test build_query_results in chunk mode successfully retrieves chunk details.""" + 
mock_config.include = [QueryInclude.chunk, QueryInclude.path] + mock_config.project_root = "/test/project" + mock_config.use_absolute_path = False + identifier = "chunk_id_1" + file_path = "/test/project/subdir/file1.py" + relative_path = "subdir/file1.py" + start_line = 5 + end_line = 10 + + full_file_content_lines = [f"line {i}\n" for i in range(15)] + full_file_content = "".join(full_file_content_lines) + + expected_chunk_content = "".join(full_file_content_lines[start_line : end_line + 1]) + + mock_get_result = GetResult( + ids=[identifier], + embeddings=None, + documents=["original chunk doc in db"], + metadatas=[{"path": file_path, "start": start_line, "end": end_line}], + ) + + with ( + patch( + "vectorcode.subcommands.query.get_query_result_files", + return_value=[identifier], + ), + patch("os.path.isfile", return_value=False), + patch("builtins.open", mock_open(read_data=full_file_content)) as mocked_open, + patch("os.path.relpath", return_value=relative_path) as mock_relpath, + ): + mock_collection.get = AsyncMock(return_value=mock_get_result) + + results = await build_query_results(mock_collection, mock_config) + + mock_collection.get.assert_called_once_with( + identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents] + ) + + mocked_open.assert_called_once_with(file_path) + + mock_relpath.assert_called_once_with(file_path, str(mock_config.project_root)) + + assert len(results) == 1 + + expected_full_result = { + "path": relative_path, + "chunk": expected_chunk_content, + "start_line": start_line, + "end_line": end_line, + } + + assert results[0] == expected_full_result + + @pytest.mark.asyncio async def test_get_query_result_files_with_query_exclude(mock_collection, mock_config): # Setup query_exclude From 4673d0852a55530b928e6bd49a059bbd34c2c2db Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:39:08 +0100 Subject: [PATCH 09/11] feat(cli): make sure it fallback to `document` mode when range metadata is not available --- 
src/vectorcode/subcommands/query/__init__.py | 18 +++++++-- tests/subcommands/query/test_query.py | 39 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index e775aec8..018c7dd0 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -35,12 +35,12 @@ async def get_query_result_files( return [] try: if len(configs.query_exclude): - filtered_files: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} + filter: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} else: - filtered_files = {} + filter = {} num_query = configs.n_result if QueryInclude.chunk in configs.include: - filtered_files["start"] = {"$gte": 0} + filter["start"] = {"$gte": 0} else: num_query = await collection.count() if configs.query_multiplier > 0: @@ -56,7 +56,7 @@ async def get_query_result_files( IncludeEnum.distances, IncludeEnum.documents, ], - where=filtered_files, + where=filter or None, ) except IndexError: # no results found @@ -169,6 +169,16 @@ async def query(configs: Config) -> int: if not configs.pipe: print("Starting querying...") + if QueryInclude.chunk in configs.include: + if len((await collection.get(where={"start": {"$gte": 0}}))["ids"]) == 0: + print( + """ +This collection doesn't contain line range metadata. Falling back to `--include path document`. 
+Please re-vectorise it to use `--include chunk`.""", + file=sys.stderr, + ) + configs.include = [QueryInclude.path, QueryInclude.document] + structured_result = await build_query_results(collection, configs) if configs.pipe: diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 170bbad5..a98d63ab 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -542,3 +542,42 @@ async def test_query_invalid_include(): action=CliAction.query, include=[QueryInclude.chunk, QueryInclude.document] ) assert await query(faulty_config) != 0 + + +@pytest.mark.asyncio +async def test_query_chunk_mode_no_metadata_fallback(mock_config): + mock_config.include = [QueryInclude.chunk, QueryInclude.path] + mock_client = AsyncMock() + mock_collection = AsyncMock() + + # Mock collection.get to return no IDs for the metadata check + mock_collection.get.return_value = {"ids": []} + + with ( + patch("vectorcode.subcommands.query.get_client", return_value=mock_client), + patch( + "vectorcode.subcommands.query.get_collection", return_value=mock_collection + ), + patch("vectorcode.subcommands.query.verify_ef", return_value=True), + patch("vectorcode.subcommands.query.build_query_results") as mock_build_results, + patch("sys.stderr") as mock_stderr, + ): + mock_build_results.return_value = [] # Return empty results for simplicity + + result = await query(mock_config) + + assert result == 0 + + # Verify the metadata check call + mock_collection.get.assert_called_once_with(where={"start": {"$gte": 0}}) + + # Verify the warning was printed + assert mock_stderr.write.call_count > 0 + call_args, _ = mock_stderr.write.call_args_list[0] + assert "Falling back to `--include path document`" in call_args[0] + + # Verify build_query_results was called with the *modified* config + mock_build_results.assert_called_once() + args, _ = mock_build_results.call_args + _, called_config = args + assert called_config.include == 
[QueryInclude.path, QueryInclude.document] From 8b7799fdb4a28de181dc131fe822c1b0b9ccb0c1 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:45:37 +0100 Subject: [PATCH 10/11] fix(cli): do not output `path` when not specified from the arguments --- src/vectorcode/subcommands/query/__init__.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 018c7dd0..4d23306b 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -112,11 +112,14 @@ async def build_query_results( full_result["chunk"] = "".join(fin.readlines()[start : end + 1]) full_result["start_line"] = start full_result["end_line"] = end - full_result["path"] = str( - meta[0]["path"] - if configs.use_absolute_path - else os.path.relpath(meta[0]["path"], str(configs.project_root)) - ) + if QueryInclude.path in configs.include: + full_result["path"] = str( + meta[0]["path"] + if configs.use_absolute_path + else os.path.relpath( + meta[0]["path"], str(configs.project_root) + ) + ) structured_result.append(full_result) else: From 3d0b6db47c53cacbad3eab651101135bc48067ee Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:56:47 +0100 Subject: [PATCH 11/11] docs(cli): documentation about `--include chunk` --- docs/cli.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/cli.md b/docs/cli.md index f5eec174..15528586 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -331,6 +331,16 @@ vectorcode query foo bar --include path This will only include the `path` in the output. This is effective for both normal CLI usage and [`--pipe` mode](#for-developers). +For some applications, it may be overkill to use the full document as context +and all you need is the chunks. You can do this by using `--include chunk` or +`--include chunk path` in the command. 
This will return chunks from the
+document, and in `pipe` mode the objects will also include the line numbers of
+the first and last lines in the chunk. Note that `chunk` and `document` cannot be used at
+the same time, and the number of query results (the `-n` parameter) will refer to
+the number of retrieved chunks when you use `--include chunk`. For the sake of
+completeness, the first and last lines of a chunk will be extended to include
+the whole lines if the chunker broke the text from mid-line.
+
 ### Listing All Collections
 
 You can use `vectorcode ls` command to list all collections in your ChromaDB.
@@ -433,6 +443,25 @@ For the query command, here's the format printed in the `pipe` mode:
 Basically an array of dictionaries with 2 keys: `"path"` for the path to the
 document, and `"document"` for the content of the document.
 
+If you used `--include chunk path` parameters, the array will look like this:
+```json
+[
+  {
+    "path": "path_to_your_code.py",
+    "chunk": "foo",
+    "start_line": 1,
+    "end_line": 1
+  },
+  {
+    "path": "path_to_another_file.py",
+    "chunk": "bar",
+    "start_line": 1,
+    "end_line": 1
+  }
+]
+```
+Keep in mind that both `start_line` and `end_line` are inclusive.
+
 #### `vectorcode vectorise`
 The output is in JSON format. It contains a dictionary with the following fields:
 - `"add"`: number of added documents;