From f8e67780517c47f2aa81969625821775ec5af6b2 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Thu, 27 Mar 2025 15:39:15 +0000 Subject: [PATCH 01/11] feat(cli): try to implement chunking with line-ranges metadata. --- src/vectorcode/chunking.py | 162 +++++++++++++++++++++++++++++-------- tests/test_chunking.py | 102 ++++++++++++++--------- 2 files changed, 192 insertions(+), 72 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index ec0a1346..afc6ffc4 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -1,6 +1,7 @@ import os import re from abc import abstractmethod +from dataclasses import dataclass from functools import cache from io import TextIOWrapper from typing import Generator, Optional @@ -8,12 +9,26 @@ from pygments.lexer import Lexer from pygments.lexers import guess_lexer_for_filename from pygments.util import ClassNotFound -from tree_sitter import Node +from tree_sitter import Node, Point from tree_sitter_language_pack import get_parser from vectorcode.cli_utils import Config +@dataclass +class Chunk: + """ + rows are 1-indexed, cols are 0-indexed. 
+ """ + + text: str + start: Point + end: Point + + def __str__(self): + return self.text + + class ChunkerBase: def __init__(self, config: Optional[Config] = None) -> None: if config is None: @@ -24,7 +39,7 @@ def __init__(self, config: Optional[Config] = None) -> None: self.config = config @abstractmethod - def chunk(self, data) -> Generator[str, None, None]: + def chunk(self, data) -> Generator[Chunk, None, None]: raise NotImplementedError @@ -34,16 +49,25 @@ def __init__(self, config: Optional[Config] = None) -> None: config = Config() super().__init__(config) - def chunk(self, data: str) -> Generator[str, None, None]: + def chunk(self, data: str): if self.config.chunk_size < 0: - yield data + yield Chunk( + text=data, + start=Point(row=1, column=0), + end=Point(row=1, column=len(data)), + ) else: step_size = max( 1, int(self.config.chunk_size * (1 - self.config.overlap_ratio)) ) i = 0 while i < len(data): - yield data[i : i + self.config.chunk_size] + chunk_text = data[i : i + self.config.chunk_size] + yield Chunk( + text=chunk_text, + start=Point(row=1, column=i), + end=Point(row=1, column=len(chunk_text) - 1), + ) if i + self.config.chunk_size >= len(data): break i += step_size @@ -55,24 +79,41 @@ def __init__(self, config: Optional[Config] = None) -> None: config = Config() super().__init__(config) - def chunk(self, data: TextIOWrapper) -> Generator[str, None, None]: - if self.config.chunk_size < 0: - yield "".join(data.readlines()) - else: - step_size = max( - 1, int(self.config.chunk_size * (1 - self.config.overlap_ratio)) - ) - # the output of this method should be identical to that of StringChunker.chunk - output = data.read(self.config.chunk_size) - yield output - if len(output) < self.config.chunk_size: - return - while True: - new_chars = data.read(step_size) - output = output[step_size:] + new_chars - yield output - if len(new_chars) < step_size: - return + def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]: + lines = 
data.readlines() + if len(lines) == 0: + return + if ( + self.config.chunk_size < 0 + or sum(len(i) for i in lines) < self.config.chunk_size + ): + text = "".join(lines) + yield Chunk(text, Point(1, 0), Point(1, len(text) - 1)) + return + text_buffer = "" + start_pos = Point(1, 0) + + def seek(point: Point, count: int): + while point.column + count > len(lines[point.row - 1]): + count -= len(lines[point.row - 1]) - point.column + point.row += 1 + point.column = 0 + return point + + for ln in range(1, len(lines) + 1): + line = lines[ln - 1] + if len(text_buffer + line) > self.config.chunk_size: + consumed = line[: self.config.chunk_size - len(text_buffer)] + yield Chunk( + text_buffer + consumed, start_pos, Point(ln, len(consumed) - 1) + ) + text_buffer = "" + if len(consumed) < len(line): + start_pos = Point(ln, len(consumed)) + else: + start_pos = Point(ln + 1, 0) + else: + text_buffer += line class TreeSitterChunker(ChunkerBase): @@ -81,22 +122,77 @@ def __init__(self, config: Optional[Config] = None): config = Config() super().__init__(config) - def __chunk_node(self, node: Node, text: str) -> Generator[str, None, None]: + def __chunk_node(self, node: Node, text: str) -> Generator[Chunk, None, None]: current_chunk = "" + + current_start = None + for child in node.children: - child_length = child.end_byte - child.start_byte + child_text = text[child.start_byte : child.end_byte] + child_length = len(child_text) + if child_length > self.config.chunk_size: + # Yield current chunk if exists if current_chunk: - yield current_chunk + assert current_start is not None + yield Chunk( + text=current_chunk, + start=current_start, + end=Point( + row=current_start.row + current_chunk.count("\n"), + column=len(current_chunk.split("\n")[-1]) - 1 + if "\n" in current_chunk + else current_start.column + len(current_chunk) - 1, + ), + ) current_chunk = "" + current_start = None + + # Recursively chunk the large child node yield from self.__chunk_node(child, text) - elif 
len(current_chunk) + child_length > self.config.chunk_size: - yield current_chunk - current_chunk = text[child.start_byte : child.end_byte] + + elif not current_chunk: + # Start new chunk + current_chunk = child_text + current_start = Point( + row=child.start_point.row + 1, column=child.start_point.column + ) + + elif len(current_chunk) + child_length <= self.config.chunk_size: + # Add to current chunk + current_chunk += child_text + else: - current_chunk += text[child.start_byte : child.end_byte] + # Yield current chunk and start new one + assert current_start is not None + yield Chunk( + text=current_chunk, + start=current_start, + end=Point( + row=current_start.row + current_chunk.count("\n"), + column=len(current_chunk.split("\n")[-1]) - 1 + if "\n" in current_chunk + else current_start.column + len(current_chunk) - 1, + ), + ) + current_chunk = child_text + current_start = Point( + row=child.start_point.row + 1, column=child.start_point.column + ) + + # Yield remaining chunk if current_chunk: - yield current_chunk + assert current_start is not None + yield Chunk( + text=current_chunk, + start=current_start, + end=Point( + row=current_start.row + current_chunk.count("\n"), + column=len(current_chunk.split("\n")[-1]) - 1 + if "\n" in current_chunk + else current_start.column + len(current_chunk) - 1, + ), + ) @cache def __guess_type(self, path: str, content: str) -> Optional[Lexer]: @@ -119,7 +215,7 @@ def __build_pattern(self, language: str): return f"(?:{'|'.join(patterns)})" return "" - def chunk(self, data: str) -> Generator[str, None, None]: + def chunk(self, data: str) -> Generator[Chunk, None, None]: """ data: path to the file """ @@ -155,7 +251,7 @@ def chunk(self, data: str) -> Generator[str, None, None]: if pattern_str: re_pattern = re.compile(pattern_str) for chunk in chunks_gen: - if re_pattern.match(chunk) is None: + if re_pattern.match(chunk.text) is None: yield chunk else: yield from chunks_gen diff --git a/tests/test_chunking.py 
b/tests/test_chunking.py index 285ac30c..9263e59a 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -12,24 +12,30 @@ from vectorcode.cli_utils import Config -class TestChunking: +class TestStringChunker: file_chunker = FileChunker() def test_string_chunker(self): string_chunker = StringChunker(Config(chunk_size=-1, overlap_ratio=0.5)) - assert list(string_chunker.chunk("hello world")) == ["hello world"] + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ + "hello world" + ] string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0.5)) - assert list(string_chunker.chunk("hello world")) == [ + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ "hello", "llo w", "o wor", "world", ] string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0)) - assert list(string_chunker.chunk("hello world")) == ["hello", " worl", "d"] + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ + "hello", + " worl", + "d", + ] string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0.8)) - assert list(string_chunker.chunk("hello world")) == [ + assert list(str(i) for i in string_chunker.chunk("hello world")) == [ "hello", "ello ", "llo w", @@ -39,31 +45,49 @@ def test_string_chunker(self): "world", ] + +class TestFileChunker: def test_file_chunker(self): - """ - Use StringChunker output as ground truth to test chunking. - """ - file_path = __file__ - ratio = 0.5 - chunk_size = 100 - - with open(file_path) as fin: - string_chunker = StringChunker( - Config(chunk_size=chunk_size, overlap_ratio=ratio) - ) - string_chunks = list(string_chunker.chunk(fin.read())) - - with open(file_path) as fin: - file_chunker = FileChunker( - Config(chunk_size=chunk_size, overlap_ratio=ratio) - ) - file_chunks = list(file_chunker.chunk(fin)) - - assert len(string_chunks) == len(file_chunks), ( - f"Number of chunks do not match. 
{len(string_chunks)} != {len(file_chunks)}" - ) - for string_chunk, file_chunk in zip(string_chunks, file_chunks): - assert string_chunk == file_chunk + test_content = "hello world" + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: + tmp_file.write(test_content) + tmp_file_name = tmp_file.name + + # Test negative chunk size (return whole file) + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=-1, overlap_ratio=0.5)) + assert list(str(i) for i in chunker.chunk(f)) == ["hello world"] + + # Test basic chunking with overlap + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=5, overlap_ratio=0.5)) + assert list(str(i) for i in chunker.chunk(f)) == [ + "hello", + "llo w", + "o wor", + "world", + ] + + # Test no overlap + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=5, overlap_ratio=0)) + assert list(str(i) for i in chunker.chunk(f)) == ["hello", " worl", "d"] + + # Test high overlap ratio + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=5, overlap_ratio=0.8)) + assert list(str(i) for i in chunker.chunk(f)) == [ + "hello", + "ello ", + "llo w", + "lo wo", + "o wor", + " worl", + "world", + ] + + os.remove(tmp_file_name) def test_no_config(): @@ -96,7 +120,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['def foo():\n return "foo"', 'def bar():\n return "bar"'] os.remove(test_file) @@ -118,7 +142,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['def bar():\n return "bar"'] os.remove(test_file) @@ -140,7 +164,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in 
chunker.chunk(test_file)) assert chunks == [] os.remove(test_file) @@ -160,7 +184,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['def bar():\n return "bar"'] os.remove(test_file) @@ -178,7 +202,7 @@ def bar(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['functionbar()return "bar"end'] os.remove(test_file) @@ -199,7 +223,7 @@ def test_treesitter_chunker_lua(): tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert chunks == ['functionfoo()return "foo"end', 'functionbar()return "bar"end'] os.remove(test_file) @@ -221,7 +245,7 @@ def add_numbers(a, b) tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert len(chunks) > 0 os.remove(test_file) @@ -243,7 +267,7 @@ def add_numbers(a, b) tmp_file.write(test_content) test_file = tmp_file.name - chunks = list(chunker.chunk(test_file)) + chunks = list(str(i) for i in chunker.chunk(test_file)) assert len(chunks) == 1 os.remove(test_file) @@ -268,8 +292,8 @@ def test_treesitter_chunker_fallback(): tmp_file.write(test_content) test_file = tmp_file.name - tree_sitter_chunks = list(tree_sitter_chunker.chunk(test_file)) - string_chunks = list(string_chunker.chunk(test_content)) + tree_sitter_chunks = list(str(i) for i in tree_sitter_chunker.chunk(test_file)) + string_chunks = list(str(i) for i in string_chunker.chunk(test_content)) assert tree_sitter_chunks == string_chunks From a2f8240b1bfd2024aa953a355f70d0e913997007 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 28 Mar 2025 13:33:39 +0000 Subject: [PATCH 02/11] feat(cli): implemented line ranges and 
fixed broken tests. --- src/vectorcode/chunking.py | 62 ++++++++++++-------- src/vectorcode/subcommands/query/__init__.py | 2 +- src/vectorcode/subcommands/vectorise.py | 17 ++++-- tests/test_chunking.py | 4 +- 4 files changed, 54 insertions(+), 31 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index afc6ffc4..dfe2d9e6 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -90,30 +90,44 @@ def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]: text = "".join(lines) yield Chunk(text, Point(1, 0), Point(1, len(text) - 1)) return - text_buffer = "" - start_pos = Point(1, 0) - - def seek(point: Point, count: int): - while point.column + count > len(lines[point.row - 1]): - count -= len(lines[point.row - 1]) - point.column - point.row += 1 - point.column = 0 - return point - - for ln in range(1, len(lines) + 1): - line = lines[ln - 1] - if len(text_buffer + line) > self.config.chunk_size: - consumed = line[: self.config.chunk_size - len(text_buffer)] - yield Chunk( - text_buffer + consumed, start_pos, Point(ln, len(consumed) - 1) - ) - text_buffer = "" - if len(consumed) < len(line): - start_pos = Point(ln, len(consumed)) - else: - start_pos = Point(ln + 1, 0) - else: - text_buffer += line + + text = "".join(lines) + step_size = max( + 1, int(self.config.chunk_size * (1 - self.config.overlap_ratio)) + ) + + # Convert lines to absolute positions + line_offsets = [0] + for line in lines: + line_offsets.append(line_offsets[-1] + len(line)) + + i = 0 + while i < len(text): + chunk_text = text[i : i + self.config.chunk_size] + + # Find start position + start_line = ( + next(ln for ln, offset in enumerate(line_offsets) if offset > i) - 1 + ) + start_col = i - line_offsets[start_line] + + # Find end position + end_pos = i + len(chunk_text) + end_line = ( + next(ln for ln, offset in enumerate(line_offsets) if offset >= end_pos) + - 1 + ) + end_col = end_pos - line_offsets[end_line] - 1 + + yield Chunk( + 
chunk_text, + Point(start_line + 1, start_col), + Point(end_line + 1, end_col), + ) + + if i + self.config.chunk_size >= len(text): + break + i += step_size class TreeSitterChunker(ChunkerBase): diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 4c626f7a..e6223ef2 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -22,7 +22,7 @@ async def get_query_result_files( if configs.query: chunker = StringChunker(configs) for q in configs.query: - query_chunks.extend(chunker.chunk(q)) + query_chunks.extend(str(i) for i in chunker.chunk(q)) configs.query_exclude = [ expand_path(i, True) diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index cff789c4..ecedc65d 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -13,7 +13,7 @@ from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum -from vectorcode.chunking import TreeSitterChunker +from vectorcode.chunking import Chunk, TreeSitterChunker from vectorcode.cli_utils import Config, expand_globs, expand_path from vectorcode.common import get_client, get_collection, verify_ef @@ -54,18 +54,27 @@ async def chunked_add( try: async with semaphore: - chunks = list(TreeSitterChunker(configs).chunk(full_path_str)) + chunks: list[Chunk | str] = list( + TreeSitterChunker(configs).chunk(full_path_str) + ) if len(chunks) == 0 or (len(chunks) == 1 and chunks[0] == ""): # empty file return chunks.append(str(os.path.relpath(full_path_str, configs.project_root))) + metas = [] + for chunk in chunks: + meta = {"path": full_path_str} + if isinstance(chunk, Chunk): + meta["start"] = {"row": chunk.start.row, "col": chunk.start.column} + meta["end"] = {"row": chunk.end.row, "col": chunk.end.column} + metas.append(meta) async with collection_lock: for idx in range(0, len(chunks), 
max_batch_size): inserted_chunks = chunks[idx : idx + max_batch_size] await collection.add( ids=[get_uuid() for _ in inserted_chunks], - documents=inserted_chunks, - metadatas=[{"path": full_path_str} for _ in inserted_chunks], + documents=[str(i) for i in inserted_chunks], + metadatas=metas, ) except UnicodeDecodeError: # probably binary. skip it. diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 9263e59a..40278b69 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -48,10 +48,10 @@ def test_string_chunker(self): class TestFileChunker: def test_file_chunker(self): - test_content = "hello world" + test_content = ["hello ", "world"] with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: - tmp_file.write(test_content) + tmp_file.writelines(test_content) tmp_file_name = tmp_file.name # Test negative chunk size (return whole file) From 3ebb051ab0c6d745a6622a24db796bd307a23e85 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 28 Mar 2025 14:18:13 +0000 Subject: [PATCH 03/11] feat(cli): add `Chunk` support in `chunked_add` --- src/vectorcode/chunking.py | 2 +- src/vectorcode/subcommands/vectorise.py | 4 +- tests/subcommands/test_vectorise.py | 4 +- tests/test_chunking.py | 74 +++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index dfe2d9e6..fdc8f8aa 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -66,7 +66,7 @@ def chunk(self, data: str): yield Chunk( text=chunk_text, start=Point(row=1, column=i), - end=Point(row=1, column=len(chunk_text) - 1), + end=Point(row=1, column=i + len(chunk_text) - 1), ) if i + self.config.chunk_size >= len(data): break diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index ecedc65d..d636edae 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -63,7 +63,7 @@ async def chunked_add( 
chunks.append(str(os.path.relpath(full_path_str, configs.project_root))) metas = [] for chunk in chunks: - meta = {"path": full_path_str} + meta: dict[str, str | dict[str, int]] = {"path": full_path_str} if isinstance(chunk, Chunk): meta["start"] = {"row": chunk.start.row, "col": chunk.start.column} meta["end"] = {"row": chunk.end.row, "col": chunk.end.column} @@ -76,7 +76,7 @@ async def chunked_add( documents=[str(i) for i in inserted_chunks], metadatas=metas, ) - except UnicodeDecodeError: + except UnicodeDecodeError: # pragma: nocover # probably binary. skip it. return diff --git a/tests/subcommands/test_vectorise.py b/tests/subcommands/test_vectorise.py index eee353d0..bcb47e4b 100644 --- a/tests/subcommands/test_vectorise.py +++ b/tests/subcommands/test_vectorise.py @@ -9,7 +9,9 @@ import pathspec import pytest from chromadb.api.models.AsyncCollection import AsyncCollection +from tree_sitter import Point +from vectorcode.chunking import Chunk from vectorcode.cli_utils import Config from vectorcode.subcommands.vectorise import ( chunked_add, @@ -47,7 +49,7 @@ async def test_chunked_add(): semaphore = asyncio.Semaphore(1) with patch("vectorcode.chunking.TreeSitterChunker.chunk") as mock_chunk: - mock_chunk.return_value = ["chunk1", "chunk2"] + mock_chunk.return_value = [Chunk("chunk1", Point(1, 0), Point(1, 5)), "chunk2"] await chunked_add( file_path, collection, diff --git a/tests/test_chunking.py b/tests/test_chunking.py index 40278b69..ab7836b5 100644 --- a/tests/test_chunking.py +++ b/tests/test_chunking.py @@ -2,8 +2,10 @@ import tempfile import pytest +from tree_sitter import Point from vectorcode.chunking import ( + Chunk, ChunkerBase, FileChunker, StringChunker, @@ -27,12 +29,18 @@ def test_string_chunker(self): "o wor", "world", ] + assert list(string_chunker.chunk("hello world"))[0] == Chunk( + "hello", Point(1, 0), Point(1, 4) + ) + string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0)) assert list(str(i) for i in 
string_chunker.chunk("hello world")) == [ "hello", " worl", "d", ] + chunks = list(string_chunker.chunk("hello world")) + assert chunks[1] == Chunk(" worl", Point(1, 5), Point(1, 9)) string_chunker = StringChunker(Config(chunk_size=5, overlap_ratio=0.8)) assert list(str(i) for i in string_chunker.chunk("hello world")) == [ @@ -89,6 +97,32 @@ def test_file_chunker(self): os.remove(tmp_file_name) + def test_file_chunker_positions(self): + test_content = ["first line\n", "second line\n", "third line"] + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as tmp_file: + tmp_file.writelines(test_content) + tmp_file_name = tmp_file.name + + # Test chunk positions + with open(tmp_file_name, "r") as f: + chunker = FileChunker(Config(chunk_size=10, overlap_ratio=0)) + chunks = list(chunker.chunk(f)) + + assert chunks[0].text == "first line" + assert chunks[0].start == Point(1, 0) + assert chunks[0].end == Point(1, 9) + + assert chunks[1].text == "\nsecond li" + assert chunks[1].start == Point(1, 10) + assert chunks[1].end == Point(2, 8) + + assert chunks[2].text == "ne\nthird l" + assert chunks[2].start == Point(2, 9) + assert chunks[2].end == Point(3, 6) + + os.remove(tmp_file_name) + def test_no_config(): assert StringChunker().config == Config() @@ -298,3 +332,43 @@ def test_treesitter_chunker_fallback(): assert tree_sitter_chunks == string_chunks os.remove(test_file) + + +def test_treesitter_chunker_positions(): + """Test that TreeSitterChunker produces correct start/end positions for chunks.""" + chunker = TreeSitterChunker(Config(chunk_size=15)) + + test_content = """\ +def foo(): + return 1 + \\ + 2 + +@decorator +def bar(): + return "bar" +""" + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".py") as tmp_file: + tmp_file.write(test_content) + test_file = tmp_file.name + + chunks = list(chunker.chunk(test_file)) + + # Verify chunks and their positions + assert len(chunks) >= 2 # Should have at least 2 chunks + + # First chunk should contain 
the function definition start + assert "deffoo():" in chunks[0].text + assert chunks[0].start == Point(1, 0) + + # Last chunk should contain the final return statement + assert 'return "bar"' in chunks[-1].text + assert chunks[-1].end.row == 7 + assert chunks[-1].end.column in (14, 15) # Allow 1-column difference + + # Verify positions are contiguous + for i in range(len(chunks) - 1): + assert chunks[i].end.row <= chunks[i + 1].start.row + if chunks[i].end.row == chunks[i + 1].start.row: + assert chunks[i].end.column <= chunks[i + 1].start.column + + os.remove(test_file) From 778423a64de00e5293dd968b67b05f75a57333ab Mon Sep 17 00:00:00 2001 From: Davidyz Date: Fri, 28 Mar 2025 15:24:04 +0000 Subject: [PATCH 04/11] feat(cli): make `CrossEncoderReranker` work with `chunk` --- src/vectorcode/cli_utils.py | 1 + src/vectorcode/subcommands/query/__init__.py | 33 ++++++++++++++------ src/vectorcode/subcommands/query/reranker.py | 13 +++++--- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py index e22bd016..bd660962 100644 --- a/src/vectorcode/cli_utils.py +++ b/src/vectorcode/cli_utils.py @@ -23,6 +23,7 @@ class QueryInclude(StrEnum): path = "path" document = "document" + chunk = "chunk" def to_header(self) -> str: """ diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index e6223ef2..9baa9cbd 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -7,7 +7,7 @@ from chromadb.errors import InvalidCollectionException, InvalidDimensionException from vectorcode.chunking import StringChunker -from vectorcode.cli_utils import Config, expand_globs, expand_path +from vectorcode.cli_utils import Config, QueryInclude, expand_globs, expand_path from vectorcode.common import ( get_client, get_collection, @@ -33,12 +33,14 @@ async def get_query_result_files( print("Empty collection!", file=sys.stderr) 
return [] try: - num_query = await collection.count() - if configs.query_multiplier > 0: - num_query = min( - int(configs.n_result * configs.query_multiplier), - await collection.count(), - ) + num_query = configs.n_result + if QueryInclude.chunk not in configs.include: + num_query = await collection.count() + if configs.query_multiplier > 0: + num_query = min( + int(configs.n_result * configs.query_multiplier), + await collection.count(), + ) if len(configs.query_exclude): filtered_files = {"path": {"$nin": configs.query_exclude}} else: @@ -71,6 +73,15 @@ async def get_query_result_files( async def query(configs: Config) -> int: + if ( + QueryInclude.chunk in configs.include + and QueryInclude.document in configs.include + ): + print( + "Having both chunk and document in the output is not supported!", + file=sys.stderr, + ) + return 1 client = await get_client(configs) try: collection = await get_collection(client, configs, False) @@ -101,14 +112,16 @@ async def query(configs: Config) -> int: for path in await get_query_result_files(collection, configs): if os.path.isfile(path): - with open(path) as fin: - document = fin.read() if configs.use_absolute_path: output_path = os.path.abspath(path) else: output_path = os.path.relpath(path, configs.project_root) + full_result = {"path": output_path} + if QueryInclude.document in configs.include: + with open(path) as fin: + document = fin.read() + full_result["document"] = document - full_result = {"path": output_path, "document": document} structured_result.append( {str(key): full_result[str(key)] for key in configs.include} ) diff --git a/src/vectorcode/subcommands/query/reranker.py b/src/vectorcode/subcommands/query/reranker.py index 9c2837c7..fcb3564b 100644 --- a/src/vectorcode/subcommands/query/reranker.py +++ b/src/vectorcode/subcommands/query/reranker.py @@ -6,11 +6,12 @@ import numpy from chromadb.api.types import QueryResult -from vectorcode.cli_utils import Config +from vectorcode.cli_utils import Config, 
QueryInclude class RerankerBase: def __init__(self, configs: Config, **kwargs: Any): + self.configs = configs self.n_result = configs.n_result @abstractmethod @@ -62,15 +63,19 @@ def rerank(self, results: QueryResult) -> list[str]: assert results["documents"] is not None documents: DefaultDict[str, list[float]] = defaultdict(list) for query_chunk_idx in range(len(self.query_chunks)): + chunk_ids = results["ids"][query_chunk_idx] chunk_metas = results["metadatas"][query_chunk_idx] chunk_docs = results["documents"][query_chunk_idx] ranks = self.model.rank( self.query_chunks[query_chunk_idx], chunk_docs, apply_softmax=True ) for rank in ranks: - documents[chunk_metas[rank["corpus_id"]]["path"]].append( - float(rank["score"]) - ) + if QueryInclude.chunk in self.configs.include: + documents[chunk_ids[rank["corpus_id"]]].append(float(rank["score"])) + else: + documents[chunk_metas[rank["corpus_id"]]["path"]].append( + float(rank["score"]) + ) top_k = int(numpy.mean(tuple(len(i) for i in documents.values()))) for key in documents.keys(): From d34e06c5fe88ed406d115951d1060fbea31b7241 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Sun, 30 Mar 2025 18:54:06 +0100 Subject: [PATCH 05/11] feat(cli): finished CLI mode (mostly). 
--- src/vectorcode/chunking.py | 9 +- src/vectorcode/lsp_main.py | 19 +--- src/vectorcode/subcommands/chunks.py | 2 +- src/vectorcode/subcommands/query/__init__.py | 94 ++++++++++++++------ src/vectorcode/subcommands/vectorise.py | 5 +- tests/subcommands/query/test_query.py | 12 ++- tests/test_lsp.py | 9 +- 7 files changed, 97 insertions(+), 53 deletions(-) diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py index fdc8f8aa..65327bf9 100644 --- a/src/vectorcode/chunking.py +++ b/src/vectorcode/chunking.py @@ -235,10 +235,11 @@ def chunk(self, data: str) -> Generator[Chunk, None, None]: """ assert os.path.isfile(data) with open(data) as fin: - content = fin.read() - if self.config.chunk_size < 0: - yield content - return + lines = fin.readlines() + content = "".join(lines) + if self.config.chunk_size < 0 and content: + yield Chunk(content, Point(1, 0), Point(len(lines), len(lines[-1]) - 1)) + return parser = None language = None lexer = self.__guess_type(data, content) diff --git a/src/vectorcode/lsp_main.py b/src/vectorcode/lsp_main.py index 3ad30c4b..465db61f 100644 --- a/src/vectorcode/lsp_main.py +++ b/src/vectorcode/lsp_main.py @@ -24,7 +24,7 @@ ) from vectorcode.common import get_client, get_collection, try_server from vectorcode.subcommands.ls import get_collection_list -from vectorcode.subcommands.query import get_query_result_files +from vectorcode.subcommands.query import build_query_results cached_project_configs: dict[str, Config] = {} DEFAULT_PROJECT_ROOT: str | None = None @@ -108,20 +108,9 @@ async def execute_command(ls: LanguageServer, args: list[str]): ) final_results = [] try: - for path in await get_query_result_files( - collection=collection, - configs=final_configs, - ): - if os.path.isfile(path): - with open(path) as fin: - output_path = path - if not final_configs.use_absolute_path: - output_path = os.path.relpath( - path, final_configs.project_root - ) - final_results.append( - {"path": output_path, "document": fin.read()} - 
) + final_results.extend( + await build_query_results(collection, final_configs) + ) finally: ls.progress.end( progress_token, diff --git a/src/vectorcode/subcommands/chunks.py b/src/vectorcode/subcommands/chunks.py index e01a90a2..2ee0e0c4 100644 --- a/src/vectorcode/subcommands/chunks.py +++ b/src/vectorcode/subcommands/chunks.py @@ -9,5 +9,5 @@ async def chunks(configs: Config) -> int: result = [] for file_path in configs.files: result.append(list(chunker.chunk(str(file_path)))) - print(json.dumps(result)) + print(json.dumps(str(result))) return 0 diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 9baa9cbd..e775aec8 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -2,6 +2,7 @@ import os import sys +from chromadb import GetResult from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException @@ -33,18 +34,20 @@ async def get_query_result_files( print("Empty collection!", file=sys.stderr) return [] try: + if len(configs.query_exclude): + filtered_files: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} + else: + filtered_files = {} num_query = configs.n_result - if QueryInclude.chunk not in configs.include: + if QueryInclude.chunk in configs.include: + filtered_files["start"] = {"$gte": 0} + else: num_query = await collection.count() if configs.query_multiplier > 0: num_query = min( int(configs.n_result * configs.query_multiplier), await collection.count(), ) - if len(configs.query_exclude): - filtered_files = {"path": {"$nin": configs.query_exclude}} - else: - filtered_files = None results = await collection.query( query_texts=query_chunks, n_results=num_query, @@ -72,6 +75,64 @@ async def get_query_result_files( return aggregated_results +async def build_query_results( + collection: AsyncCollection, configs: 
Config +) -> list[dict[str, str | int]]: + structured_result = [] + for identifier in await get_query_result_files(collection, configs): + if os.path.isfile(identifier): + if configs.use_absolute_path: + output_path = os.path.abspath(identifier) + else: + output_path = os.path.relpath(identifier, configs.project_root) + full_result = {"path": output_path} + with open(identifier) as fin: + document = fin.read() + full_result["document"] = document + + structured_result.append( + {str(key): full_result[str(key)] for key in configs.include} + ) + elif QueryInclude.chunk in configs.include: + chunk: GetResult = await collection.get( + identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents] + ) + meta = chunk.get( + "metadatas", + ) + if meta is not None and len(meta) != 0: + full_result: dict[str, str | int] = { + "chunk": str(chunk.get("documents", [""])[0]) + } + if meta[0].get("start") is not None and meta[0].get("end") is not None: + path = str(meta[0].get("path")) + with open(path) as fin: + start: int = meta[0]["start"] + end: int = meta[0]["end"] + full_result["chunk"] = "".join(fin.readlines()[start : end + 1]) + full_result["start_line"] = start + full_result["end_line"] = end + full_result["path"] = str( + meta[0]["path"] + if configs.use_absolute_path + else os.path.relpath(meta[0]["path"], str(configs.project_root)) + ) + + structured_result.append(full_result) + else: + print( + "This collection doesn't support chunk-mode output because it lacks the necessary metadata. Please re-vectorise it.", + file=sys.stderr, + ) + + else: + print( + f"{identifier} is no longer a valid file! 
Please re-run vectorcode vectorise to refresh the database.", + file=sys.stderr, + ) + return structured_result + + async def query(configs: Config) -> int: if ( QueryInclude.chunk in configs.include @@ -108,28 +169,7 @@ async def query(configs: Config) -> int: if not configs.pipe: print("Starting querying...") - structured_result = [] - - for path in await get_query_result_files(collection, configs): - if os.path.isfile(path): - if configs.use_absolute_path: - output_path = os.path.abspath(path) - else: - output_path = os.path.relpath(path, configs.project_root) - full_result = {"path": output_path} - if QueryInclude.document in configs.include: - with open(path) as fin: - document = fin.read() - full_result["document"] = document - - structured_result.append( - {str(key): full_result[str(key)] for key in configs.include} - ) - else: - print( - f"{path} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.", - file=sys.stderr, - ) + structured_result = await build_query_results(collection, configs) if configs.pipe: print(json.dumps(structured_result)) diff --git a/src/vectorcode/subcommands/vectorise.py b/src/vectorcode/subcommands/vectorise.py index d636edae..05061007 100644 --- a/src/vectorcode/subcommands/vectorise.py +++ b/src/vectorcode/subcommands/vectorise.py @@ -65,8 +65,9 @@ async def chunked_add( for chunk in chunks: meta: dict[str, str | dict[str, int]] = {"path": full_path_str} if isinstance(chunk, Chunk): - meta["start"] = {"row": chunk.start.row, "col": chunk.start.column} - meta["end"] = {"row": chunk.end.row, "col": chunk.end.column} + meta["start"] = chunk.start.row + meta["end"] = chunk.end.row + metas.append(meta) async with collection_lock: for idx in range(0, len(chunks), max_batch_size): diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 080dce42..9d54d623 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -5,7 +5,7 @@ 
from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException -from vectorcode.cli_utils import Config, QueryInclude +from vectorcode.cli_utils import CliAction, Config, QueryInclude from vectorcode.subcommands.query import get_query_result_files, query @@ -71,7 +71,7 @@ async def test_get_query_result_files(mock_collection, mock_config): assert IncludeEnum.metadatas in kwargs["include"] assert IncludeEnum.distances in kwargs["include"] assert IncludeEnum.documents in kwargs["include"] - assert kwargs["where"] is None # Since query_exclude is empty + assert not kwargs["where"] # Since query_exclude is empty # Check reranker was used correctly MockReranker.assert_called_once_with(mock_config) @@ -444,3 +444,11 @@ async def test_query_invalid_ef(mock_config): # Verify the function returns error code assert result == 1 + + +@pytest.mark.asyncio +async def test_query_invalid_include(): + faulty_config = Config( + action=CliAction.query, include=[QueryInclude.chunk, QueryInclude.document] + ) + assert await query(faulty_config) != 0 diff --git a/tests/test_lsp.py b/tests/test_lsp.py index 8b6ddd26..f315c4b1 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -4,7 +4,7 @@ from pygls.server import LanguageServer from vectorcode import __version__ -from vectorcode.cli_utils import CliAction, Config +from vectorcode.cli_utils import CliAction, Config, QueryInclude from vectorcode.lsp_main import ( execute_command, lsp_start, @@ -23,13 +23,18 @@ def mock_language_server(): @pytest.fixture def mock_config(): - config = MagicMock(spec=Config) + # config = MagicMock(spec=Config) + config = Config() config.host = "localhost" config.port = 8000 config.action = CliAction.query config.project_root = "/test/project" config.use_absolute_path = True config.pipe = False + config.overlap_ratio = 0.2 + config.query_exclude = [] + config.include = [QueryInclude.path] + config.query_multipler = 10 return config From 
d13d5412b7e8552b4155e1c77ef6ba9aeecdbe7b Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 09:57:12 +0100 Subject: [PATCH 06/11] test(cli): fix broken test --- tests/test_lsp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lsp.py b/tests/test_lsp.py index f315c4b1..823a0f75 100644 --- a/tests/test_lsp.py +++ b/tests/test_lsp.py @@ -88,7 +88,7 @@ async def test_execute_command_query(mock_language_server, mock_config): patch("vectorcode.lsp_main.get_client", new_callable=AsyncMock), patch("vectorcode.lsp_main.get_collection", new_callable=AsyncMock), patch( - "vectorcode.lsp_main.get_query_result_files", new_callable=AsyncMock + "vectorcode.lsp_main.build_query_results", new_callable=AsyncMock ) as mock_get_query_result_files, patch("os.path.isfile", return_value=True), patch("vectorcode.lsp_main.try_server", return_value=True), From e88214032d8bdb62bb8519abf2eca7d0ee72573a Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 12:59:19 +0100 Subject: [PATCH 07/11] feat(cli): implement chunk mode support for NaiveReranker --- src/vectorcode/subcommands/query/reranker.py | 10 +++- tests/subcommands/query/test_reranker.py | 58 +++++++++++++++++++- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/subcommands/query/reranker.py b/src/vectorcode/subcommands/query/reranker.py index fcb3564b..335ff3f2 100644 --- a/src/vectorcode/subcommands/query/reranker.py +++ b/src/vectorcode/subcommands/query/reranker.py @@ -28,16 +28,20 @@ def rerank(self, results: QueryResult) -> list[str]: assert results["distances"] is not None documents: DefaultDict[str, list[float]] = defaultdict(list) for query_chunk_idx in range(len(results["ids"])): + chunk_ids = results["ids"][query_chunk_idx] chunk_metas = results["metadatas"][query_chunk_idx] chunk_distances = results["distances"][query_chunk_idx] # NOTE: distances, smaller is better. 
paths = [str(meta["path"]) for meta in chunk_metas] assert len(paths) == len(chunk_distances) - for distance, path in zip(chunk_distances, paths): - if path is None: + for distance, identifier in zip( + chunk_distances, + chunk_ids if QueryInclude.chunk in self.configs.include else paths, + ): + if identifier is None: # so that vectorcode doesn't break on old collections. continue - documents[path].append(distance) + documents[identifier].append(distance) top_k = int(numpy.mean(tuple(len(i) for i in documents.values()))) for key in documents.keys(): diff --git a/tests/subcommands/query/test_reranker.py b/tests/subcommands/query/test_reranker.py index 49dbfb1d..f715527a 100644 --- a/tests/subcommands/query/test_reranker.py +++ b/tests/subcommands/query/test_reranker.py @@ -2,7 +2,7 @@ import pytest -from vectorcode.cli_utils import Config +from vectorcode.cli_utils import Config, QueryInclude from vectorcode.subcommands.query.reranker import ( CrossEncoderReranker, NaiveReranker, @@ -153,3 +153,59 @@ def test_naive_reranker_document_selection_logic(config): assert len(result) > 0 # Common files should be present assert "file2.py" in result or "file3.py" in result + + +def test_naive_reranker_with_chunk_ids(config): + """Test NaiveReranker returns chunk IDs when QueryInclude.chunk is set""" + config.include.append( + QueryInclude.chunk + ) # Assuming QueryInclude.chunk would be "chunk" + query_result = { + "ids": [["id1", "id2"], ["id3", "id1"]], + "distances": [[0.1, 0.2], [0.3, 0.4]], + "metadatas": [ + [{"path": "file1.py"}, {"path": "file2.py"}], + [{"path": "file3.py"}, {"path": "file1.py"}], + ], + } + reranker = NaiveReranker(config) + result = reranker.rerank(query_result) + + assert isinstance(result, list) + assert len(result) <= config.n_result + assert all(isinstance(id, str) for id in result) + assert all(id.startswith("id") for id in result) # Verify IDs not paths + + +@patch("sentence_transformers.CrossEncoder") +def 
test_cross_encoder_reranker_with_chunk_ids( + mock_cross_encoder, config, query_chunks +): + """Test CrossEncoderReranker returns chunk IDs when QueryInclude.chunk is set""" + mock_model = MagicMock() + mock_cross_encoder.return_value = mock_model + mock_model.rank.return_value = [ + {"corpus_id": 0, "score": 0.9}, + {"corpus_id": 1, "score": 0.7}, + ] + + config.include = {"chunk"} # Use comma instead of append + reranker = CrossEncoderReranker( + config, query_chunks, "cross-encoder/ms-marco-MiniLM-L-6-v2" + ) + + # Match query_chunks length with results + result = reranker.rerank( + { + "ids": [["id1", "id2"], ["id3", "id4"]], # Two query chunks + "metadatas": [ + [{"path": "file1.py"}, {"path": "file2.py"}], + [{"path": "file3.py"}, {"path": "file4.py"}], + ], + "documents": [["doc1", "doc2"], ["doc3", "doc4"]], + } + ) + + assert isinstance(result, list) + assert all(isinstance(id, str) for id in result) + assert all(id in ["id1", "id2", "id3", "id4"] for id in result) From 47c28baf0ceb3014cdff6eff328c51a7862888a1 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:16:02 +0100 Subject: [PATCH 08/11] test(cli): test for `build_query_results` --- tests/subcommands/query/test_query.py | 98 +++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 4 deletions(-) diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 9d54d623..170bbad5 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -1,12 +1,17 @@ -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, mock_open, patch import pytest +from chromadb import GetResult from chromadb.api.models.AsyncCollection import AsyncCollection from chromadb.api.types import IncludeEnum from chromadb.errors import InvalidCollectionException, InvalidDimensionException from vectorcode.cli_utils import CliAction, Config, QueryInclude -from vectorcode.subcommands.query import 
get_query_result_files, query +from vectorcode.subcommands.query import ( + build_query_results, + get_query_result_files, + query, +) @pytest.fixture @@ -17,8 +22,16 @@ def mock_collection(): "ids": [["id1", "id2", "id3"], ["id4", "id5", "id6"]], "distances": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], "metadatas": [ - [{"path": "file1.py"}, {"path": "file2.py"}, {"path": "file3.py"}], - [{"path": "file2.py"}, {"path": "file4.py"}, {"path": "file3.py"}], + [ + {"path": "file1.py", "start": 1, "end": 1}, + {"path": "file2.py", "start": 1, "end": 1}, + {"path": "file3.py", "start": 1, "end": 1}, + ], + [ + {"path": "file2.py", "start": 1, "end": 1}, + {"path": "file4.py", "start": 1, "end": 1}, + {"path": "file3.py", "start": 1, "end": 1}, + ], ], "documents": [ ["content1", "content2", "content3"], @@ -83,6 +96,83 @@ async def test_get_query_result_files(mock_collection, mock_config): assert result == ["file1.py", "file2.py", "file3.py"] +@pytest.mark.asyncio +async def test_get_query_result_files_include_chunk(mock_collection, mock_config): + """Test get_query_result_files when QueryInclude.chunk is included.""" + mock_config.include = [QueryInclude.chunk] # Include chunk + + with patch("vectorcode.subcommands.query.reranker.NaiveReranker") as MockReranker: + mock_reranker_instance = MagicMock() + mock_reranker_instance.rerank.return_value = ["chunk1"] + MockReranker.return_value = mock_reranker_instance + + await get_query_result_files(mock_collection, mock_config) + + # Check query call includes where clause for chunks + mock_collection.query.assert_called_once() + _, kwargs = mock_collection.query.call_args + # Line 43: Check the 'if' condition branch + assert kwargs["where"] == {"start": {"$gte": 0}} + assert kwargs["n_results"] == 3 # n_result should be used directly + + +@pytest.mark.asyncio +async def test_build_query_results_chunk_mode_success(mock_collection, mock_config): + """Test build_query_results in chunk mode successfully retrieves chunk details.""" + 
mock_config.include = [QueryInclude.chunk, QueryInclude.path] + mock_config.project_root = "/test/project" + mock_config.use_absolute_path = False + identifier = "chunk_id_1" + file_path = "/test/project/subdir/file1.py" + relative_path = "subdir/file1.py" + start_line = 5 + end_line = 10 + + full_file_content_lines = [f"line {i}\n" for i in range(15)] + full_file_content = "".join(full_file_content_lines) + + expected_chunk_content = "".join(full_file_content_lines[start_line : end_line + 1]) + + mock_get_result = GetResult( + ids=[identifier], + embeddings=None, + documents=["original chunk doc in db"], + metadatas=[{"path": file_path, "start": start_line, "end": end_line}], + ) + + with ( + patch( + "vectorcode.subcommands.query.get_query_result_files", + return_value=[identifier], + ), + patch("os.path.isfile", return_value=False), + patch("builtins.open", mock_open(read_data=full_file_content)) as mocked_open, + patch("os.path.relpath", return_value=relative_path) as mock_relpath, + ): + mock_collection.get = AsyncMock(return_value=mock_get_result) + + results = await build_query_results(mock_collection, mock_config) + + mock_collection.get.assert_called_once_with( + identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents] + ) + + mocked_open.assert_called_once_with(file_path) + + mock_relpath.assert_called_once_with(file_path, str(mock_config.project_root)) + + assert len(results) == 1 + + expected_full_result = { + "path": relative_path, + "chunk": expected_chunk_content, + "start_line": start_line, + "end_line": end_line, + } + + assert results[0] == expected_full_result + + @pytest.mark.asyncio async def test_get_query_result_files_with_query_exclude(mock_collection, mock_config): # Setup query_exclude From 4673d0852a55530b928e6bd49a059bbd34c2c2db Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:39:08 +0100 Subject: [PATCH 09/11] feat(cli): make sure it fallback to `document` mode when range metadata is not available --- 
src/vectorcode/subcommands/query/__init__.py | 18 +++++++-- tests/subcommands/query/test_query.py | 39 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index e775aec8..018c7dd0 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -35,12 +35,12 @@ async def get_query_result_files( return [] try: if len(configs.query_exclude): - filtered_files: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} + filter: dict[str, dict] = {"path": {"$nin": configs.query_exclude}} else: - filtered_files = {} + filter = {} num_query = configs.n_result if QueryInclude.chunk in configs.include: - filtered_files["start"] = {"$gte": 0} + filter["start"] = {"$gte": 0} else: num_query = await collection.count() if configs.query_multiplier > 0: @@ -56,7 +56,7 @@ async def get_query_result_files( IncludeEnum.distances, IncludeEnum.documents, ], - where=filtered_files, + where=filter or None, ) except IndexError: # no results found @@ -169,6 +169,16 @@ async def query(configs: Config) -> int: if not configs.pipe: print("Starting querying...") + if QueryInclude.chunk in configs.include: + if len((await collection.get(where={"start": {"$gte": 0}}))["ids"]) == 0: + print( + """ +This collection doesn't contain line range metadata. Falling back to `--include path document`. 
+Please re-vectorise it to use `--include chunk`.""", + file=sys.stderr, + ) + configs.include = [QueryInclude.path, QueryInclude.document] + structured_result = await build_query_results(collection, configs) if configs.pipe: diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py index 170bbad5..a98d63ab 100644 --- a/tests/subcommands/query/test_query.py +++ b/tests/subcommands/query/test_query.py @@ -542,3 +542,42 @@ async def test_query_invalid_include(): action=CliAction.query, include=[QueryInclude.chunk, QueryInclude.document] ) assert await query(faulty_config) != 0 + + +@pytest.mark.asyncio +async def test_query_chunk_mode_no_metadata_fallback(mock_config): + mock_config.include = [QueryInclude.chunk, QueryInclude.path] + mock_client = AsyncMock() + mock_collection = AsyncMock() + + # Mock collection.get to return no IDs for the metadata check + mock_collection.get.return_value = {"ids": []} + + with ( + patch("vectorcode.subcommands.query.get_client", return_value=mock_client), + patch( + "vectorcode.subcommands.query.get_collection", return_value=mock_collection + ), + patch("vectorcode.subcommands.query.verify_ef", return_value=True), + patch("vectorcode.subcommands.query.build_query_results") as mock_build_results, + patch("sys.stderr") as mock_stderr, + ): + mock_build_results.return_value = [] # Return empty results for simplicity + + result = await query(mock_config) + + assert result == 0 + + # Verify the metadata check call + mock_collection.get.assert_called_once_with(where={"start": {"$gte": 0}}) + + # Verify the warning was printed + assert mock_stderr.write.call_count > 0 + call_args, _ = mock_stderr.write.call_args_list[0] + assert "Falling back to `--include path document`" in call_args[0] + + # Verify build_query_results was called with the *modified* config + mock_build_results.assert_called_once() + args, _ = mock_build_results.call_args + _, called_config = args + assert called_config.include == 
[QueryInclude.path, QueryInclude.document] From 8b7799fdb4a28de181dc131fe822c1b0b9ccb0c1 Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:45:37 +0100 Subject: [PATCH 10/11] fix(cli): do not output `path` when not specified from the arguments --- src/vectorcode/subcommands/query/__init__.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py index 018c7dd0..4d23306b 100644 --- a/src/vectorcode/subcommands/query/__init__.py +++ b/src/vectorcode/subcommands/query/__init__.py @@ -112,11 +112,14 @@ async def build_query_results( full_result["chunk"] = "".join(fin.readlines()[start : end + 1]) full_result["start_line"] = start full_result["end_line"] = end - full_result["path"] = str( - meta[0]["path"] - if configs.use_absolute_path - else os.path.relpath(meta[0]["path"], str(configs.project_root)) - ) + if QueryInclude.path in configs.include: + full_result["path"] = str( + meta[0]["path"] + if configs.use_absolute_path + else os.path.relpath( + meta[0]["path"], str(configs.project_root) + ) + ) structured_result.append(full_result) else: From 3d0b6db47c53cacbad3eab651101135bc48067ee Mon Sep 17 00:00:00 2001 From: Davidyz Date: Mon, 31 Mar 2025 15:56:47 +0100 Subject: [PATCH 11/11] docs(cli): documentation about `--include chunk` --- docs/cli.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/docs/cli.md b/docs/cli.md index f5eec174..15528586 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -331,6 +331,16 @@ vectorcode query foo bar --include path This will only include the `path` in the output. This is effective for both normal CLI usage and [`--pipe` mode](#for-developers). +For some applications, it may be overkill to use the full document as context +and all you need is the chunks. You can do this by using `--include chunk` or +`--include chunk path` in the command. 
This will return chunks from the
+document, and in `pipe` mode the objects will also include the line numbers of
+the first and last lines in the chunk. Note that `chunk` and `document` cannot be used at
+the same time, and the number of query results (the `-n` parameter) will refer to
+the number of retrieved chunks when you use `--include chunk`. For the sake of
+completeness, the first and last lines of a chunk will be extended to include
+the whole lines if the chunker broke the text from mid-line.
+
 ### Listing All Collections
 
 You can use `vectorcode ls` command to list all collections in your ChromaDB.
@@ -433,6 +443,25 @@ For the query command, here's the format printed in the `pipe` mode:
 Basically an array of dictionaries with 2 keys: `"path"` for the path to the
 document, and `"document"` for the content of the document.
 
+If you used `--include chunk path` parameters, the array will look like this:
+```json
+[
+  {
+    "path": "path_to_your_code.py",
+    "chunk": "foo",
+    "start_line": 1,
+    "end_line": 1
+  },
+  {
+    "path": "path_to_another_file.py",
+    "chunk": "bar",
+    "start_line": 1,
+    "end_line": 1
+  }
+]
+```
+Keep in mind that both `start_line` and `end_line` are inclusive.
+
 #### `vectorcode vectorise`
 The output is in JSON format. It contains a dictionary with the following fields:
 - `"add"`: number of added documents;