Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,16 @@ vectorcode query foo bar --include path
This will only include the `path` in the output. This is effective for both
normal CLI usage and [`--pipe` mode](#for-developers).

For some applications, it may be overkill to use the full document as context
and all you need is the chunks. You can do this by using `--include chunk` or
`--include chunk path` in the command. This will return chunks from the
document, and in `pipe` mode the objects will also include the line numbers of
the first and last lines in the chunk. Note that `chunk` and `document` cannot
be used at the same time, and the number of query results (the `-n` parameter)
will refer to the number of retrieved chunks when you use `--include chunk`. For
the sake of completeness, if the chunker broke the text mid-line, the first and
last lines of a chunk will be extended so that whole lines are included.

### Listing All Collections

You can use `vectorcode ls` command to list all collections in your ChromaDB.
Expand Down Expand Up @@ -433,6 +443,25 @@ For the query command, here's the format printed in the `pipe` mode:
Basically an array of dictionaries with 2 keys: `"path"` for the path to the
document, and `"document"` for the content of the document.

If you used `--include chunk path` parameters, the array will look like this:
```json
[
    {
        "path": "path_to_your_code.py",
        "chunk": "foo",
        "start_line": 1,
        "end_line": 1
    },
    {
        "path": "path_to_another_file.py",
        "chunk": "bar",
        "start_line": 1,
        "end_line": 1
    }
]
```
Keep in mind that both `start_line` and `end_line` are inclusive.

#### `vectorcode vectorise`
The output is in JSON format. It contains a dictionary with the following fields:
- `"add"`: number of added documents;
Expand Down
183 changes: 147 additions & 36 deletions src/vectorcode/chunking.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
import os
import re
from abc import abstractmethod
from dataclasses import dataclass
from functools import cache
from io import TextIOWrapper
from typing import Generator, Optional

from pygments.lexer import Lexer
from pygments.lexers import guess_lexer_for_filename
from pygments.util import ClassNotFound
from tree_sitter import Node
from tree_sitter import Node, Point
from tree_sitter_language_pack import get_parser

from vectorcode.cli_utils import Config


@dataclass
class Chunk:
    """A chunk of text cut from a document, together with its position.

    `start` and `end` are tree-sitter `Point`s (row/column pairs) locating the
    first and last characters of the chunk; per the query docs, both ends are
    inclusive. Rows are 1-indexed, cols are 0-indexed.
    """

    text: str  # raw chunk content
    start: Point  # position of the chunk's first character
    end: Point  # position of the chunk's last character (inclusive)

    def __str__(self) -> str:
        # str(chunk) yields the bare text, so a Chunk can stand in where a
        # plain-string chunk was previously expected.
        return self.text


class ChunkerBase:
def __init__(self, config: Optional[Config] = None) -> None:
if config is None:
Expand All @@ -24,7 +39,7 @@
self.config = config

@abstractmethod
def chunk(self, data) -> Generator[str, None, None]:
def chunk(self, data) -> Generator[Chunk, None, None]:
raise NotImplementedError


Expand All @@ -34,16 +49,25 @@
config = Config()
super().__init__(config)

def chunk(self, data: str) -> Generator[str, None, None]:
def chunk(self, data: str):
if self.config.chunk_size < 0:
yield data
yield Chunk(
text=data,
start=Point(row=1, column=0),
end=Point(row=1, column=len(data)),
)
else:
step_size = max(
1, int(self.config.chunk_size * (1 - self.config.overlap_ratio))
)
i = 0
while i < len(data):
yield data[i : i + self.config.chunk_size]
chunk_text = data[i : i + self.config.chunk_size]
yield Chunk(
text=chunk_text,
start=Point(row=1, column=i),
end=Point(row=1, column=i + len(chunk_text) - 1),
)
if i + self.config.chunk_size >= len(data):
break
i += step_size
Expand All @@ -55,24 +79,55 @@
config = Config()
super().__init__(config)

def chunk(self, data: TextIOWrapper) -> Generator[str, None, None]:
if self.config.chunk_size < 0:
yield "".join(data.readlines())
else:
step_size = max(
1, int(self.config.chunk_size * (1 - self.config.overlap_ratio))
def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]:
lines = data.readlines()
if len(lines) == 0:
return

Check warning on line 85 in src/vectorcode/chunking.py

View check run for this annotation

Codecov / codecov/patch

src/vectorcode/chunking.py#L85

Added line #L85 was not covered by tests
if (
self.config.chunk_size < 0
or sum(len(i) for i in lines) < self.config.chunk_size
):
text = "".join(lines)
yield Chunk(text, Point(1, 0), Point(1, len(text) - 1))
return

text = "".join(lines)
step_size = max(
1, int(self.config.chunk_size * (1 - self.config.overlap_ratio))
)

# Convert lines to absolute positions
line_offsets = [0]
for line in lines:
line_offsets.append(line_offsets[-1] + len(line))

i = 0
while i < len(text):
chunk_text = text[i : i + self.config.chunk_size]

# Find start position
start_line = (
next(ln for ln, offset in enumerate(line_offsets) if offset > i) - 1
)
# the output of this method should be identical to that of StringChunker.chunk
output = data.read(self.config.chunk_size)
yield output
if len(output) < self.config.chunk_size:
return
while True:
new_chars = data.read(step_size)
output = output[step_size:] + new_chars
yield output
if len(new_chars) < step_size:
return
start_col = i - line_offsets[start_line]

# Find end position
end_pos = i + len(chunk_text)
end_line = (
next(ln for ln, offset in enumerate(line_offsets) if offset >= end_pos)
- 1
)
end_col = end_pos - line_offsets[end_line] - 1

yield Chunk(
chunk_text,
Point(start_line + 1, start_col),
Point(end_line + 1, end_col),
)

if i + self.config.chunk_size >= len(text):
break
i += step_size


class TreeSitterChunker(ChunkerBase):
Expand All @@ -81,22 +136,77 @@
config = Config()
super().__init__(config)

def __chunk_node(self, node: Node, text: str) -> Generator[str, None, None]:
def __chunk_node(self, node: Node, text: str) -> Generator[Chunk, None, None]:
current_chunk = ""

current_start = None

for child in node.children:
child_length = child.end_byte - child.start_byte
child_text = text[child.start_byte : child.end_byte]
child_length = len(child_text)

if child_length > self.config.chunk_size:
# Yield current chunk if exists
if current_chunk:
yield current_chunk
assert current_start is not None
yield Chunk(
text=current_chunk,
start=current_start,
end=Point(
row=current_start.row + current_chunk.count("\n"),
column=len(current_chunk.split("\n")[-1]) - 1
if "\n" in current_chunk
else current_start.column + len(current_chunk) - 1,
),
)
current_chunk = ""
current_start = None

# Recursively chunk the large child node
yield from self.__chunk_node(child, text)
elif len(current_chunk) + child_length > self.config.chunk_size:
yield current_chunk
current_chunk = text[child.start_byte : child.end_byte]

elif not current_chunk:
# Start new chunk
current_chunk = child_text
current_start = Point(
row=child.start_point.row + 1, column=child.start_point.column
)

elif len(current_chunk) + child_length <= self.config.chunk_size:
# Add to current chunk
current_chunk += child_text

else:
current_chunk += text[child.start_byte : child.end_byte]
# Yield current chunk and start new one
assert current_start is not None
yield Chunk(
text=current_chunk,
start=current_start,
end=Point(
row=current_start.row + current_chunk.count("\n"),
column=len(current_chunk.split("\n")[-1]) - 1
if "\n" in current_chunk
else current_start.column + len(current_chunk) - 1,
),
)
current_chunk = child_text
current_start = Point(
row=child.start_point.row + 1, column=child.start_point.column
)

# Yield remaining chunk
if current_chunk:
yield current_chunk
assert current_start is not None
yield Chunk(
text=current_chunk,
start=current_start,
end=Point(
row=current_start.row + current_chunk.count("\n"),
column=len(current_chunk.split("\n")[-1]) - 1
if "\n" in current_chunk
else current_start.column + len(current_chunk) - 1,
),
)

@cache
def __guess_type(self, path: str, content: str) -> Optional[Lexer]:
Expand All @@ -119,16 +229,17 @@
return f"(?:{'|'.join(patterns)})"
return ""

def chunk(self, data: str) -> Generator[str, None, None]:
def chunk(self, data: str) -> Generator[Chunk, None, None]:
"""
data: path to the file
"""
assert os.path.isfile(data)
with open(data) as fin:
content = fin.read()
if self.config.chunk_size < 0:
yield content
return
lines = fin.readlines()
content = "".join(lines)
if self.config.chunk_size < 0 and content:
yield Chunk(content, Point(1, 0), Point(len(lines), len(lines[-1]) - 1))
return
parser = None
language = None
lexer = self.__guess_type(data, content)
Expand All @@ -155,7 +266,7 @@
if pattern_str:
re_pattern = re.compile(pattern_str)
for chunk in chunks_gen:
if re_pattern.match(chunk) is None:
if re_pattern.match(chunk.text) is None:
yield chunk
else:
yield from chunks_gen
1 change: 1 addition & 0 deletions src/vectorcode/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
class QueryInclude(StrEnum):
path = "path"
document = "document"
chunk = "chunk"

def to_header(self) -> str:
"""
Expand Down
19 changes: 4 additions & 15 deletions src/vectorcode/lsp_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)
from vectorcode.common import get_client, get_collection, try_server
from vectorcode.subcommands.ls import get_collection_list
from vectorcode.subcommands.query import get_query_result_files
from vectorcode.subcommands.query import build_query_results

cached_project_configs: dict[str, Config] = {}
DEFAULT_PROJECT_ROOT: str | None = None
Expand Down Expand Up @@ -108,20 +108,9 @@ async def execute_command(ls: LanguageServer, args: list[str]):
)
final_results = []
try:
for path in await get_query_result_files(
collection=collection,
configs=final_configs,
):
if os.path.isfile(path):
with open(path) as fin:
output_path = path
if not final_configs.use_absolute_path:
output_path = os.path.relpath(
path, final_configs.project_root
)
final_results.append(
{"path": output_path, "document": fin.read()}
)
final_results.extend(
await build_query_results(collection, final_configs)
)
finally:
ls.progress.end(
progress_token,
Expand Down
2 changes: 1 addition & 1 deletion src/vectorcode/subcommands/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ async def chunks(configs: Config) -> int:
result = []
for file_path in configs.files:
result.append(list(chunker.chunk(str(file_path))))
print(json.dumps(result))
print(json.dumps(str(result)))
return 0
Loading