Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,16 @@ vectorcode query foo bar --include path
This will only include the `path` in the output. This is effective for both
normal CLI usage and [`--pipe` mode](#for-developers).

For some applications, it may be overkill to use the full document as context
and all you need is the chunks. You can do this by using `--include chunk` or
`--include chunk path` in the command. This will return chunks from the
document, and in `pipe` mode the objects will also include the line numbers of
the first and last lines in the chunk. Note that `chunk` and `document` cannot
be used at the same time, and the number of query results (the `-n` parameter)
will refer to the number of retrieved chunks when you use `--include chunk`. For
the sake of completeness, if the chunker broke the text mid-line, the first and
last lines of a chunk will be extended so that whole lines are included.

### Listing All Collections

You can use `vectorcode ls` command to list all collections in your ChromaDB.
Expand Down Expand Up @@ -433,6 +443,25 @@ For the query command, here's the format printed in the `pipe` mode:
Basically an array of dictionaries with 2 keys: `"path"` for the path to the
document, and `"document"` for the content of the document.

If you used `--include chunk path` parameters, the array will look like this:
```json
[
    {
        "path": "path_to_your_code.py",
        "chunk": "foo",
        "start_line": 1,
        "end_line": 1
    },
    {
        "path": "path_to_another_file.py",
        "chunk": "bar",
        "start_line": 1,
        "end_line": 1
    }
]
```
Keep in mind that both `start_line` and `end_line` are inclusive.

#### `vectorcode vectorise`
The output is in JSON format. It contains a dictionary with the following fields:
- `"add"`: number of added documents;
Expand Down
183 changes: 147 additions & 36 deletions src/vectorcode/chunking.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
import os
import re
from abc import abstractmethod
from dataclasses import dataclass
from functools import cache
from io import TextIOWrapper
from typing import Generator, Optional

from pygments.lexer import Lexer
from pygments.lexers import guess_lexer_for_filename
from pygments.util import ClassNotFound
from tree_sitter import Node
from tree_sitter import Node, Point
from tree_sitter_language_pack import get_parser

from vectorcode.cli_utils import Config


@dataclass
class Chunk:
    """A chunk of text cut from a document, together with its position.

    `start` and `end` are tree-sitter `Point`s (row/column pairs) locating the
    first and last characters of the chunk; per the query docs, both ends are
    inclusive. Rows are 1-indexed, cols are 0-indexed.
    """

    text: str  # raw chunk content
    start: Point  # position of the chunk's first character
    end: Point  # position of the chunk's last character (inclusive)

    def __str__(self) -> str:
        # str(chunk) yields the bare text, so a Chunk can stand in where a
        # plain-string chunk was previously expected.
        return self.text


class ChunkerBase:
def __init__(self, config: Optional[Config] = None) -> None:
if config is None:
Expand All @@ -24,7 +39,7 @@
self.config = config

@abstractmethod
def chunk(self, data) -> Generator[str, None, None]:
def chunk(self, data) -> Generator[Chunk, None, None]:
raise NotImplementedError


Expand All @@ -34,16 +49,25 @@
config = Config()
super().__init__(config)

def chunk(self, data: str) -> Generator[str, None, None]:
def chunk(self, data: str):
if self.config.chunk_size < 0:
yield data
yield Chunk(
text=data,
start=Point(row=1, column=0),
end=Point(row=1, column=len(data)),
)
else:
step_size = max(
1, int(self.config.chunk_size * (1 - self.config.overlap_ratio))
)
i = 0
while i < len(data):
yield data[i : i + self.config.chunk_size]
chunk_text = data[i : i + self.config.chunk_size]
yield Chunk(
text=chunk_text,
start=Point(row=1, column=i),
end=Point(row=1, column=i + len(chunk_text) - 1),
)
if i + self.config.chunk_size >= len(data):
break
i += step_size
Expand All @@ -55,24 +79,55 @@
config = Config()
super().__init__(config)

def chunk(self, data: TextIOWrapper) -> Generator[str, None, None]:
if self.config.chunk_size < 0:
yield "".join(data.readlines())
else:
step_size = max(
1, int(self.config.chunk_size * (1 - self.config.overlap_ratio))
def chunk(self, data: TextIOWrapper) -> Generator[Chunk, None, None]:
lines = data.readlines()
if len(lines) == 0:
return

Check warning on line 85 in src/vectorcode/chunking.py

View check run for this annotation

Codecov / codecov/patch

src/vectorcode/chunking.py#L85

Added line #L85 was not covered by tests
if (
self.config.chunk_size < 0
or sum(len(i) for i in lines) < self.config.chunk_size
):
text = "".join(lines)
yield Chunk(text, Point(1, 0), Point(1, len(text) - 1))
return

text = "".join(lines)
step_size = max(
1, int(self.config.chunk_size * (1 - self.config.overlap_ratio))
)

# Convert lines to absolute positions
line_offsets = [0]
for line in lines:
line_offsets.append(line_offsets[-1] + len(line))

i = 0
while i < len(text):
chunk_text = text[i : i + self.config.chunk_size]

# Find start position
start_line = (
next(ln for ln, offset in enumerate(line_offsets) if offset > i) - 1
)
# the output of this method should be identical to that of StringChunker.chunk
output = data.read(self.config.chunk_size)
yield output
if len(output) < self.config.chunk_size:
return
while True:
new_chars = data.read(step_size)
output = output[step_size:] + new_chars
yield output
if len(new_chars) < step_size:
return
start_col = i - line_offsets[start_line]

# Find end position
end_pos = i + len(chunk_text)
end_line = (
next(ln for ln, offset in enumerate(line_offsets) if offset >= end_pos)
- 1
)
end_col = end_pos - line_offsets[end_line] - 1

yield Chunk(
chunk_text,
Point(start_line + 1, start_col),
Point(end_line + 1, end_col),
)

if i + self.config.chunk_size >= len(text):
break
i += step_size


class TreeSitterChunker(ChunkerBase):
Expand All @@ -81,22 +136,77 @@
config = Config()
super().__init__(config)

def __chunk_node(self, node: Node, text: str) -> Generator[str, None, None]:
def __chunk_node(self, node: Node, text: str) -> Generator[Chunk, None, None]:
current_chunk = ""

current_start = None

for child in node.children:
child_length = child.end_byte - child.start_byte
child_text = text[child.start_byte : child.end_byte]
child_length = len(child_text)

if child_length > self.config.chunk_size:
# Yield current chunk if exists
if current_chunk:
yield current_chunk
assert current_start is not None
yield Chunk(
text=current_chunk,
start=current_start,
end=Point(
row=current_start.row + current_chunk.count("\n"),
column=len(current_chunk.split("\n")[-1]) - 1
if "\n" in current_chunk
else current_start.column + len(current_chunk) - 1,
),
)
current_chunk = ""
current_start = None

# Recursively chunk the large child node
yield from self.__chunk_node(child, text)
elif len(current_chunk) + child_length > self.config.chunk_size:
yield current_chunk
current_chunk = text[child.start_byte : child.end_byte]

elif not current_chunk:
# Start new chunk
current_chunk = child_text
current_start = Point(
row=child.start_point.row + 1, column=child.start_point.column
)

elif len(current_chunk) + child_length <= self.config.chunk_size:
# Add to current chunk
current_chunk += child_text

else:
current_chunk += text[child.start_byte : child.end_byte]
# Yield current chunk and start new one
assert current_start is not None
yield Chunk(
text=current_chunk,
start=current_start,
end=Point(
row=current_start.row + current_chunk.count("\n"),
column=len(current_chunk.split("\n")[-1]) - 1
if "\n" in current_chunk
else current_start.column + len(current_chunk) - 1,
),
)
current_chunk = child_text
current_start = Point(
row=child.start_point.row + 1, column=child.start_point.column
)

# Yield remaining chunk
if current_chunk:
yield current_chunk
assert current_start is not None
yield Chunk(
text=current_chunk,
start=current_start,
end=Point(
row=current_start.row + current_chunk.count("\n"),
column=len(current_chunk.split("\n")[-1]) - 1
if "\n" in current_chunk
else current_start.column + len(current_chunk) - 1,
),
)

@cache
def __guess_type(self, path: str, content: str) -> Optional[Lexer]:
Expand All @@ -119,16 +229,17 @@
return f"(?:{'|'.join(patterns)})"
return ""

def chunk(self, data: str) -> Generator[str, None, None]:
def chunk(self, data: str) -> Generator[Chunk, None, None]:
"""
data: path to the file
"""
assert os.path.isfile(data)
with open(data) as fin:
content = fin.read()
if self.config.chunk_size < 0:
yield content
return
lines = fin.readlines()
content = "".join(lines)
if self.config.chunk_size < 0 and content:
yield Chunk(content, Point(1, 0), Point(len(lines), len(lines[-1]) - 1))
return
parser = None
language = None
lexer = self.__guess_type(data, content)
Expand All @@ -155,7 +266,7 @@
if pattern_str:
re_pattern = re.compile(pattern_str)
for chunk in chunks_gen:
if re_pattern.match(chunk) is None:
if re_pattern.match(chunk.text) is None:
yield chunk
else:
yield from chunks_gen
1 change: 1 addition & 0 deletions src/vectorcode/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
class QueryInclude(StrEnum):
path = "path"
document = "document"
chunk = "chunk"

def to_header(self) -> str:
"""
Expand Down
19 changes: 4 additions & 15 deletions src/vectorcode/lsp_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
)
from vectorcode.common import get_client, get_collection, try_server
from vectorcode.subcommands.ls import get_collection_list
from vectorcode.subcommands.query import get_query_result_files
from vectorcode.subcommands.query import build_query_results

cached_project_configs: dict[str, Config] = {}
DEFAULT_PROJECT_ROOT: str | None = None
Expand Down Expand Up @@ -108,20 +108,9 @@ async def execute_command(ls: LanguageServer, args: list[str]):
)
final_results = []
try:
for path in await get_query_result_files(
collection=collection,
configs=final_configs,
):
if os.path.isfile(path):
with open(path) as fin:
output_path = path
if not final_configs.use_absolute_path:
output_path = os.path.relpath(
path, final_configs.project_root
)
final_results.append(
{"path": output_path, "document": fin.read()}
)
final_results.extend(
await build_query_results(collection, final_configs)
)
finally:
ls.progress.end(
progress_token,
Expand Down
2 changes: 1 addition & 1 deletion src/vectorcode/subcommands/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ async def chunks(configs: Config) -> int:
result = []
for file_path in configs.files:
result.append(list(chunker.chunk(str(file_path))))
print(json.dumps(result))
print(json.dumps(str(result)))
return 0
Loading