Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions openrelik_ai_common/utils/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def _get_next_chunk(
Returns:
A tuple containing the next chunk (or None if end of file) and the updated offset.
"""
offset = int(offset)
if offset >= len(self.file_content):
return None, offset

Expand All @@ -173,10 +174,10 @@ def _get_next_chunk(
)

# Estimate the end character index based on available tokens
estimated_end_char = min(offset + available_tokens * 4, len(self.file_content))
estimated_end_char = int(min(offset + available_tokens * 4, len(self.file_content)))

# Find a suitable breakpoint for a clean chunk break
breakpoint = self._find_breakpoint(offset, estimated_end_char)
breakpoint = int(self._find_breakpoint(offset, estimated_end_char))

# Extract the chunk and update the offset
chunk = self.file_content[offset:breakpoint]
Expand Down
26 changes: 26 additions & 0 deletions tests/test_get_next_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# tests/test_get_next_chunk.py

# --- stub out the LLM interface needed by TextFileChunker ---
class DummyLLM:
    """Minimal stand-in for the LLM interface that TextFileChunker consumes.

    Exposes only what the chunker reads: a ``config`` mapping, a fixed
    token budget, and a trivially countable tokenizer.
    """

    def __init__(self):
        # Chunker reads model name / system instructions from this mapping.
        self.config = {"model": "dummy", "system_instructions": ""}

    def count_tokens(self, text):
        # Deliberately naive: one character == one token, so chunk
        # boundaries in tests are easy to reason about.
        return len(text)

    def get_max_input_tokens(self, model_name):
        # Fixed budget keeps chunk sizing deterministic across test runs.
        return 100

# --- now import and test ---
from openrelik_ai_common.utils.chunker import TextFileChunker

def test_get_next_chunk_int_cast():
    """Regression test: _get_next_chunk tolerates a float ``offset``.

    Before the int() casts were added in the chunker, passing
    ``offset=0.0`` raised a TypeError when the offset was used to slice
    ``file_content``. Verifies the returned offset is an int, matches the
    chunk length taken from position 0, and stays within the token budget.
    """
    content = "x" * 200
    llm = DummyLLM()
    chunker = TextFileChunker(prompt="p", file_content=content, llm=llm)

    chunk, new_offset = chunker._get_next_chunk(
        prompt="p",
        prompt_chunk_wrapper="",
        offset=0.0,
    )

    assert isinstance(new_offset, int)
    assert len(chunk) == new_offset
    assert new_offset <= llm.get_max_input_tokens("") * 4