-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_chunker.py
More file actions
70 lines (57 loc) · 2.76 KB
/
test_chunker.py
File metadata and controls
70 lines (57 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""Tests for semantic text chunking."""
from qp_vault.core.chunker import ChunkerConfig, chunk_text, estimate_tokens
class TestEstimateTokens:
    """Sanity checks for ``estimate_tokens`` on empty, one-word and sentence input."""

    def test_empty(self):
        """The empty string must estimate to exactly zero tokens."""
        count = estimate_tokens("")
        assert count == 0

    def test_single_word(self):
        """A lone word must estimate to at least one token."""
        assert estimate_tokens("hello") >= 1

    def test_sentence(self):
        """A nine-word sentence must fall inside a loose 5..15 token window."""
        sentence = "The quick brown fox jumps over the lazy dog"
        estimate = estimate_tokens(sentence)
        # Bounds are deliberately wide: the test pins the order of magnitude,
        # not the exact heuristic used by the estimator.
        lower, upper = 5, 15
        assert lower <= estimate <= upper
class TestChunkText:
    """Tests for ``chunk_text`` using both the default and custom ``ChunkerConfig``."""

    def test_empty_text(self):
        """Empty and whitespace-only input must produce no chunks."""
        assert chunk_text("") == []
        assert chunk_text(" ") == []

    def test_short_text_single_chunk(self):
        """A one-sentence input must come back as a single chunk with index 0."""
        text = "This is a short paragraph."
        chunks = chunk_text(text)
        assert len(chunks) == 1
        assert chunks[0].chunk_index == 0
        assert "short paragraph" in chunks[0].content

    def test_preserves_content(self):
        """Every paragraph of the input must appear somewhere in the chunk contents."""
        text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph."
        chunks = chunk_text(text)
        combined = " ".join(c.content for c in chunks)
        # Check the middle paragraph too, so dropped interior content is caught
        # and not only missing head/tail text.
        for ordinal in ("First", "Second", "Third"):
            assert f"{ordinal} paragraph" in combined

    def test_respects_max_tokens(self):
        """Long input must be split, and no chunk may far exceed ``max_tokens``."""
        config = ChunkerConfig(target_tokens=50, max_tokens=100, min_tokens=10, overlap_tokens=10)
        # Create text that definitely exceeds max
        paragraphs = [
            f"Paragraph {i} with some filler text to increase token count." for i in range(50)
        ]
        text = "\n\n".join(paragraphs)
        chunks = chunk_text(text, config)
        assert len(chunks) > 1
        for chunk in chunks:
            # Allow some overshoot: the test tolerates up to 1.5x the configured max.
            assert chunk.token_count <= config.max_tokens * 1.5

    def test_chunk_indexes_sequential(self):
        """``chunk_index`` must equal each chunk's position in the returned list."""
        paragraphs = [f"Paragraph number {i} with content." for i in range(20)]
        text = "\n\n".join(paragraphs)
        config = ChunkerConfig(target_tokens=30, max_tokens=60, min_tokens=5, overlap_tokens=5)
        chunks = chunk_text(text, config)
        for i, chunk in enumerate(chunks):
            assert chunk.chunk_index == i

    def test_detects_section_titles(self):
        """Markdown headings in the input must give at least one chunk a section title."""
        text = "# Introduction\n\nThis is the intro.\n\n## Methods\n\nThis is the methods section."
        chunks = chunk_text(text)
        # At least one chunk should have a section title
        assert any(c.section_title for c in chunks)

    def test_small_trailing_chunk_merged(self):
        """A tiny final paragraph must not be left as a chunk of its own."""
        config = ChunkerConfig(target_tokens=100, min_tokens=50, max_tokens=200, overlap_tokens=10)
        # Create text where last paragraph is very small
        text = ("A " * 80) + "\n\n" + "tiny."
        chunks = chunk_text(text, config)
        # Assert unconditionally: guarding on ``len(chunks) == 1`` would let the
        # test pass without checking anything when the merge does not happen.
        assert chunks
        last_content = chunks[-1].content
        assert "tiny" in last_content
        # The final chunk must also carry text from the preceding paragraph
        # (the only source of a capital "A"), i.e. "tiny." was merged, not isolated.
        assert "A" in last_content