Skip to content

Commit fdcea15

Browse files
committed
test: add comprehensive TF-IDF index tests
Add 26 test cases covering: - Tokenization (7 tests): basic tokenization, lowercase, punctuation removal, stopword filtering, underscore preservation, edge cases - TF-IDF Index (15 tests): index creation, vocabulary building, search functionality, relevance ranking, score ranges, empty queries, edge cases - TfidfDocument (2 tests): creation and immutability - Integration (2 tests): realistic tool name matching scenarios All tests passing, ensuring TF-IDF implementation is robust and reliable.
1 parent 472a35c commit fdcea15

File tree

1 file changed

+299
-0
lines changed

1 file changed

+299
-0
lines changed

tests/test_tfidf_index.py

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
"""Tests for TF-IDF index implementation"""
2+
3+
import pytest
4+
5+
from stackone_ai.utils.tfidf_index import TfidfDocument, TfidfIndex, tokenize
6+
7+
8+
class TestTokenize:
    """Unit tests for the tokenize() helper."""

    def test_basic_tokenization(self):
        """Two simple words come back lowercased, in order."""
        assert tokenize("Hello World") == ["hello", "world"]

    def test_lowercase_conversion(self):
        """Every emitted token is lowercase regardless of input casing."""
        for token in tokenize("UPPERCASE lowercase MiXeD"):
            assert token.islower()

    def test_punctuation_removal(self):
        """Punctuation characters never survive tokenization."""
        tokens = tokenize("Hello, world! How are you?")
        for mark in (",", "!", "?"):
            assert mark not in tokens

    def test_stopword_filtering(self):
        """Stopwords are dropped while content words survive."""
        tokens = tokenize("the quick brown fox and the lazy dog")
        # Stopwords must be filtered out.
        assert "the" not in tokens
        assert "and" not in tokens
        # Content words must remain.
        for word in ("quick", "brown", "fox", "lazy", "dog"):
            assert word in tokens

    def test_underscore_preservation(self):
        """Snake_case identifiers stay intact as a single token."""
        assert "hris_list_employees" in tokenize("hris_list_employees")

    def test_empty_string(self):
        """An empty input yields no tokens."""
        assert tokenize("") == []

    def test_only_stopwords(self):
        """Input made purely of stopwords yields no tokens."""
        assert tokenize("the a an and or but") == []
class TestTfidfIndex:
    """Unit tests for TfidfIndex build/search behaviour."""

    @pytest.fixture
    def sample_documents(self):
        """A small corpus of tool-like descriptions keyed by document id."""
        texts = {
            "doc1": "create new employee in hris system",
            "doc2": "list all employees from database",
            "doc3": "update employee information",
            "doc4": "delete employee record",
            "doc5": "search for candidates in ats",
            "doc6": "create job posting",
        }
        return [TfidfDocument(id=doc_id, text=text) for doc_id, text in texts.items()]

    def test_index_creation(self, sample_documents):
        """Building the index populates vocab, idf and docs consistently."""
        index = TfidfIndex()
        index.build(sample_documents)

        assert len(index.vocab) > 0
        assert len(index.idf) == len(index.vocab)
        assert len(index.docs) == len(sample_documents)

    def test_vocabulary_building(self, sample_documents):
        """Content words from the corpus end up in the vocabulary."""
        index = TfidfIndex()
        index.build(sample_documents)

        terms = list(index.vocab.keys())
        for needle in ("employee", "create", "hris"):
            assert any(needle in term for term in terms)

    def test_search_returns_results(self, sample_documents):
        """A matching query returns hits sorted by descending score."""
        index = TfidfIndex()
        index.build(sample_documents)

        hits = index.search("employee", k=5)

        assert len(hits) > 0
        scores = [hit.score for hit in hits]
        assert scores == sorted(scores, reverse=True)

    def test_search_relevance(self, sample_documents):
        """Employee-related documents dominate the top results."""
        index = TfidfIndex()
        index.build(sample_documents)

        hits = index.search("employee", k=5)

        # At least one employee doc must appear in the top three.
        top_ids = {hit.id for hit in hits[:3]}
        assert top_ids & {"doc1", "doc2", "doc3"}

    def test_search_with_multiple_terms(self, sample_documents):
        """A multi-term query ranks the doc containing all terms highly."""
        index = TfidfIndex()
        index.build(sample_documents)

        hits = index.search("create employee hris", k=5)

        assert len(hits) > 0
        # doc1 carries all three query terms, so it belongs in the top two.
        assert "doc1" in [hit.id for hit in hits[:2]]

    def test_search_limit(self, sample_documents):
        """The k parameter caps the number of results returned."""
        index = TfidfIndex()
        index.build(sample_documents)

        assert len(index.search("employee", k=2)) <= 2
        # A generous k is still bounded by the corpus size.
        assert len(index.search("employee", k=10)) <= len(sample_documents)

    def test_score_range(self, sample_documents):
        """Every score lies within the closed interval [0, 1]."""
        index = TfidfIndex()
        index.build(sample_documents)

        for hit in index.search("employee", k=10):
            assert 0.0 <= hit.score <= 1.0

    def test_empty_query(self, sample_documents):
        """An empty query produces no results."""
        index = TfidfIndex()
        index.build(sample_documents)

        assert index.search("", k=5) == []

    def test_no_matching_terms(self, sample_documents):
        """A query of out-of-vocabulary terms produces no results."""
        index = TfidfIndex()
        index.build(sample_documents)

        assert index.search("xyzabc", k=5) == []

    def test_stopword_query(self, sample_documents):
        """A query made solely of stopwords produces no results."""
        index = TfidfIndex()
        index.build(sample_documents)

        assert index.search("the and or", k=5) == []

    def test_empty_corpus(self):
        """An index built from no documents is empty and never matches."""
        index = TfidfIndex()
        index.build([])

        assert len(index.vocab) == 0
        assert len(index.docs) == 0
        assert index.search("test", k=5) == []

    def test_single_document(self):
        """A one-document corpus is searchable and scores above zero."""
        index = TfidfIndex()
        index.build([TfidfDocument(id="doc1", text="single document test")])

        hits = index.search("document", k=5)
        assert len(hits) == 1
        only = hits[0]
        assert only.id == "doc1"
        assert only.score > 0

    def test_duplicate_documents(self):
        """Documents sharing an id are both kept in the index."""
        index = TfidfIndex()
        index.build(
            [
                TfidfDocument(id="doc1", text="first document"),
                TfidfDocument(id="doc1", text="duplicate id"),
            ]
        )

        assert len(index.docs) == 2

    def test_case_insensitive_search(self, sample_documents):
        """Query casing has no effect on the ranked result ids."""
        index = TfidfIndex()
        index.build(sample_documents)

        lower, upper, mixed = (
            index.search(query, k=5)
            for query in ("employee", "EMPLOYEE", "EmPlOyEe")
        )

        assert len(lower) == len(upper) == len(mixed)
        assert [r.id for r in lower] == [r.id for r in upper]
        assert [r.id for r in lower] == [r.id for r in mixed]

    def test_special_characters_in_query(self, sample_documents):
        """Punctuation in a query is stripped before matching."""
        index = TfidfIndex()
        index.build(sample_documents)

        with_bang = index.search("employee!", k=5)
        assert len(with_bang) > 0

        plain = index.search("employee", k=5)
        # The stray "!" must not change the ranking.
        assert [r.id for r in with_bang] == [r.id for r in plain]

    def test_idf_calculation(self):
        """A corpus-wide term gets a lower IDF than a one-off term."""
        index = TfidfIndex()
        index.build(
            [
                TfidfDocument(id="doc1", text="common word appears everywhere"),
                TfidfDocument(id="doc2", text="common word appears here too"),
                TfidfDocument(id="doc3", text="common word and rare term"),
            ]
        )

        # "common" occurs in every document, "rare" in exactly one.
        common_id = index.vocab.get("common")
        rare_id = index.vocab.get("rare")

        if common_id is not None and rare_id is not None:
            assert index.idf[rare_id] > index.idf[common_id]
class TestTfidfDocument:
    """Unit tests for the TfidfDocument value object."""

    def test_document_creation(self):
        """Constructor stores id and text verbatim."""
        doc = TfidfDocument(id="test", text="test text")
        assert (doc.id, doc.text) == ("test", "test text")

    def test_document_immutability(self):
        """Fields cannot be reassigned after construction."""
        doc = TfidfDocument(id="test", text="test text")
        with pytest.raises(AttributeError):
            doc.id = "new_id"  # type: ignore
class TestTfidfIntegration:
    """End-to-end scenarios resembling real tool-name lookups."""

    def test_tool_name_matching(self):
        """Queries surface the expected tool documents."""
        corpus = [
            TfidfDocument(id="hris_create_employee", text="create employee hris system"),
            TfidfDocument(id="hris_list_employees", text="list employees hris system"),
            TfidfDocument(id="ats_create_candidate", text="create candidate ats system"),
            TfidfDocument(id="crm_list_contacts", text="list contacts crm system"),
        ]
        index = TfidfIndex()
        index.build(corpus)

        # An HRIS-flavoured query puts an HRIS tool in the top two.
        top_two = [hit.id for hit in index.search("employee hris", k=5)[:2]]
        assert "hris_create_employee" in top_two or "hris_list_employees" in top_two

        # A generic "create" query finds more than one create tool.
        hits = index.search("create", k=5)
        assert len(hits) > 0
        assert sum(1 for hit in hits if "create" in hit.id) >= 2

0 commit comments

Comments
 (0)