Skip to content

Commit f7e76ec

Browse files
KoderFPVclaudehappy-otter
committed
feat: Add semantic product search with LLM evaluation tests
- Implement semantic product search using Weaviate text_vector
- Add LLM-based search query extraction from conversation context
- Create LLM-as-judge evaluation framework for e2e testing
- Add translations for product search responses (en/pl)

Product Search:
- searchProductIdsInWeaviate for semantic search via nearText
- productsNode extracts query from full conversation, not just last message
- Returns formatted product list with prices, categories, stock status

Evaluation Framework:
- evaluator.ts: LLM-as-judge using Bielik model (score 1-5)
- conversationRunner.ts: Execute multi-turn conversations
- productSearch.e2e.test.ts: 11 scenarios (single-turn, multi-turn, edge cases)
- Separate vitest config for e2e tests (npm run test:eval)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
1 parent b1b7523 commit f7e76ec

24 files changed

+1202
-44
lines changed

.env.example

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ TEST_SERVER_URL=http://localhost:2137
2323

2424
# Ollama Configuration
2525
OLLAMA_URL=http://localhost:2142/v1
26-
OLLAMA_MODEL=speakleash/bielik-11b-v3.0-instruct:Q8_0
26+
OLLAMA_MODEL=mistral-small3.2:24b-instruct-2506-q8_0
2727

2828
# vLLM Qwen3-VL Configuration (Vision)
2929
VLLM_QWEN3_VL_URL=http://localhost:2141/v1

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,6 @@ weaviate-data
4343

4444
# Dev server logs
4545
/logs/
46+
47+
# Evaluation test results
48+
agents/__tests__/evaluation/last-run/

CLAUDE.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,12 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
4242
- **Return types**: Do not explicitly declare return types for functions - let TypeScript infer them automatically (e.g., `const add = (a: number, b: number) => a + b` instead of `const add = (a: number, b: number): number => a + b`)
4343
- **Always use braces**: Always use curly braces `{}` for if statements, even for single-line blocks (e.g., `if (condition) { return value; }` instead of `if (condition) return value;`)
4444
- **No magic numbers**: Extract numeric constants to named constants at the top of the file to make the code self-documenting (e.g., `const DEFAULT_PAGE_SIZE = 10;` instead of using `10` directly in code)
45-
- **User-facing strings must be translated**: All strings visible to users must be internationalized, including UI text, error messages from services, and API responses. Strings should never be hardcoded in components, services, or API handlers. Services must accept locale parameter to provide translated error messages that will be displayed to users.
45+
- **User-facing strings must be translated**: All strings visible to users must be internationalized. This includes:
46+
- UI text in components
47+
- Error messages from services and API responses
48+
- **AI agent responses** (chatNode, productsNode, etc.) - all messages returned to users must use translations from `messages/` files
49+
- Never hardcode user-facing strings in code - always use `getTranslations` or translation files
50+
- Services and agents must accept locale parameter and use it to fetch translated strings
4651
- This makes optional properties more concise and follows TypeScript best practices
4752

4853
## Documentation
@@ -596,9 +601,10 @@ make status # Check Docker services status
596601
# Logs and Debugging
597602
make logs # View Docker logs (follow mode)
598603

599-
# Testing
600-
make test # Run all tests
601-
make test-watch # Run tests in watch mode
604+
# Testing (always run with TEST_LOCALE=en)
605+
TEST_LOCALE=en make test # Run all tests
606+
TEST_LOCALE=en make test-watch # Run tests in watch mode
607+
TEST_LOCALE=en npm test # Alternative: run tests with npm
602608
make lint # Run ESLint
603609
make type-check # Run TypeScript type checking
604610

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import { executeChatGraphWithStream, IStreamCallback } from '@/agents/graph/chatGraph';
2+
import { IConversationTurn } from './evaluator';
3+
4+
export interface IConversationScenario {
5+
name: string;
6+
locale: string;
7+
turns: Array<{
8+
userMessage: string;
9+
validateResponse?: (response: string) => boolean;
10+
}>;
11+
expectedBehavior: string;
12+
}
13+
14+
export interface IConversationResult {
15+
scenario: IConversationScenario;
16+
conversation: IConversationTurn[];
17+
success: boolean;
18+
error?: string;
19+
}
20+
21+
const createNoopCallbacks = (): IStreamCallback => ({
22+
onToken: () => {},
23+
onComplete: () => {},
24+
onError: () => {},
25+
});
26+
27+
export const runConversation = async (
28+
scenario: IConversationScenario
29+
): Promise<IConversationResult> => {
30+
const conversation: IConversationTurn[] = [];
31+
const sessionId = `eval-${Date.now()}-${Math.random().toString(36).slice(2)}`;
32+
const callbacks = createNoopCallbacks();
33+
34+
const messages: Array<{ role: string; content: string }> = [];
35+
36+
for (const turn of scenario.turns) {
37+
messages.push({ role: 'user', content: turn.userMessage });
38+
conversation.push({ role: 'user', content: turn.userMessage });
39+
40+
const response = await executeChatGraphWithStream(
41+
sessionId,
42+
scenario.locale,
43+
messages,
44+
callbacks
45+
);
46+
47+
messages.push({ role: 'assistant', content: response });
48+
conversation.push({ role: 'assistant', content: response });
49+
50+
if (turn.validateResponse && !turn.validateResponse(response)) {
51+
return {
52+
scenario,
53+
conversation,
54+
success: false,
55+
error: `Response validation failed for turn: "${turn.userMessage}"`,
56+
};
57+
}
58+
}
59+
60+
return {
61+
scenario,
62+
conversation,
63+
success: true,
64+
};
65+
};
66+
67+
export const runMultipleConversations = async (
68+
scenarios: IConversationScenario[]
69+
): Promise<IConversationResult[]> => {
70+
const results: IConversationResult[] = [];
71+
72+
for (const scenario of scenarios) {
73+
const result = await runConversation(scenario);
74+
results.push(result);
75+
}
76+
77+
return results;
78+
};
Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
import { HumanMessage, SystemMessage } from '@langchain/core/messages';
2+
import { createOllamaClient } from '@/services/llm/llm.service';
3+
4+
const EVALUATOR_TEMPERATURE = 0.1;
5+
const EVALUATOR_MAX_TOKENS = 500;
6+
7+
export interface IEvaluationResult {
8+
score: number;
9+
reasoning: string;
10+
passed: boolean;
11+
}
12+
13+
export interface IConversationTurn {
14+
role: 'user' | 'assistant';
15+
content: string;
16+
}
17+
18+
export interface IEvaluationCriteria {
19+
name: string;
20+
description: string;
21+
weight: number;
22+
}
23+
24+
const createEvaluationPrompt = (
25+
criteria: IEvaluationCriteria[],
26+
expectedBehavior: string
27+
) => {
28+
const criteriaList = criteria
29+
.map((c, i) => `${i + 1}. ${c.name} (weight: ${c.weight}): ${c.description}`)
30+
.join('\n');
31+
32+
return `You are an AI evaluator. Your task is to evaluate a conversation between a user and an e-commerce shopping assistant.
33+
34+
EVALUATION CRITERIA:
35+
${criteriaList}
36+
37+
EXPECTED BEHAVIOR:
38+
${expectedBehavior}
39+
40+
SCORING INSTRUCTIONS:
41+
- Score each criterion from 1 to 5:
42+
1 = Very poor, completely fails the criterion
43+
2 = Poor, mostly fails with minor success
44+
3 = Acceptable, meets basic expectations
45+
4 = Good, exceeds expectations in some areas
46+
5 = Excellent, fully meets or exceeds all expectations
47+
48+
- Calculate weighted average score
49+
- Response MUST be in this exact JSON format:
50+
{
51+
"scores": [
52+
{"criterion": "criterion_name", "score": X, "reason": "brief explanation"}
53+
],
54+
"overall_score": X.X,
55+
"reasoning": "overall assessment",
56+
"passed": true/false
57+
}`;
58+
};
59+
60+
const formatConversationForEvaluation = (conversation: IConversationTurn[]) => {
61+
return conversation
62+
.map((turn) => `${turn.role.toUpperCase()}: ${turn.content}`)
63+
.join('\n\n');
64+
};
65+
66+
const parseEvaluationResponse = (response: string): IEvaluationResult => {
67+
const jsonMatch = response.match(/\{[\s\S]*\}/);
68+
if (!jsonMatch) {
69+
return {
70+
score: 1,
71+
reasoning: 'Failed to parse evaluation response',
72+
passed: false,
73+
};
74+
}
75+
76+
try {
77+
const parsed = JSON.parse(jsonMatch[0]);
78+
79+
return {
80+
score: parsed.overall_score,
81+
reasoning: parsed.reasoning,
82+
passed: parsed.passed,
83+
};
84+
} catch {
85+
const scoreMatch = response.match(/overall_score["\s:]+(\d+\.?\d*)/);
86+
const reasoningMatch = response.match(/reasoning["\s:]+["']([^"']+)["']/);
87+
const passedMatch = response.match(/passed["\s:]+(\w+)/);
88+
89+
const score = scoreMatch ? parseFloat(scoreMatch[1]) : 1;
90+
const reasoning = reasoningMatch ? reasoningMatch[1] : 'Failed to parse reasoning';
91+
const passed = passedMatch ? passedMatch[1] === 'true' : false;
92+
93+
return { score, reasoning, passed };
94+
}
95+
};
96+
97+
export const evaluateConversation = async (
98+
conversation: IConversationTurn[],
99+
criteria: IEvaluationCriteria[],
100+
expectedBehavior: string
101+
): Promise<IEvaluationResult> => {
102+
const llm = createOllamaClient(EVALUATOR_TEMPERATURE, EVALUATOR_MAX_TOKENS);
103+
104+
const systemPrompt = createEvaluationPrompt(criteria, expectedBehavior);
105+
const conversationText = formatConversationForEvaluation(conversation);
106+
107+
const response = await llm.invoke([
108+
new SystemMessage(systemPrompt),
109+
new HumanMessage(`CONVERSATION TO EVALUATE:\n\n${conversationText}`),
110+
]);
111+
112+
const content = response.content.toString();
113+
114+
return parseEvaluationResponse(content);
115+
};
116+
117+
export const defaultProductSearchCriteria: IEvaluationCriteria[] = [
118+
{
119+
name: 'Relevance',
120+
description: 'Does the assistant return products relevant to the user query?',
121+
weight: 3,
122+
},
123+
{
124+
name: 'Completeness',
125+
description: 'Does the response include necessary product details (name, price, category)?',
126+
weight: 2,
127+
},
128+
{
129+
name: 'Helpfulness',
130+
description: 'Is the assistant helpful in guiding the user to find products?',
131+
weight: 2,
132+
},
133+
{
134+
name: 'Accuracy',
135+
description: 'Are the product details accurate and properly formatted?',
136+
weight: 2,
137+
},
138+
{
139+
name: 'Natural Language',
140+
description: 'Is the response natural and easy to understand?',
141+
weight: 1,
142+
},
143+
];
144+
145+
export const defaultChatCriteria: IEvaluationCriteria[] = [
146+
{
147+
name: 'Appropriateness',
148+
description: 'Is the response appropriate for the user message?',
149+
weight: 3,
150+
},
151+
{
152+
name: 'Helpfulness',
153+
description: 'Does the assistant provide helpful information or guidance?',
154+
weight: 2,
155+
},
156+
{
157+
name: 'Coherence',
158+
description: 'Is the response coherent and logically structured?',
159+
weight: 2,
160+
},
161+
{
162+
name: 'Tone',
163+
description: 'Is the tone friendly and professional?',
164+
weight: 1,
165+
},
166+
];

0 commit comments

Comments
 (0)