@@ -5,7 +5,11 @@
 from typing import Dict, List, Optional, Union, Any, Tuple

 from typing_extensions import overload, override
-from azure.ai.evaluation._legacy.prompty import AsyncPrompty
+
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._evaluators._common._validators import ConversationValidator, ValidatorInterface
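For orientation, the new import switch can be exercised as below. This is only a sketch: the variable name and its "false" default come from the diff, the SimilarityEvaluator import is an arbitrary example of a consumer, and the environment variable has to be set before the evaluator module is first imported.

    import os

    # Opt in to the promptflow-backed AsyncPrompty; when unset or "false" (the default),
    # the in-package azure.ai.evaluation._legacy.prompty implementation is used instead.
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"

    from azure.ai.evaluation import SimilarityEvaluator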
@@ -169,8 +169,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict["llm_output"]
-# llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 score = math.nan
 if isinstance(llm_output, dict):
     score = llm_output.get("score", math.nan)
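The .get("llm_output", prompty_output_dict) fallback is worth spelling out. A minimal sketch with made-up payload shapes: if the flow result wraps the parsed JSON under "llm_output" the wrapped value is used, and if it returns the parsed JSON directly the whole dict is used, so the downstream isinstance check still finds the score either way.

    import math

    wrapped = {"llm_output": {"score": 4, "explanation": "..."}, "input_token_count": 120}
    bare = {"score": 4, "explanation": "..."}

    for prompty_output_dict in (wrapped, bare):
        # Fall back to the entire result when the wrapper key is absent.
        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
        score = llm_output.get("score", math.nan) if isinstance(llm_output, dict) else math.nan
        print(score)  # 4 in both cases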
@@ -197,7 +197,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 if not isinstance(eval_input["response"], str):
     eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
 result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = result.get("llm_output")
+llm_output = result.get("llm_output", result)
 score = math.nan

 if isinstance(llm_output, dict):
@@ -162,7 +162,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 )

 result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = result.get("llm_output") if isinstance(result, dict) else result
+llm_output = result.get("llm_output", result) if isinstance(result, dict) else result

 score = math.nan
 llm_output_is_dict = isinstance(llm_output, dict)
@@ -176,19 +176,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

 binary_result = self._get_binary_result(score)

+input_token_count = result.get("input_token_count", 0) if isinstance(result, dict) else 0
+output_token_count = result.get("output_token_count", 0) if isinstance(result, dict) else 0
+total_token_count = result.get("total_token_count", 0) if isinstance(result, dict) else 0
+finish_reason = result.get("finish_reason", "") if isinstance(result, dict) else ""
+model_id = result.get("model_id", "") if isinstance(result, dict) else ""
+sample_input = result.get("sample_input", "") if isinstance(result, dict) else ""
+sample_output = result.get("sample_output", "") if isinstance(result, dict) else ""
+
 # updating the result key and threshold to int based on the schema
 return {
     f"{self._result_key}": int(score),
     f"{self._result_key}_result": binary_result,
     f"{self._result_key}_threshold": int(self._threshold),
     f"{self._result_key}_reason": reason,
-    f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
-    f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
-    f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
-    f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
-    f"{self._result_key}_model": result.get("model_id", ""),
-    f"{self._result_key}_sample_input": result.get("sample_input", ""),
-    f"{self._result_key}_sample_output": result.get("sample_output", ""),
+    f"{self._result_key}_prompt_tokens": input_token_count,
+    f"{self._result_key}_completion_tokens": output_token_count,
+    f"{self._result_key}_total_tokens": total_token_count,
+    f"{self._result_key}_finish_reason": finish_reason,
+    f"{self._result_key}_model": model_id,
+    f"{self._result_key}_sample_input": sample_input,
+    f"{self._result_key}_sample_output": sample_output,
 }

 if logger:
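Hoisting the token counts and metadata into isinstance-guarded locals keeps the returned payload identical; only the lookups are hardened against a non-dict result. A sketch of the resulting shape, with an assumed result key of "relevance", made-up values, and "pass"/"fail" assumed as the binary result vocabulary:

    {
        "relevance": 4,
        "relevance_result": "pass",
        "relevance_threshold": 3,
        "relevance_reason": "The response addresses the query directly.",
        "relevance_prompt_tokens": 120,
        "relevance_completion_tokens": 35,
        "relevance_total_tokens": 155,
        "relevance_finish_reason": "stop",
        "relevance_model": "gpt-4o",
        "relevance_sample_input": "...",
        "relevance_sample_output": "...",
    }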
@@ -8,6 +8,7 @@
 from typing_extensions import overload, override

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget


 class SimilarityEvaluator(PromptyEvaluatorBase):
@@ -134,3 +135,37 @@ def __call__( # pylint: disable=docstring-missing-param
 :rtype: Dict[str, float]
 """
 return super().__call__(*args, **kwargs)
+
+@override
+def _convert_kwargs_to_eval_input(self, **kwargs):
+    """Convert keyword arguments to evaluation input, with validation."""
+    query = kwargs.get("query")
+    response = kwargs.get("response")
+    ground_truth = kwargs.get("ground_truth")
+
+    # Validate required fields are not None
+    if query is None:
+        raise EvaluationException(
+            message="SimilarityEvaluator: 'query' is a required input and cannot be None.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.MISSING_FIELD,
+            target=ErrorTarget.SIMILARITY_EVALUATOR,
+        )
+
+    if response is None:
+        raise EvaluationException(
+            message="SimilarityEvaluator: 'response' is a required input and cannot be None.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.MISSING_FIELD,
+            target=ErrorTarget.SIMILARITY_EVALUATOR,
+        )
+
+    if ground_truth is None:
+        raise EvaluationException(
+            message="SimilarityEvaluator: 'ground_truth' is a required input and cannot be None.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.MISSING_FIELD,
+            target=ErrorTarget.SIMILARITY_EVALUATOR,
+        )
+
+    return super()._convert_kwargs_to_eval_input(**kwargs)
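A usage sketch of the new validation (model_config is assumed to be a valid model configuration; the exception type and message come from the code above, but the exact printed text may carry extra framing):

    from azure.ai.evaluation import SimilarityEvaluator

    evaluator = SimilarityEvaluator(model_config=model_config)
    try:
        # ground_truth is omitted, so the evaluator now fails fast with a clear error
        # instead of surfacing a downstream prompty formatting failure.
        evaluator(query="What is the capital of France?", response="Paris is the capital of France.")
    except Exception as exc:
        print(exc)  # mentions that 'ground_truth' is a required input and cannot be None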
@@ -218,7 +218,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
 }

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
-llm_output = prompty_output_dict["llm_output"]
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     flagged = llm_output.get("flagged", False)
@@ -230,6 +230,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
 return {
     f"{self._result_key}": score,
     f"{self._result_key}_result": score_result,
+    f"{self._result_key}_threshold": self._threshold,
     f"{self._result_key}_reason": reasoning,
     f"{self._result_key}_details": llm_output.get("details", ""),
     f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
@@ -167,11 +167,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 )
 eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
-if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+if "tool_definitions" in eval_input and eval_input["tool_definitions"]:
     eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     success_value = llm_output.get("success", False)
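The switch from "is not None" to a plain truthiness check also skips tool-definition reformatting when the list is present but empty. A small sketch with made-up values:

    def maybe_reformat(eval_input):
        # None and an empty list are both skipped now; only a non-empty value is reformatted.
        if "tool_definitions" in eval_input and eval_input["tool_definitions"]:
            return "reformat"
        return "skip"

    print(maybe_reformat({"tool_definitions": None}))                         # skip
    print(maybe_reformat({"tool_definitions": []}))                           # skip (previously: reformat)
    print(maybe_reformat({"tool_definitions": [{"name": "fetch_weather"}]}))  # reformat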
@@ -79,7 +79,7 @@ class _TaskNavigationEfficiencyEvaluator(EvaluatorBase):
     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
 ],
-ground_truth=["identify_tools_to_call", ""call_tool_A", "call_tool_B", "response_synthesis"]
+ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
 )

 # Example 2: Using tool names with parameters (exact parameter matching required)
@@ -223,7 +223,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

 # Single LLM call for all tool calls
 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 if isinstance(llm_output, dict):
     score = llm_output.get(self._LLM_SCORE_KEY, None)
     if not score or not check_score_is_valid(
@@ -180,7 +180,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
 eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", "")
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     success = llm_output.get("success", False)
@@ -178,7 +178,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

 # Call the LLM to evaluate
 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     result = llm_output.get("result", None)
@@ -196,7 +196,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", "")
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 if isinstance(llm_output, dict):
     output_label = llm_output.get("label", None)
     if output_label is None:
@@ -90,7 +90,7 @@ def __init__(self, model_config, *, threshold=1, credential=None, **kwargs):
     model_config=model_config,
     prompty_file=prompty_path,
     result_key=self._RESULT_KEY,
-    threshold=1,
+    threshold=threshold,
     credential=credential,
     **kwargs,
 )
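This fix means a caller-supplied threshold is now forwarded to the base class instead of being pinned to 1. Sketch only; the evaluator class name is hypothetical since the file name is not visible in this view, and model_config is assumed to be defined:

    evaluator = SomeLLMJudgeEvaluator(model_config=model_config, threshold=3)
    result = evaluator(query="...", response="...")
    # The "<result_key>_threshold" entry in the result now reflects 3 rather than the
    # hard-coded 1, and the pass/fail verdict is computed against the caller's threshold.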
@@ -198,7 +198,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

 # Call the LLM to evaluate
 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     score = llm_output.get("score", None)