@@ -5,7 +5,11 @@
 from typing import Dict, List, Optional, Union, Any, Tuple

 from typing_extensions import overload, override
-from azure.ai.evaluation._legacy.prompty import AsyncPrompty
+
+if os.getenv("AI_EVALS_USE_PF_PROMPTY", "false").lower() == "true":
+    from promptflow.core._flow import AsyncPrompty
+else:
+    from azure.ai.evaluation._legacy.prompty import AsyncPrompty

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._evaluators._common._validators import ConversationValidator, ValidatorInterface
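For orientation, the new import switch can be exercised as below. This is only a sketch: the variable name and its "false" default come from the diff, the SimilarityEvaluator import is an arbitrary example of a consumer, and the environment variable has to be set before the evaluator module is first imported.

    import os

    # Opt in to the promptflow-backed AsyncPrompty; when unset or "false" (the default),
    # the in-package azure.ai.evaluation._legacy.prompty implementation is used instead.
    os.environ["AI_EVALS_USE_PF_PROMPTY"] = "true"

    from azure.ai.evaluation import SimilarityEvaluator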
@@ -169,8 +169,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict["llm_output"]
-# llm_output should always be a dictionary because the response_format of prompty is set to json_object, but checking anyway
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 score = math.nan
 if isinstance(llm_output, dict):
     score = llm_output.get("score", math.nan)
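The .get("llm_output", prompty_output_dict) fallback is worth spelling out. A minimal sketch with made-up payload shapes: if the flow result wraps the parsed JSON under "llm_output" the wrapped value is used, and if it returns the parsed JSON directly the whole dict is used, so the downstream isinstance check still finds the score either way.

    import math

    wrapped = {"llm_output": {"score": 4, "explanation": "..."}, "input_token_count": 120}
    bare = {"score": 4, "explanation": "..."}

    for prompty_output_dict in (wrapped, bare):
        # Fall back to the entire result when the wrapper key is absent.
        llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
        score = llm_output.get("score", math.nan) if isinstance(llm_output, dict) else math.nan
        print(score)  # 4 in both cases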
@@ -197,7 +197,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 if not isinstance(eval_input["response"], str):
     eval_input["response"] = reformat_agent_response(eval_input["response"], logger)
 result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = result.get("llm_output")
+llm_output = result.get("llm_output", result)
 score = math.nan

 if isinstance(llm_output, dict):
@@ -162,7 +162,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 )

 result = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = result.get("llm_output") if isinstance(result, dict) else result
+llm_output = result.get("llm_output", result) if isinstance(result, dict) else result

 score = math.nan
 llm_output_is_dict = isinstance(llm_output, dict)
@@ -176,19 +176,27 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

 binary_result = self._get_binary_result(score)

+input_token_count = result.get("input_token_count", 0) if isinstance(result, dict) else 0
+output_token_count = result.get("output_token_count", 0) if isinstance(result, dict) else 0
+total_token_count = result.get("total_token_count", 0) if isinstance(result, dict) else 0
+finish_reason = result.get("finish_reason", "") if isinstance(result, dict) else ""
+model_id = result.get("model_id", "") if isinstance(result, dict) else ""
+sample_input = result.get("sample_input", "") if isinstance(result, dict) else ""
+sample_output = result.get("sample_output", "") if isinstance(result, dict) else ""
+
 # updating the result key and threshold to int based on the schema
 return {
     f"{self._result_key}": int(score),
     f"{self._result_key}_result": binary_result,
     f"{self._result_key}_threshold": int(self._threshold),
     f"{self._result_key}_reason": reason,
-    f"{self._result_key}_prompt_tokens": result.get("input_token_count", 0),
-    f"{self._result_key}_completion_tokens": result.get("output_token_count", 0),
-    f"{self._result_key}_total_tokens": result.get("total_token_count", 0),
-    f"{self._result_key}_finish_reason": result.get("finish_reason", ""),
-    f"{self._result_key}_model": result.get("model_id", ""),
-    f"{self._result_key}_sample_input": result.get("sample_input", ""),
-    f"{self._result_key}_sample_output": result.get("sample_output", ""),
+    f"{self._result_key}_prompt_tokens": input_token_count,
+    f"{self._result_key}_completion_tokens": output_token_count,
+    f"{self._result_key}_total_tokens": total_token_count,
+    f"{self._result_key}_finish_reason": finish_reason,
+    f"{self._result_key}_model": model_id,
+    f"{self._result_key}_sample_input": sample_input,
+    f"{self._result_key}_sample_output": sample_output,
 }

 if logger:
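Hoisting the token counts and metadata into isinstance-guarded locals keeps the returned payload identical; only the lookups are hardened against a non-dict result. A sketch of the resulting shape, with an assumed result key of "relevance", made-up values, and "pass"/"fail" assumed as the binary result vocabulary:

    {
        "relevance": 4,
        "relevance_result": "pass",
        "relevance_threshold": 3,
        "relevance_reason": "The response addresses the query directly.",
        "relevance_prompt_tokens": 120,
        "relevance_completion_tokens": 35,
        "relevance_total_tokens": 155,
        "relevance_finish_reason": "stop",
        "relevance_model": "gpt-4o",
        "relevance_sample_input": "...",
        "relevance_sample_output": "...",
    }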
@@ -8,6 +8,7 @@
 from typing_extensions import overload, override

 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
+from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget


 class SimilarityEvaluator(PromptyEvaluatorBase):
@@ -134,3 +135,37 @@ def __call__( # pylint: disable=docstring-missing-param
 :rtype: Dict[str, float]
 """
 return super().__call__(*args, **kwargs)
+
+@override
+def _convert_kwargs_to_eval_input(self, **kwargs):
+    """Convert keyword arguments to evaluation input, with validation."""
+    query = kwargs.get("query")
+    response = kwargs.get("response")
+    ground_truth = kwargs.get("ground_truth")
+
+    # Validate required fields are not None
+    if query is None:
+        raise EvaluationException(
+            message="SimilarityEvaluator: 'query' is a required input and cannot be None.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.MISSING_FIELD,
+            target=ErrorTarget.SIMILARITY_EVALUATOR,
+        )
+
+    if response is None:
+        raise EvaluationException(
+            message="SimilarityEvaluator: 'response' is a required input and cannot be None.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.MISSING_FIELD,
+            target=ErrorTarget.SIMILARITY_EVALUATOR,
+        )
+
+    if ground_truth is None:
+        raise EvaluationException(
+            message="SimilarityEvaluator: 'ground_truth' is a required input and cannot be None.",
+            blame=ErrorBlame.USER_ERROR,
+            category=ErrorCategory.MISSING_FIELD,
+            target=ErrorTarget.SIMILARITY_EVALUATOR,
+        )
+
+    return super()._convert_kwargs_to_eval_input(**kwargs)
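A usage sketch of the new validation (model_config is assumed to be a valid model configuration; the exception type and message come from the code above, but the exact printed text may carry extra framing):

    from azure.ai.evaluation import SimilarityEvaluator

    evaluator = SimilarityEvaluator(model_config=model_config)
    try:
        # ground_truth is omitted, so the evaluator now fails fast with a clear error
        # instead of surfacing a downstream prompty formatting failure.
        evaluator(query="What is the capital of France?", response="Paris is the capital of France.")
    except Exception as exc:
        print(exc)  # mentions that 'ground_truth' is a required input and cannot be None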
@@ -218,7 +218,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
 }

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **prompty_input)
-llm_output = prompty_output_dict["llm_output"]
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     flagged = llm_output.get("flagged", False)
@@ -230,6 +230,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, bool]]
 return {
     f"{self._result_key}": score,
     f"{self._result_key}_result": score_result,
+    f"{self._result_key}_threshold": self._threshold,
     f"{self._result_key}_reason": reasoning,
     f"{self._result_key}_details": llm_output.get("details", ""),
     f"{self._result_key}_prompt_tokens": prompty_output_dict.get("input_token_count", 0),
@@ -167,11 +167,11 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 )
 eval_input["query"] = reformat_conversation_history(eval_input["query"], logger, include_system_messages=True)
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)
-if "tool_definitions" in eval_input and eval_input["tool_definitions"] is not None:
+if "tool_definitions" in eval_input and eval_input["tool_definitions"]:
     eval_input["tool_definitions"] = reformat_tool_definitions(eval_input["tool_definitions"], logger)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     success_value = llm_output.get("success", False)
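The switch from "is not None" to a plain truthiness check also skips tool-definition reformatting when the list is present but empty. A small sketch with made-up values:

    def maybe_reformat(eval_input):
        # None and an empty list are both skipped now; only a non-empty value is reformatted.
        if "tool_definitions" in eval_input and eval_input["tool_definitions"]:
            return "reformat"
        return "skip"

    print(maybe_reformat({"tool_definitions": None}))                         # skip
    print(maybe_reformat({"tool_definitions": []}))                           # skip (previously: reformat)
    print(maybe_reformat({"tool_definitions": [{"name": "fetch_weather"}]}))  # reformat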
@@ -79,7 +79,7 @@ class _TaskNavigationEfficiencyEvaluator(EvaluatorBase):
     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_3", "name": "call_tool_B", "arguments": {}}]},
     {"role": "assistant", "content": [{"type": "tool_call", "tool_call_id": "call_4", "name": "response_synthesis", "arguments": {}}]}
 ],
-ground_truth=["identify_tools_to_call", ""call_tool_A", "call_tool_B", "response_synthesis"]
+ground_truth=["identify_tools_to_call", "call_tool_A", "call_tool_B", "response_synthesis"]
 )

 # Example 2: Using tool names with parameters (exact parameter matching required)
@@ -223,7 +223,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t

 # Single LLM call for all tool calls
 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 if isinstance(llm_output, dict):
     score = llm_output.get(self._LLM_SCORE_KEY, None)
     if not score or not check_score_is_valid(
@@ -180,7 +180,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]: # t
 eval_input["tool_definitions"] = _reformat_tool_definitions(filtered_tool_definitions, logger)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", "")
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     success = llm_output.get("success", False)
@@ -178,7 +178,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

 # Call the LLM to evaluate
 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     result = llm_output.get("result", None)
@@ -196,7 +196,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
 eval_input["response"] = reformat_agent_response(eval_input["response"], logger, include_tool_messages=True)

 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", "")
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)
 if isinstance(llm_output, dict):
     output_label = llm_output.get("label", None)
     if output_label is None:
@@ -90,7 +90,7 @@ def __init__(self, model_config, *, threshold=1, credential=None, **kwargs):
     model_config=model_config,
     prompty_file=prompty_path,
     result_key=self._RESULT_KEY,
-    threshold=1,
+    threshold=threshold,
     credential=credential,
     **kwargs,
 )
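This fix means a caller-supplied threshold is now forwarded to the base class instead of being pinned to 1. Sketch only; the evaluator class name is hypothetical since the file name is not visible in this view, and model_config is assumed to be defined:

    evaluator = SomeLLMJudgeEvaluator(model_config=model_config, threshold=3)
    result = evaluator(query="...", response="...")
    # The "<result_key>_threshold" entry in the result now reflects 3 rather than the
    # hard-coded 1, and the pass/fail verdict is computed against the caller's threshold.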
@@ -198,7 +198,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:

 # Call the LLM to evaluate
 prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
-llm_output = prompty_output_dict.get("llm_output", {})
+llm_output = prompty_output_dict.get("llm_output", prompty_output_dict)

 if isinstance(llm_output, dict):
     score = llm_output.get("score", None)