112 commits
fa2f8aa
skeleton code
Sep 30, 2025
8dfde5b
add function that logs eval results to app insights
Oct 1, 2025
771e19d
format
Oct 1, 2025
7174bc4
log red team data
Oct 1, 2025
980e2fa
add eval result converter
YoYoJa Oct 3, 2025
57c73b8
Add result converter
YoYoJa Oct 6, 2025
1730b17
update converter params to optional
YoYoJa Oct 6, 2025
3bf93f7
add eval meta data
YoYoJa Oct 7, 2025
e47639d
merge converter change to include eval_meta_data
YoYoJa Oct 7, 2025
5b198b4
fix type
YoYoJa Oct 8, 2025
5fbbabe
remove useless file
YoYoJa Oct 8, 2025
6ca31a1
get eval meta data as input
YoYoJa Oct 8, 2025
ea93d1a
fix build errors
YoYoJa Oct 8, 2025
e6a9caa
remove useless import
YoYoJa Oct 8, 2025
f24f0e0
resolve comments
YoYoJa Oct 8, 2025
0abddb0
update
YoYoJa Oct 8, 2025
f25760c
get agent info from app insights config
Oct 8, 2025
08d3e36
merge with main
Oct 8, 2025
518b4af
update comments
YoYoJa Oct 8, 2025
74dfddc
merge otel
YoYoJa Oct 8, 2025
15d881e
merge result converter
YoYoJa Oct 8, 2025
5c44f70
fix checker failure
YoYoJa Oct 9, 2025
73fb9c7
add eval result converter (#43233)
YoYoJa Oct 9, 2025
46e0504
rename function
Oct 9, 2025
9cae8cf
Merge branch 'main' of https://github.com/Azure/azure-sdk-for-python …
Oct 9, 2025
17cdd1e
fix a thing
Oct 9, 2025
2ce023e
add error msg and error code
YoYoJa Oct 9, 2025
654e28f
merge main
YoYoJa Oct 9, 2025
32aad08
Surface evaluator error msg
YoYoJa Oct 10, 2025
d1449f5
surface out error
YoYoJa Oct 10, 2025
47d20e3
update
YoYoJa Oct 10, 2025
1f13f56
Jessli/convert (#43342)
YoYoJa Oct 10, 2025
5cee7e4
update UT
YoYoJa Oct 10, 2025
9256912
fix usage
YoYoJa Oct 10, 2025
0ff811b
fix usage
YoYoJa Oct 10, 2025
18b0db2
resolve conflict
YoYoJa Oct 10, 2025
6104b0c
Fix usage (#43355)
YoYoJa Oct 10, 2025
7858ee8
save
Oct 10, 2025
0916d62
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 10, 2025
e8942cb
add _type to evals/aoai graders
Oct 10, 2025
36b0761
merge type updat
YoYoJa Oct 10, 2025
59b0aab
make eval_meta_data optional
YoYoJa Oct 12, 2025
d4d768c
remove useless lines
YoYoJa Oct 12, 2025
920c964
Jessli/convert make eval_meta_data optional (#43376)
YoYoJa Oct 12, 2025
24e5cc1
add error logging for otel event emission
Oct 13, 2025
f60e574
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 13, 2025
58035f5
fix merge conflicts
Oct 13, 2025
4a5ffbf
add input/output tokens for prompty evals
Oct 13, 2025
7de4cd6
update param name to add underscore
YoYoJa Oct 14, 2025
cb88b43
merge remote
YoYoJa Oct 14, 2025
776224c
Jessli/convert - update param name to add underscore (#43411)
YoYoJa Oct 14, 2025
7e2969f
exclude token counts from aggregation
Oct 14, 2025
81ad396
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 14, 2025
f70e979
add total token count to prompty output
Oct 14, 2025
e6e0746
fix prompty tests
Oct 14, 2025
6c62dca
remove fields from app insights config
Oct 15, 2025
44079a8
make new evaluation result fields private, and add a toggle in evaluate
Oct 15, 2025
79b5f7b
change output fields to be private
Oct 15, 2025
28d5d17
parse updated annotation results
YoYoJa Oct 16, 2025
eab85ca
merge remote
YoYoJa Oct 16, 2025
479f1a0
Jessli/convert parse annotation and add trace_id (#43463)
YoYoJa Oct 16, 2025
74a39b9
update trace_id
YoYoJa Oct 16, 2025
31458fe
merge remote
YoYoJa Oct 16, 2025
61beb87
Jessli/convert add trace_id, response_id, conversation_id (#43469)
YoYoJa Oct 16, 2025
60b28dd
refactor app insights push to prevent warnings
Oct 16, 2025
bc9cb7c
run black on code
Oct 16, 2025
b92b2cd
fix merge conflicts
Oct 16, 2025
e210abd
move otel import to internal module
Oct 16, 2025
cb1cc34
expose sample data for sdk evaluators
YoYoJa Oct 16, 2025
189dd83
update
YoYoJa Oct 16, 2025
a9416a8
merge and resolve conflict
YoYoJa Oct 16, 2025
ad17aa1
Jessli/convert expose sample data for sdk promty based evaluators (#4…
YoYoJa Oct 16, 2025
3cc294c
update
YoYoJa Oct 17, 2025
26c8a53
update
YoYoJa Oct 17, 2025
ad66137
Jessli/convert remove token counts from metrics (#43477)
YoYoJa Oct 17, 2025
d2e40f9
fix UT
YoYoJa Oct 17, 2025
4a0a86c
remove print
YoYoJa Oct 17, 2025
2cb5c5f
Jessli/convert remove useless lines and fix UT (#43480)
YoYoJa Oct 17, 2025
aef2668
try changing prompty output to dict
Oct 17, 2025
d0323df
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 17, 2025
29d44b3
change prompty output to dict
Oct 17, 2025
4f05fb5
run black
Oct 17, 2025
3664e9a
merge remote
YoYoJa Oct 17, 2025
06e07f4
fix relevance and prompty test
Oct 17, 2025
de8a43d
fix unit tests
Oct 17, 2025
fa3ae5c
merge remote
YoYoJa Oct 17, 2025
e386fed
fix tests
YoYoJa Oct 17, 2025
b82956d
fix prompty tests
Oct 17, 2025
340b243
fix similarity test
Oct 17, 2025
8b87275
move groundedness to actual prompty impl
Oct 17, 2025
ffbbacb
chore: Update assets.json
kdestin Oct 17, 2025
023bff7
fix test
YoYoJa Oct 20, 2025
244b530
merge remote
YoYoJa Oct 20, 2025
eec6eb2
Jessli/convert Fix test failure (#43518)
YoYoJa Oct 20, 2025
7abf6c3
add extra attributes to app insights config, remove agent name/id/ver…
Oct 20, 2025
36e45f6
pin otel<1.39.0 since breaking change coming in that version
Oct 20, 2025
201af13
merge
Oct 20, 2025
ce3af31
merge main
YoYoJa Oct 20, 2025
8388003
implement scrubber for sensitive information
Oct 20, 2025
12cd578
run black formatter
Oct 20, 2025
c26a268
fix spelling for evaluation sdk
Oct 20, 2025
37acd37
use non-deprecated path for emitting traces
Oct 20, 2025
691fc3a
remove upper bound on otel sdk
Oct 20, 2025
6ac7f0e
shuffle imports
Oct 20, 2025
ef92791
merge master and fix bug
YoYoJa Oct 21, 2025
c40feed
merge remote
YoYoJa Oct 21, 2025
04912f3
merge main
YoYoJa Oct 21, 2025
76b1951
Jessli/convert (#43556) merge main
YoYoJa Oct 21, 2025
7450510
merge remote
YoYoJa Oct 21, 2025
fe2779d
fix bug
YoYoJa Oct 21, 2025
e856faa
Jessli/convert Fix bug (#43557)
YoYoJa Oct 21, 2025
06d3f87
merge remote
YoYoJa Oct 21, 2025
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_d7b00f22b8"
"Tag": "python/evaluation/azure-ai-evaluation_5bef6dc713"
}
@@ -43,6 +43,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/label_grader"
_type = "label_model"

def __init__(
self,
@@ -62,6 +63,6 @@ def __init__(
model=model,
name=name,
passing_labels=passing_labels,
type="label_model",
type=AzureOpenAILabelGrader._type,
)
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
@@ -54,6 +54,7 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/python_grader"
_type = "python"

def __init__(
self,
@@ -79,7 +80,7 @@ def __init__(
image_tag=image_tag,
pass_threshold=pass_threshold,
source=source,
type="python",
type=AzureOpenAIPythonGrader._type,
)

super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
@@ -49,6 +49,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/score_model_grader"
_type = "score_model"

def __init__(
self,
@@ -80,7 +81,7 @@ def __init__(
self.pass_threshold = pass_threshold

# Create OpenAI ScoreModelGrader instance
grader_kwargs = {"input": input, "model": model, "name": name, "type": "score_model"}
grader_kwargs = {"input": input, "model": model, "name": name, "type": AzureOpenAIScoreModelGrader._type}

if range is not None:
grader_kwargs["range"] = range
@@ -38,6 +38,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
_type = "string_check"

def __init__(
self,
@@ -60,6 +61,6 @@
name=name,
operation=operation,
reference=reference,
type="string_check",
type=AzureOpenAIStringCheckGrader._type,
)
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
@@ -43,6 +43,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
_type = "text_similarity"

def __init__(
self,
@@ -74,6 +75,6 @@ def __init__(
pass_threshold=pass_threshold,
name=name,
reference=reference,
type="text_similarity",
type=AzureOpenAITextSimilarityGrader._type,
)
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
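Note: the five grader diffs above follow one pattern — the hard-coded grader type string moves into a class-level _type attribute, and __init__ references it instead of repeating the literal. A minimal sketch of how that attribute could be consumed (hypothetical usage, not part of this PR; assumes the grader classes remain importable from the azure.ai.evaluation package root):

from azure.ai.evaluation import (
    AzureOpenAILabelGrader,
    AzureOpenAIPythonGrader,
    AzureOpenAIScoreModelGrader,
    AzureOpenAIStringCheckGrader,
    AzureOpenAITextSimilarityGrader,
)

# Build a lookup from the stable grader id to its wire-format type, e.g.
# "azureai://built-in/evaluators/azure-openai/label_grader" -> "label_model",
# without instantiating any grader.
_GRADER_ID_TO_TYPE = {
    cls.id: cls._type
    for cls in (
        AzureOpenAILabelGrader,
        AzureOpenAIPythonGrader,
        AzureOpenAIScoreModelGrader,
        AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader,
    )
}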
@@ -411,6 +411,25 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result[pm_metric_name + "_reason"] = (
parsed_response["reasoning"] if "reasoning" in parsed_response else ""
)
result[pm_metric_name + "_total_tokens"] = (
parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
)
result[pm_metric_name + "_prompt_tokens"] = (
parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
)
result[pm_metric_name + "_completion_tokens"] = (
parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
)
result[pm_metric_name + "_finish_reason"] = (
parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
)
result[pm_metric_name + "_sample_input"] = (
parsed_response["sample_input"] if "sample_input" in parsed_response else ""
)
result[pm_metric_name + "_sample_output"] = (
parsed_response["sample_output"] if "sample_output" in parsed_response else ""
)
result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
return result
if metric_name not in batch_response[0]:
return {}
@@ -442,9 +461,39 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
# Add all attributes under the details.
details = {}
for key, value in parsed_response.items():
if key not in {"label", "reasoning", "version"}:
if key not in {
"label",
"reasoning",
"version",
"totalTokenCount",
"inputTokenCount",
"outputTokenCount",
"finish_reason",
"sample_input",
"sample_output",
"model",
}:
details[key.replace("-", "_")] = value
result[metric_display_name + "_details"] = details
result[metric_display_name + "_total_tokens"] = (
parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
)
result[metric_display_name + "_prompt_tokens"] = (
parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
)
result[metric_display_name + "_completion_tokens"] = (
parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
)
result[metric_display_name + "_finish_reason"] = (
parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
)
result[metric_display_name + "_sample_input"] = (
parsed_response["sample_input"] if "sample_input" in parsed_response else ""
)
result[metric_display_name + "_sample_output"] = (
parsed_response["sample_output"] if "sample_output" in parsed_response else ""
)
result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
return result
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)

@@ -484,6 +533,13 @@ def _parse_content_harm_response(
except Exception: # pylint: disable=broad-exception-caught
harm_response = response[metric_name]

total_tokens = 0
prompt_tokens = 0
completion_tokens = 0
finish_reason = ""
sample_input = ""
sample_output = ""
model = ""
if harm_response != "" and isinstance(harm_response, dict):
# check if "output" is one key in harm_response
if "output" in harm_response:
@@ -511,6 +567,44 @@
reason = harm_response["reason"]
else:
reason = ""

# get token_usage
if "totalTokenCount" in harm_response:
total_tokens = harm_response["totalTokenCount"]
else:
total_tokens = 0
if "inputTokenCount" in harm_response:
prompt_tokens = harm_response["inputTokenCount"]
else:
prompt_tokens = 0
if "outputTokenCount" in harm_response:
completion_tokens = harm_response["outputTokenCount"]
else:
completion_tokens = 0

# get finish_reason
if "finish_reason" in harm_response:
finish_reason = harm_response["finish_reason"]
else:
finish_reason = ""

# get sample_input
if "sample_input" in harm_response:
sample_input = harm_response["sample_input"]
else:
sample_input = ""

# get sample_output
if "sample_output" in harm_response:
sample_output = harm_response["sample_output"]
else:
sample_output = ""

# get model
if "model" in harm_response:
model = harm_response["model"]
else:
model = ""
elif harm_response != "" and isinstance(harm_response, str):
metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
if metric_value_match:
@@ -537,6 +631,13 @@
result[key] = get_harm_severity_level(harm_score)
result[key + "_score"] = harm_score
result[key + "_reason"] = reason
result[key + "_total_tokens"] = total_tokens
result[key + "_prompt_tokens"] = prompt_tokens
result[key + "_completion_tokens"] = completion_tokens
result[key + "_finish_reason"] = finish_reason
result[key + "_sample_input"] = sample_input
result[key + "_sample_output"] = sample_output
result[key + "_model"] = model

return result

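Taken together, the changes in this file attach token usage, finish reason, sample input/output, and the annotation model to every parsed metric, for both the regular and the content-harm code paths. An illustrative sketch of the resulting dictionary shape for a single metric (keys taken from the diff above; all values invented):

# Illustrative only: example of what parse_response could return for the
# "violence" metric after these changes (all values invented).
parsed_result = {
    "violence": "Very low",
    "violence_score": 0,
    "violence_reason": "The response contains no violent content.",
    "violence_total_tokens": 1325,
    "violence_prompt_tokens": 1200,
    "violence_completion_tokens": 125,
    "violence_finish_reason": "stop",
    "violence_sample_input": "...",   # request payload sample, if returned by the service
    "violence_sample_output": "...",  # model output sample, if returned by the service
    "violence_model": "example-annotation-model",
}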
@@ -90,6 +90,88 @@ class TokenScope(str, enum.Enum):
AZURE_ML = "https://ml.azure.com/.default"


class _EvaluatorMetricMapping:
"""
Static mapping of evaluator names to their metric names, based on assets.json.
The 'builtin.' prefix is removed from the evaluator name keys.
"""

EVALUATOR_NAME_METRICS_MAPPINGS = {
"bleu_score": ["bleu"],
"coherence": ["coherence"],
"document_retrieval": [
"ndcg@3",
"xdcg@3",
"fidelity",
"top1_relevance",
"top3_max_relevance",
"holes",
"holes_ratio",
"total_retrieved_documents",
"total_ground_truth_documents",
],
"f1_score": ["f1_score"],
"fluency": ["fluency"],
"gleu_score": ["gleu"],
"meteor_score": ["meteor"],
"relevance": ["relevance"],
"response_completeness": ["response_completeness"],
"rouge_score": ["rouge_precision", "rouge_recall", "rouge_f1_score"],
"groundedness_pro": ["groundedness_pro"],
"similarity": ["similarity"],
"intent_resolution": ["intent_resolution"],
"retrieval": ["retrieval"],
"task_adherence": ["task_adherence"],
"tool_call_accuracy": ["tool_call_accuracy"],
"groundedness": ["groundedness"],
"code_vulnerability": ["code_vulnerability"],
"eci": ["eci"],
"protected_material": ["protected_material"],
"ungrounded_attributes": ["ungrounded_attributes"],
"indirect_attack": ["xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"],
"label_grader": ["label_model"],
"string_check_grader": ["string_check"],
"text_similarity_grader": ["similarity"],
"score_model_grader": ["score_model"],
"sexual": ["sexual"],
"self_harm": ["self_harm"],
"violence": ["violence"],
"hate_unfairness": ["hate_unfairness"],
}

EVAL_CLASS_NAME_MAP = {
"BleuScoreEvaluator": "bleu_score",
"CodeVulnerabilityEvaluator": "code_vulnerability",
"CoherenceEvaluator": "coherence",
"ContentSafetyEvaluator": "content_safety",
"DocumentRetrievalEvaluator": "document_retrieval",
"ECIEvaluator": "eci",
"F1ScoreEvaluator": "f1_score",
"FluencyEvaluator": "fluency",
"GleuScoreEvaluator": "gleu_score",
"GroundednessEvaluator": "groundedness",
"GroundednessProEvaluator": "groundedness_pro",
"HateUnfairnessEvaluator": "hate_unfairness",
"IndirectAttackEvaluator": "indirect_attack",
"IntentResolutionEvaluator": "intent_resolution",
"MeteorScoreEvaluator": "meteor_score",
"ProtectedMaterialEvaluator": "protected_material",
"QAEvaluator": "qa",
"RelevanceEvaluator": "relevance",
"ResponseCompletenessEvaluator": "response_completeness",
"RetrievalEvaluator": "retrieval",
"RougeScoreEvaluator": "rouge_score",
"SelfHarmEvaluator": "self_harm",
"SexualEvaluator": "sexual",
"SimilarityEvaluator": "similarity",
"TaskAdherenceEvaluator": "task_adherence",
"TaskCompletionEvaluator": "task_completion",
"ToolCallAccuracyEvaluator": "tool_call_accuracy",
"UngroundedAttributesEvaluator": "ungrounded_attributes",
"ViolenceEvaluator": "violence",
}


DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -116,3 +198,6 @@ class TokenScope(str, enum.Enum):
AOAI_COLUMN_NAME = "aoai"
DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.

# OpenTelemetry event names
EVALUATION_EVENT_NAME = "gen_ai.evaluation.result"
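A short usage sketch for the new mapping (hypothetical, not part of this PR; assumes _EvaluatorMetricMapping lives in the package's private _constants module alongside TokenScope):

from azure.ai.evaluation._constants import _EvaluatorMetricMapping

def metrics_for(evaluator_class_name):
    """Return the metric names a built-in evaluator is expected to emit, or []."""
    eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name)
    if eval_name is None:
        return []
    return _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])

metrics_for("RougeScoreEvaluator")      # -> ["rouge_precision", "rouge_recall", "rouge_f1_score"]
metrics_for("ContentSafetyEvaluator")   # -> [] (no entry in EVALUATOR_NAME_METRICS_MAPPINGS)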
@@ -159,6 +159,16 @@ def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
"completed_lines": total_lines - failed_lines,
"failed_lines": failed_lines,
"log_path": None,
"error_message": (
f"({run.result.error.blame.value}) {run.result.error.message}"
if run.result and run.result.error and run.result.error.blame
else None
),
"error_code": (
f"{run.result.error.category.value}"
if run.result and run.result.error and run.result.error.category
else None
),
}

@staticmethod
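With the two new keys, the run summary surfaces the blame and category of a failed batch run's error next to the line counts. Illustrative shape only (values invented; the actual blame and category strings depend on the batch engine's error model):

# Invented example of a get_run_summary result after this change.
summary = {
    "total_lines": 10,
    "completed_lines": 8,
    "failed_lines": 2,
    "log_path": None,
    "error_message": "(UserError) Evaluator raised an exception while processing the input.",
    "error_code": "FAILED_EXECUTION",
}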