112 commits
fa2f8aa
skeleton code
Sep 30, 2025
8dfde5b
add function that logs eval results to app insights
Oct 1, 2025
771e19d
format
Oct 1, 2025
7174bc4
log red team data
Oct 1, 2025
980e2fa
add eval result converter
YoYoJa Oct 3, 2025
57c73b8
Add result converter
YoYoJa Oct 6, 2025
1730b17
update converter params to optional
YoYoJa Oct 6, 2025
3bf93f7
add eval meta data
YoYoJa Oct 7, 2025
e47639d
merge converter change to include eval_meta_data
YoYoJa Oct 7, 2025
5b198b4
fix type
YoYoJa Oct 8, 2025
5fbbabe
remove useless file
YoYoJa Oct 8, 2025
6ca31a1
get eval meta data as input
YoYoJa Oct 8, 2025
ea93d1a
fix build errors
YoYoJa Oct 8, 2025
e6a9caa
remove useless import
YoYoJa Oct 8, 2025
f24f0e0
resolve comments
YoYoJa Oct 8, 2025
0abddb0
update
YoYoJa Oct 8, 2025
f25760c
get agent info from app insights config
Oct 8, 2025
08d3e36
merge with main
Oct 8, 2025
518b4af
update comments
YoYoJa Oct 8, 2025
74dfddc
merge otel
YoYoJa Oct 8, 2025
15d881e
merge result converter
YoYoJa Oct 8, 2025
5c44f70
fix checker failure
YoYoJa Oct 9, 2025
73fb9c7
add eval result converter (#43233)
YoYoJa Oct 9, 2025
46e0504
rename function
Oct 9, 2025
9cae8cf
Merge branch 'main' of https://github.com/Azure/azure-sdk-for-python …
Oct 9, 2025
17cdd1e
fix a thing
Oct 9, 2025
2ce023e
add error msg and error code
YoYoJa Oct 9, 2025
654e28f
merge main
YoYoJa Oct 9, 2025
32aad08
Surface evaluator error msg
YoYoJa Oct 10, 2025
d1449f5
surface out error
YoYoJa Oct 10, 2025
47d20e3
update
YoYoJa Oct 10, 2025
1f13f56
Jessli/convert (#43342)
YoYoJa Oct 10, 2025
5cee7e4
update UT
YoYoJa Oct 10, 2025
9256912
fix usage
YoYoJa Oct 10, 2025
0ff811b
fix usage
YoYoJa Oct 10, 2025
18b0db2
resolve conflict
YoYoJa Oct 10, 2025
6104b0c
Fix usage (#43355)
YoYoJa Oct 10, 2025
7858ee8
save
Oct 10, 2025
0916d62
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 10, 2025
e8942cb
add _type to evals/aoai graders
Oct 10, 2025
36b0761
merge type updat
YoYoJa Oct 10, 2025
59b0aab
make eval_meta_data optional
YoYoJa Oct 12, 2025
d4d768c
remove useless lines
YoYoJa Oct 12, 2025
920c964
Jessli/convert make eval_meta_data optional (#43376)
YoYoJa Oct 12, 2025
24e5cc1
add error logging for otel event emission
Oct 13, 2025
f60e574
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 13, 2025
58035f5
fix merge conflicts
Oct 13, 2025
4a5ffbf
add input/output tokens for prompty evals
Oct 13, 2025
7de4cd6
update param name to add underscore
YoYoJa Oct 14, 2025
cb88b43
merge remote
YoYoJa Oct 14, 2025
776224c
Jessli/convert - update param name to add underscore (#43411)
YoYoJa Oct 14, 2025
7e2969f
exclude token counts from aggregation
Oct 14, 2025
81ad396
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 14, 2025
f70e979
add total token count to prompty output
Oct 14, 2025
e6e0746
fix prompty tests
Oct 14, 2025
6c62dca
remove fields from app insights config
Oct 15, 2025
44079a8
make new evaluation result fields private, and add a toggle in evaluate
Oct 15, 2025
79b5f7b
change output fields to be private
Oct 15, 2025
28d5d17
parse updated annotation results
YoYoJa Oct 16, 2025
eab85ca
merge remote
YoYoJa Oct 16, 2025
479f1a0
Jessli/convert parse annotation and add trace_id (#43463)
YoYoJa Oct 16, 2025
74a39b9
update trace_id
YoYoJa Oct 16, 2025
31458fe
merge remote
YoYoJa Oct 16, 2025
61beb87
Jessli/convert add trace_id, response_id, conversation_id (#43469)
YoYoJa Oct 16, 2025
60b28dd
refactor app insights push to prevent warnings
Oct 16, 2025
bc9cb7c
run black on code
Oct 16, 2025
b92b2cd
fix merge conflicts
Oct 16, 2025
e210abd
move otel import to internal module
Oct 16, 2025
cb1cc34
expose sample data for sdk evaluators
YoYoJa Oct 16, 2025
189dd83
update
YoYoJa Oct 16, 2025
a9416a8
merge and resolve conflict
YoYoJa Oct 16, 2025
ad17aa1
Jessli/convert expose sample data for sdk promty based evaluators (#4…
YoYoJa Oct 16, 2025
3cc294c
update
YoYoJa Oct 17, 2025
26c8a53
update
YoYoJa Oct 17, 2025
ad66137
Jessli/convert remove token counts from metrics (#43477)
YoYoJa Oct 17, 2025
d2e40f9
fix UT
YoYoJa Oct 17, 2025
4a0a86c
remove print
YoYoJa Oct 17, 2025
2cb5c5f
Jessli/convert remove useless lines and fix UT (#43480)
YoYoJa Oct 17, 2025
aef2668
try changing prompty output to dict
Oct 17, 2025
d0323df
Merge branch 'needuv/structured-results-otel-logging' of https://gith…
Oct 17, 2025
29d44b3
change prompty output to dict
Oct 17, 2025
4f05fb5
run black
Oct 17, 2025
3664e9a
merge remote
YoYoJa Oct 17, 2025
06e07f4
fix relevance and prompty test
Oct 17, 2025
de8a43d
fix unit tests
Oct 17, 2025
fa3ae5c
merge remote
YoYoJa Oct 17, 2025
e386fed
fix tests
YoYoJa Oct 17, 2025
b82956d
fix prompty tests
Oct 17, 2025
340b243
fix similarity test
Oct 17, 2025
8b87275
move groundedness to actual prompty impl
Oct 17, 2025
ffbbacb
chore: Update assets.json
kdestin Oct 17, 2025
023bff7
fix test
YoYoJa Oct 20, 2025
244b530
merge remote
YoYoJa Oct 20, 2025
eec6eb2
Jessli/convert Fix test failure (#43518)
YoYoJa Oct 20, 2025
7abf6c3
add extra attributes to app insights config, remove agent name/id/ver…
Oct 20, 2025
36e45f6
pin otel<1.39.0 since breaking change coming in that version
Oct 20, 2025
201af13
merge
Oct 20, 2025
ce3af31
merge main
YoYoJa Oct 20, 2025
8388003
implement scrubber for sensitive information
Oct 20, 2025
12cd578
run black formatter
Oct 20, 2025
c26a268
fix spelling for evaluation sdk
Oct 20, 2025
37acd37
use non-deprecated path for emitting traces
Oct 20, 2025
691fc3a
remove upper bound on otel sdk
Oct 20, 2025
6ac7f0e
shuffle imports
Oct 20, 2025
ef92791
merge master and fix bug
YoYoJa Oct 21, 2025
c40feed
merge remote
YoYoJa Oct 21, 2025
04912f3
merge main
YoYoJa Oct 21, 2025
76b1951
Jessli/convert (#43556) merge main
YoYoJa Oct 21, 2025
7450510
merge remote
YoYoJa Oct 21, 2025
fe2779d
fix bug
YoYoJa Oct 21, 2025
e856faa
Jessli/convert Fix bug (#43557)
YoYoJa Oct 21, 2025
06d3f87
merge remote
YoYoJa Oct 21, 2025
2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_d7b00f22b8"
"Tag": "python/evaluation/azure-ai-evaluation_5bef6dc713"
}
@@ -43,6 +43,7 @@ class AzureOpenAILabelGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/label_grader"
_type = "label_model"

def __init__(
self,
@@ -62,6 +63,6 @@ def __init__(
model=model,
name=name,
passing_labels=passing_labels,
type="label_model",
type=AzureOpenAILabelGrader._type,
)
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
@@ -54,6 +54,7 @@ class AzureOpenAIPythonGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/python_grader"
_type = "python"

def __init__(
self,
@@ -79,7 +80,7 @@ def __init__(
image_tag=image_tag,
pass_threshold=pass_threshold,
source=source,
type="python",
type=AzureOpenAIPythonGrader._type,
)

super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
@@ -49,6 +49,7 @@ class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/score_model_grader"
_type = "score_model"

def __init__(
self,
@@ -80,7 +81,7 @@ def __init__(
self.pass_threshold = pass_threshold

# Create OpenAI ScoreModelGrader instance
grader_kwargs = {"input": input, "model": model, "name": name, "type": "score_model"}
grader_kwargs = {"input": input, "model": model, "name": name, "type": AzureOpenAIScoreModelGrader._type}

if range is not None:
grader_kwargs["range"] = range
@@ -38,6 +38,7 @@ class AzureOpenAIStringCheckGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/string_check_grader"
_type = "string_check"

def __init__(
self,
@@ -60,6 +61,6 @@
name=name,
operation=operation,
reference=reference,
type="string_check",
type=AzureOpenAIStringCheckGrader._type,
)
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
@@ -43,6 +43,7 @@ class AzureOpenAITextSimilarityGrader(AzureOpenAIGrader):
"""

id = "azureai://built-in/evaluators/azure-openai/text_similarity_grader"
_type = "text_similarity"

def __init__(
self,
@@ -74,6 +75,6 @@ def __init__(
pass_threshold=pass_threshold,
name=name,
reference=reference,
type="text_similarity",
type=AzureOpenAITextSimilarityGrader._type,
)
super().__init__(model_config=model_config, grader_config=grader, credential=credential, **kwargs)
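Note: the five grader diffs above follow one pattern — the hard-coded grader type string moves into a class-level _type attribute, and __init__ references it instead of repeating the literal. A minimal sketch of how that attribute could be consumed (hypothetical usage, not part of this PR; assumes the grader classes remain importable from the azure.ai.evaluation package root):

from azure.ai.evaluation import (
    AzureOpenAILabelGrader,
    AzureOpenAIPythonGrader,
    AzureOpenAIScoreModelGrader,
    AzureOpenAIStringCheckGrader,
    AzureOpenAITextSimilarityGrader,
)

# Build a lookup from the stable grader id to its wire-format type, e.g.
# "azureai://built-in/evaluators/azure-openai/label_grader" -> "label_model",
# without instantiating any grader.
_GRADER_ID_TO_TYPE = {
    cls.id: cls._type
    for cls in (
        AzureOpenAILabelGrader,
        AzureOpenAIPythonGrader,
        AzureOpenAIScoreModelGrader,
        AzureOpenAIStringCheckGrader,
        AzureOpenAITextSimilarityGrader,
    )
}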
@@ -411,6 +411,25 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result[pm_metric_name + "_reason"] = (
parsed_response["reasoning"] if "reasoning" in parsed_response else ""
)
result[pm_metric_name + "_total_tokens"] = (
parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
)
result[pm_metric_name + "_prompt_tokens"] = (
parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
)
result[pm_metric_name + "_completion_tokens"] = (
parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
)
result[pm_metric_name + "_finish_reason"] = (
parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
)
result[pm_metric_name + "_sample_input"] = (
parsed_response["sample_input"] if "sample_input" in parsed_response else ""
)
result[pm_metric_name + "_sample_output"] = (
parsed_response["sample_output"] if "sample_output" in parsed_response else ""
)
result[pm_metric_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
return result
if metric_name not in batch_response[0]:
return {}
@@ -442,9 +461,39 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
# Add all attributes under the details.
details = {}
for key, value in parsed_response.items():
if key not in {"label", "reasoning", "version"}:
if key not in {
"label",
"reasoning",
"version",
"totalTokenCount",
"inputTokenCount",
"outputTokenCount",
"finish_reason",
"sample_input",
"sample_output",
"model",
}:
details[key.replace("-", "_")] = value
result[metric_display_name + "_details"] = details
result[metric_display_name + "_total_tokens"] = (
parsed_response["totalTokenCount"] if "totalTokenCount" in parsed_response else ""
)
result[metric_display_name + "_prompt_tokens"] = (
parsed_response["inputTokenCount"] if "inputTokenCount" in parsed_response else ""
)
result[metric_display_name + "_completion_tokens"] = (
parsed_response["outputTokenCount"] if "outputTokenCount" in parsed_response else ""
)
result[metric_display_name + "_finish_reason"] = (
parsed_response["finish_reason"] if "finish_reason" in parsed_response else ""
)
result[metric_display_name + "_sample_input"] = (
parsed_response["sample_input"] if "sample_input" in parsed_response else ""
)
result[metric_display_name + "_sample_output"] = (
parsed_response["sample_output"] if "sample_output" in parsed_response else ""
)
result[metric_display_name + "_model"] = parsed_response["model"] if "model" in parsed_response else ""
return result
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)

@@ -484,6 +533,13 @@ def _parse_content_harm_response(
except Exception: # pylint: disable=broad-exception-caught
harm_response = response[metric_name]

total_tokens = 0
prompt_tokens = 0
completion_tokens = 0
finish_reason = ""
sample_input = ""
sample_output = ""
model = ""
if harm_response != "" and isinstance(harm_response, dict):
# check if "output" is one key in harm_response
if "output" in harm_response:
@@ -511,6 +567,44 @@
reason = harm_response["reason"]
else:
reason = ""

# get token_usage
if "totalTokenCount" in harm_response:
total_tokens = harm_response["totalTokenCount"]
else:
total_tokens = 0
if "inputTokenCount" in harm_response:
prompt_tokens = harm_response["inputTokenCount"]
else:
prompt_tokens = 0
if "outputTokenCount" in harm_response:
completion_tokens = harm_response["outputTokenCount"]
else:
completion_tokens = 0

# get finish_reason
if "finish_reason" in harm_response:
finish_reason = harm_response["finish_reason"]
else:
finish_reason = ""

# get sample_input
if "sample_input" in harm_response:
sample_input = harm_response["sample_input"]
else:
sample_input = ""

# get sample_output
if "sample_output" in harm_response:
sample_output = harm_response["sample_output"]
else:
sample_output = ""

# get model
if "model" in harm_response:
model = harm_response["model"]
else:
model = ""
elif harm_response != "" and isinstance(harm_response, str):
metric_value_match = re.findall(r"(\b[0-7])\b", harm_response)
if metric_value_match:
@@ -537,6 +631,13 @@
result[key] = get_harm_severity_level(harm_score)
result[key + "_score"] = harm_score
result[key + "_reason"] = reason
result[key + "_total_tokens"] = total_tokens
result[key + "_prompt_tokens"] = prompt_tokens
result[key + "_completion_tokens"] = completion_tokens
result[key + "_finish_reason"] = finish_reason
result[key + "_sample_input"] = sample_input
result[key + "_sample_output"] = sample_output
result[key + "_model"] = model

return result

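Taken together, the changes in this file attach token usage, finish reason, sample input/output, and the annotation model to every parsed metric, for both the regular and the content-harm code paths. An illustrative sketch of the resulting dictionary shape for a single metric (keys taken from the diff above; all values invented):

# Illustrative only: example of what parse_response could return for the
# "violence" metric after these changes (all values invented).
parsed_result = {
    "violence": "Very low",
    "violence_score": 0,
    "violence_reason": "The response contains no violent content.",
    "violence_total_tokens": 1325,
    "violence_prompt_tokens": 1200,
    "violence_completion_tokens": 125,
    "violence_finish_reason": "stop",
    "violence_sample_input": "...",   # request payload sample, if returned by the service
    "violence_sample_output": "...",  # model output sample, if returned by the service
    "violence_model": "example-annotation-model",
}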
@@ -90,6 +90,88 @@ class TokenScope(str, enum.Enum):
AZURE_ML = "https://ml.azure.com/.default"


class _EvaluatorMetricMapping:
"""
Static mapping of evaluator names to their metric names, based on assets.json.
The 'builtin.' prefix is removed from the evaluator name keys.
"""

EVALUATOR_NAME_METRICS_MAPPINGS = {
"bleu_score": ["bleu"],
"coherence": ["coherence"],
"document_retrieval": [
"ndcg@3",
"xdcg@3",
"fidelity",
"top1_relevance",
"top3_max_relevance",
"holes",
"holes_ratio",
"total_retrieved_documents",
"total_ground_truth_documents",
],
"f1_score": ["f1_score"],
"fluency": ["fluency"],
"gleu_score": ["gleu"],
"meteor_score": ["meteor"],
"relevance": ["relevance"],
"response_completeness": ["response_completeness"],
"rouge_score": ["rouge_precision", "rouge_recall", "rouge_f1_score"],
"groundedness_pro": ["groundedness_pro"],
"similarity": ["similarity"],
"intent_resolution": ["intent_resolution"],
"retrieval": ["retrieval"],
"task_adherence": ["task_adherence"],
"tool_call_accuracy": ["tool_call_accuracy"],
"groundedness": ["groundedness"],
"code_vulnerability": ["code_vulnerability"],
"eci": ["eci"],
"protected_material": ["protected_material"],
"ungrounded_attributes": ["ungrounded_attributes"],
"indirect_attack": ["xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"],
"label_grader": ["label_model"],
"string_check_grader": ["string_check"],
"text_similarity_grader": ["similarity"],
"score_model_grader": ["score_model"],
"sexual": ["sexual"],
"self_harm": ["self_harm"],
"violence": ["violence"],
"hate_unfairness": ["hate_unfairness"],
}

EVAL_CLASS_NAME_MAP = {
"BleuScoreEvaluator": "bleu_score",
"CodeVulnerabilityEvaluator": "code_vulnerability",
"CoherenceEvaluator": "coherence",
"ContentSafetyEvaluator": "content_safety",
"DocumentRetrievalEvaluator": "document_retrieval",
"ECIEvaluator": "eci",
"F1ScoreEvaluator": "f1_score",
"FluencyEvaluator": "fluency",
"GleuScoreEvaluator": "gleu_score",
"GroundednessEvaluator": "groundedness",
"GroundednessProEvaluator": "groundedness_pro",
"HateUnfairnessEvaluator": "hate_unfairness",
"IndirectAttackEvaluator": "indirect_attack",
"IntentResolutionEvaluator": "intent_resolution",
"MeteorScoreEvaluator": "meteor_score",
"ProtectedMaterialEvaluator": "protected_material",
"QAEvaluator": "qa",
"RelevanceEvaluator": "relevance",
"ResponseCompletenessEvaluator": "response_completeness",
"RetrievalEvaluator": "retrieval",
"RougeScoreEvaluator": "rouge_score",
"SelfHarmEvaluator": "self_harm",
"SexualEvaluator": "sexual",
"SimilarityEvaluator": "similarity",
"TaskAdherenceEvaluator": "task_adherence",
"TaskCompletionEvaluator": "task_completion",
"ToolCallAccuracyEvaluator": "tool_call_accuracy",
"UngroundedAttributesEvaluator": "ungrounded_attributes",
"ViolenceEvaluator": "violence",
}


DEFAULT_EVALUATION_RESULTS_FILE_NAME = "evaluation_results.json"

CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT = 4
@@ -116,3 +198,6 @@ class TokenScope(str, enum.Enum):
AOAI_COLUMN_NAME = "aoai"
DEFAULT_OAI_EVAL_RUN_NAME = "AI_SDK_EVAL_RUN"
DEFAULT_AOAI_API_VERSION = "2025-04-01-preview" # Unfortunately relying on preview version for now.

# OpenTelemetry event names
EVALUATION_EVENT_NAME = "gen_ai.evaluation.result"
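A short usage sketch for the new mapping (hypothetical, not part of this PR; assumes _EvaluatorMetricMapping lives in the package's private _constants module alongside TokenScope):

from azure.ai.evaluation._constants import _EvaluatorMetricMapping

def metrics_for(evaluator_class_name):
    """Return the metric names a built-in evaluator is expected to emit, or []."""
    eval_name = _EvaluatorMetricMapping.EVAL_CLASS_NAME_MAP.get(evaluator_class_name)
    if eval_name is None:
        return []
    return _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(eval_name, [])

metrics_for("RougeScoreEvaluator")      # -> ["rouge_precision", "rouge_recall", "rouge_f1_score"]
metrics_for("ContentSafetyEvaluator")   # -> [] (no entry in EVALUATOR_NAME_METRICS_MAPPINGS)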
@@ -159,6 +159,16 @@ def get_run_summary(self, client_run: BatchClientRun) -> Dict[str, Any]:
"completed_lines": total_lines - failed_lines,
"failed_lines": failed_lines,
"log_path": None,
"error_message": (
f"({run.result.error.blame.value}) {run.result.error.message}"
if run.result and run.result.error and run.result.error.blame
else None
),
"error_code": (
f"{run.result.error.category.value}"
if run.result and run.result.error and run.result.error.category
else None
),
}

@staticmethod
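With the two new keys, the run summary surfaces the blame and category of a failed batch run's error next to the line counts. Illustrative shape only (values invented; the actual blame and category strings depend on the batch engine's error model):

# Invented example of a get_run_summary result after this change.
summary = {
    "total_lines": 10,
    "completed_lines": 8,
    "failed_lines": 2,
    "log_path": None,
    "error_message": "(UserError) Evaluator raised an exception while processing the input.",
    "error_code": "FAILED_EXECUTION",
}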