9 changes: 9 additions & 0 deletions ddtrace/llmobs/_llmobs.py
@@ -1547,6 +1547,7 @@ def submit_evaluation_for(
ml_app: Optional[str] = None,
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
assessment: Optional[str] = None,
) -> None:
"""
Submits a custom evaluation metric for a given span.
@@ -1565,6 +1566,7 @@ def submit_evaluation_for(
If not set, the current time will be used.
:param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
evaluation metric.
:param str assessment: An assessment of the validity of this evaluation. Must be either "pass" or "fail".
"""
if cls.enabled is False:
log.debug(
@@ -1673,6 +1675,13 @@ def submit_evaluation_for(
"tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
}

if assessment:
if not isinstance(assessment, str) or assessment not in ("pass", "fail"):
error = "invalid_assessment"
log.warning("Failed to parse assessment. assessment must be either 'pass' or 'fail'.")
else:
evaluation_metric["success_criteria"] = {"assessment": assessment}

if metadata:
if not isinstance(metadata, dict):
error = "invalid_metadata"
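For context, a minimal sketch of the warning path introduced above (the span reference and label are illustrative, mirroring the tests in this PR): a value other than ``"pass"`` or ``"fail"`` only logs the warning, and ``success_criteria`` is not attached to the metric.

```python
from ddtrace.llmobs import LLMObs

# Logs "Failed to parse assessment. assessment must be either 'pass' or 'fail'."
# The evaluation metric is still built, just without ``success_criteria``.
LLMObs.submit_evaluation_for(
    span={"span_id": "123", "trace_id": "456"},  # illustrative span reference
    label="toxicity",
    metric_type="categorical",
    value="high",
    assessment="maybe",  # not "pass"/"fail" -> warning, field omitted
)
```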
1 change: 1 addition & 0 deletions ddtrace/llmobs/_writer.py
@@ -86,6 +86,7 @@ class LLMObsEvaluationMetricEvent(TypedDict, total=False):
ml_app: str
timestamp_ms: int
tags: List[str]
success_criteria: Dict[str, str]


class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
@@ -0,0 +1,5 @@
---
features:
- |
LLM Observability: The ``LLMObs.submit_evaluation_for()`` method now accepts an ``assessment`` argument to denote
whether the evaluation is valid or correct. Accepted values are ``"pass"`` or ``"fail"``.
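A minimal usage sketch for the new argument (span IDs and the label are illustrative; the tests below show the full expected event):

```python
from ddtrace.llmobs import LLMObs

# A valid assessment is recorded on the enqueued evaluation metric event
# under ``success_criteria`` as {"assessment": "pass"}.
LLMObs.submit_evaluation_for(
    span={"span_id": "123", "trace_id": "456"},  # illustrative span reference
    label="toxicity",
    metric_type="categorical",
    value="low",
    assessment="pass",
)
```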
3 changes: 3 additions & 0 deletions tests/llmobs/_utils.py
@@ -275,6 +275,7 @@ def _expected_llmobs_eval_metric_event(
boolean_value=None,
tags=None,
metadata=None,
success_criteria=None,
):
eval_metric_event = {
"join_on": {},
@@ -299,6 +300,8 @@
eval_metric_event["boolean_value"] = boolean_value
if tags is not None:
eval_metric_event["tags"] = tags
if success_criteria is not None:
eval_metric_event["success_criteria"] = success_criteria
if timestamp_ms is not None:
eval_metric_event["timestamp_ms"] = timestamp_ms
else:
62 changes: 62 additions & 0 deletions tests/llmobs/test_llmobs_service.py
@@ -2286,6 +2286,68 @@ def test_submit_evaluation_for_metric_with_metadata_enqueues_metric(llmobs, mock
)


def test_submit_evaluation_for_invalid_assessment_raises_warning(llmobs, mock_llmobs_logs):
llmobs.submit_evaluation_for(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
metric_type="categorical",
value="high",
assessment=True,
)
mock_llmobs_logs.warning.assert_called_once_with(
"Failed to parse assessment. assessment must be either 'pass' or 'fail'."
)


def test_submit_evaluation_for_enqueues_writer_with_success_criteria(llmobs, mock_llmobs_eval_metric_writer):
llmobs.submit_evaluation_for(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
metric_type="categorical",
value="high",
tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
ml_app="ml_app_override",
metadata={"foo": ["bar", "baz"]},
assessment="pass",
)
mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
_expected_llmobs_eval_metric_event(
ml_app="ml_app_override",
span_id="123",
trace_id="456",
label="toxicity",
metric_type="categorical",
categorical_value="high",
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
metadata={"foo": ["bar", "baz"]},
success_criteria={"assessment": "pass"},
)
)
mock_llmobs_eval_metric_writer.reset()
llmobs.submit_evaluation_for(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
metric_type="categorical",
value="high",
tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
ml_app="ml_app_override",
metadata="invalid",
assessment="fail",
)
mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
_expected_llmobs_eval_metric_event(
ml_app="ml_app_override",
span_id="123",
trace_id="456",
label="toxicity",
metric_type="categorical",
categorical_value="high",
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
success_criteria={"assessment": "fail"},
)
)


def test_llmobs_parenting_with_root_apm_span(llmobs, tracer, llmobs_events):
# orphaned llmobs spans with apm root have undefined parent_id
with tracer.trace("no_llm_span"):