diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 0e16e62382f..d54e3be6f70 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -1547,6 +1547,7 @@ def submit_evaluation_for(
         ml_app: Optional[str] = None,
         timestamp_ms: Optional[int] = None,
         metadata: Optional[Dict[str, object]] = None,
+        assessment: Optional[str] = None,
     ) -> None:
         """
         Submits a custom evaluation metric for a given span.
@@ -1565,6 +1566,7 @@
                                  If not set, the current time will be used.
         :param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
                               evaluation metric.
+        :param str assessment: An assessment of the validity of this evaluation. Must be either "pass" or "fail".
         """
         if cls.enabled is False:
             log.debug(
@@ -1673,6 +1675,13 @@
             "tags": ["{}:{}".format(k, v) for k, v in evaluation_tags.items()],
         }
 
+        if assessment:
+            if not isinstance(assessment, str) or assessment not in ("pass", "fail"):
+                error = "invalid_assessment"
+                log.warning("Failed to parse assessment. assessment must be either 'pass' or 'fail'.")
+            else:
+                evaluation_metric["success_criteria"] = {"assessment": assessment}
+
         if metadata:
             if not isinstance(metadata, dict):
                 error = "invalid_metadata"
diff --git a/ddtrace/llmobs/_writer.py b/ddtrace/llmobs/_writer.py
index 3b62ea3bdda..597bc9df759 100644
--- a/ddtrace/llmobs/_writer.py
+++ b/ddtrace/llmobs/_writer.py
@@ -86,6 +86,7 @@ class LLMObsEvaluationMetricEvent(TypedDict, total=False):
     ml_app: str
     timestamp_ms: int
     tags: List[str]
+    success_criteria: Dict[str, str]
 
 
 class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
diff --git a/releasenotes/notes/feat-llmobs-submit-eval-success-assessment-11b93921832c54b4.yaml b/releasenotes/notes/feat-llmobs-submit-eval-success-assessment-11b93921832c54b4.yaml
new file mode 100644
index 00000000000..7a16bf31f64
--- /dev/null
+++ b/releasenotes/notes/feat-llmobs-submit-eval-success-assessment-11b93921832c54b4.yaml
@@ -0,0 +1,5 @@
+---
+features:
+  - |
+    LLM Observability: The ``LLMObs.submit_evaluation_for()`` method now accepts an ``assessment`` argument to denote
+    whether the evaluation is valid or correct. Accepted values are either ``"pass"`` or ``"fail"``.
\ No newline at end of file
diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py
index a12154f2c34..35d46863e6e 100644
--- a/tests/llmobs/_utils.py
+++ b/tests/llmobs/_utils.py
@@ -275,6 +275,7 @@ def _expected_llmobs_eval_metric_event(
     boolean_value=None,
     tags=None,
     metadata=None,
+    success_criteria=None,
 ):
     eval_metric_event = {
         "join_on": {},
@@ -299,6 +300,8 @@
         eval_metric_event["boolean_value"] = boolean_value
     if tags is not None:
         eval_metric_event["tags"] = tags
+    if success_criteria is not None:
+        eval_metric_event["success_criteria"] = success_criteria
     if timestamp_ms is not None:
         eval_metric_event["timestamp_ms"] = timestamp_ms
     else:
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index 3f080dd03ae..b6039856007 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -2286,6 +2286,68 @@ def test_submit_evaluation_for_metric_with_metadata_enqueues_metric(llmobs, mock
     )
 
 
+def test_submit_evaluation_for_invalid_assessment_raises_warning(llmobs, mock_llmobs_logs):
+    llmobs.submit_evaluation_for(
+        span={"span_id": "123", "trace_id": "456"},
+        label="toxicity",
+        metric_type="categorical",
+        value="high",
+        assessment=True,
+    )
+    mock_llmobs_logs.warning.assert_called_once_with(
+        "Failed to parse assessment. assessment must be either 'pass' or 'fail'."
+    )
+
+
+def test_submit_evaluation_for_enqueues_writer_with_success_criteria(llmobs, mock_llmobs_eval_metric_writer):
+    llmobs.submit_evaluation_for(
+        span={"span_id": "123", "trace_id": "456"},
+        label="toxicity",
+        metric_type="categorical",
+        value="high",
+        tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
+        ml_app="ml_app_override",
+        metadata={"foo": ["bar", "baz"]},
+        assessment="pass",
+    )
+    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
+        _expected_llmobs_eval_metric_event(
+            ml_app="ml_app_override",
+            span_id="123",
+            trace_id="456",
+            label="toxicity",
+            metric_type="categorical",
+            categorical_value="high",
+            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
+            metadata={"foo": ["bar", "baz"]},
+            success_criteria={"assessment": "pass"},
+        )
+    )
+    mock_llmobs_eval_metric_writer.reset()
+    llmobs.submit_evaluation_for(
+        span={"span_id": "123", "trace_id": "456"},
+        label="toxicity",
+        metric_type="categorical",
+        value="high",
+        tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
+        ml_app="ml_app_override",
+        metadata="invalid",
+        assessment="fail",
+    )
+    mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
+        _expected_llmobs_eval_metric_event(
+            ml_app="ml_app_override",
+            span_id="123",
+            trace_id="456",
+            label="toxicity",
+            metric_type="categorical",
+            categorical_value="high",
+            tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
+            success_criteria={"assessment": "fail"},
+        )
+    )
+
+
 def test_llmobs_parenting_with_root_apm_span(llmobs, tracer, llmobs_events):
     # orphaned llmobs spans with apm root have undefined parent_id
     with tracer.trace("no_llm_span"):
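
For context, a minimal usage sketch of the new argument follows. It is not part of the diff: the span context, label, and value are placeholders borrowed from the tests above, and it assumes `LLMObs.enable()` has already been called during application setup.

```python
from ddtrace.llmobs import LLMObs

# Placeholder span context for illustration; in practice this typically comes from
# LLMObs.export_span() on an active span, or is stored alongside the span it joins to.
span_context = {"span_id": "123", "trace_id": "456"}

LLMObs.submit_evaluation_for(
    span=span_context,
    label="toxicity",
    metric_type="categorical",
    value="high",
    # New in this change: values other than the strings "pass" or "fail" log a
    # warning and no success_criteria is attached to the emitted metric event.
    assessment="pass",
)
```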