Commit c45e7cb

Added support for evals compilation checks and auto mode (#131)

Authored by mturk24, aditya1503, and jwmueller
Co-authored-by: Aditya Thyagarajan <[email protected]>
Co-authored-by: Jonas Mueller <[email protected]>

1 parent 961726a commit c45e7cb
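In practice, this change means an `Eval` no longer needs its `mode` chosen up front: the default is now "auto", which compiles the `criteria` text down to "binary" or "continuous" at construction time and warns about ill-posed criteria. A minimal usage sketch, assuming `Eval` is imported from `cleanlab_tlm.utils.rag` (the eval names and criteria below are hypothetical, and the compilation checks themselves prompt TLM, so a configured API key is required):

from cleanlab_tlm.utils.rag import Eval  # assumed public import path for the Eval class in this file

# mode defaults to "auto": Yes/No-style criteria should compile to "binary".
refund_eval = Eval(
    name="query_asks_about_refund",                        # hypothetical name
    criteria="Is the user Query asking about a refund?",   # hypothetical criteria
    query_identifier="Query",
)

# Criteria describing good vs. bad along a spectrum should compile to "continuous";
# explicit modes are still accepted and are now validated against the criteria text.
concise_eval = Eval(
    name="response_conciseness",
    criteria="A good Response is concise; a bad Response is verbose or repetitive.",
    response_identifier="Response",
    mode="continuous",
)

print(refund_eval.mode, concise_eval.mode)  # expected: binary continuous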

File tree: 1 file changed (+192, -4 lines)

src/cleanlab_tlm/utils/rag.py: 192 additions & 4 deletions
@@ -11,6 +11,7 @@
 from __future__ import annotations
 
 import asyncio
+import warnings
 from collections.abc import Sequence
 from typing import (
     TYPE_CHECKING,
@@ -27,6 +28,7 @@
 from cleanlab_tlm.errors import ValidationError
 from cleanlab_tlm.internal.api import api
 from cleanlab_tlm.internal.base import BaseTLM
+from cleanlab_tlm.tlm import TLM
 from cleanlab_tlm.internal.constants import (
     _BINARY_STR,
     _CONTINUOUS_STR,
@@ -866,9 +868,10 @@ class Eval:
         response_identifier (str, optional): The exact string used in your evaluation `criteria` to reference the RAG/LLM response.
             For example, specifying `response_identifier` as "AI Answer" means your `criteria` should refer to the response as "AI Answer".
             Leave this value as None (the default) if this Eval doesn't consider the response.
-        mode (str, optional): What type of evaluation these `criteria` correspond to, either "continuous" (default) or "binary".
+        mode (str, optional): What type of evaluation these `criteria` correspond to, either "continuous" (default), "binary", or "auto".
             - "continuous": For `criteria` that define what is good/better v.s. what is bad/worse, corresponding to evaluations of quality along a continuous spectrum (e.g., relevance, conciseness).
             - "binary": For `criteria` written as Yes/No questions, corresponding to evaluations that most would consider either True or False rather than grading along a continuous spectrum (e.g., does Response mention ACME Inc., is Query asking about refund, ...).
+            - "auto": Automatically determines whether the criteria is binary or continuous based on the criteria text.
             Both modes return scores in the 0-1 range.
             For "continuous" evaluations, your `criteria` should define what good vs. bad looks like (cases deemed bad will return low evaluation scores).
             For binary evaluations, your `criteria` should be a Yes/No question (cases answered "Yes" will return low evaluation scores, so phrase your question such that the likelihood of "Yes" matches the likelihood of the particular problem you wish to detect).
@@ -885,7 +888,7 @@ def __init__(
         query_identifier: Optional[str] = None,
         context_identifier: Optional[str] = None,
         response_identifier: Optional[str] = None,
-        mode: Optional[str] = _CONTINUOUS_STR,
+        mode: Optional[str] = "auto",
     ):
         """
         lazydocs: ignore
@@ -901,7 +904,192 @@ def __init__(
         self.query_identifier = query_identifier
         self.context_identifier = context_identifier
         self.response_identifier = response_identifier
-        self.mode = mode
+
+        # Compile and validate the eval
+        self.mode = self._compile_mode(mode, criteria, name)
+
+    def _compile_mode(self, mode: Optional[str], criteria: str, name: str) -> str:
+        """
+        Compile and validate the mode based on criteria.
+
+        Args:
+            mode: The specified mode ("binary", "continuous", or "auto")
+            criteria: The evaluation criteria text
+            name: The name of the evaluation
+
+        Returns:
+            str: The compiled mode ("binary" or "continuous")
+        """
+
+        # Check binary criteria once at the beginning
+        is_binary = self._check_binary_criteria(criteria)
+
+        # If mode is auto, determine it automatically
+        if mode == "auto":
+            compiled_mode = _BINARY_STR if is_binary else _CONTINUOUS_STR
+
+            # Check if it's appropriate for neither
+            if not is_binary:
+                has_good_bad = self._check_good_bad_specified(criteria)
+                has_numeric = self._check_numeric_scoring_scheme(criteria)
+
+                if not has_good_bad and not has_numeric:
+                    warning_msg = (
+                        f"Eval '{name}': Criteria does not appear to be a Yes/No question "
+                        "and does not clearly specify what is good/bad or desirable/undesirable. "
+                        "This may result in poor evaluation quality."
+                    )
+                    warnings.warn(warning_msg, UserWarning)
+
+            return compiled_mode
+
+        # Validation checks for explicit mode specification
+        if mode == _BINARY_STR:
+            if not is_binary:
+                warning_msg = (
+                    f"Eval '{name}': mode is set to '{_BINARY_STR}' but criteria does not appear "
+                    "to be a Yes/No question. Consider rephrasing as a Yes/No question or "
+                    f"changing mode to '{_CONTINUOUS_STR}'."
+                )
+                warnings.warn(warning_msg, UserWarning)
+
+        elif mode == _CONTINUOUS_STR:
+            # Check if it's actually a Yes/No question
+            if is_binary:
+                warning_msg = (
+                    f"Eval '{name}': mode is set to '{_CONTINUOUS_STR}' but criteria appears to be "
+                    f"a Yes/No question. Consider changing mode to '{_BINARY_STR}' for more appropriate scoring."
+                )
+                warnings.warn(warning_msg, UserWarning)
+
+            # Check if good/bad is specified
+            has_good_bad = self._check_good_bad_specified(criteria)
+            if not has_good_bad:
+                warning_msg = (
+                    f"Eval '{name}': mode is set to '{_CONTINUOUS_STR}' but criteria does not clearly "
+                    "specify what is good/desirable versus bad/undesirable. This may lead to "
+                    "inconsistent or unclear scoring."
+                )
+                warnings.warn(warning_msg, UserWarning)
+
+            # Check if it already has a numeric scoring scheme
+            has_numeric = self._check_numeric_scoring_scheme(criteria)
+            if has_numeric:
+                warning_msg = (
+                    f"Eval '{name}': Your `criteria` appears to specify "
+                    "a numeric scoring scheme. We recommend removing any "
+                    "specific numeric scoring scheme from your `criteria` and just specifying what is considered good/better vs. bad/worse."
+                )
+                warnings.warn(warning_msg, UserWarning)
+
+        # For explicit modes, return as-is (already validated above)
+        if mode in (_BINARY_STR, _CONTINUOUS_STR):
+            return mode
+
+        # Default to continuous for None or any other value
+        return _CONTINUOUS_STR
+
+
+    @staticmethod
+    def _check_binary_criteria(criteria: str) -> bool:
+        """
+        Check if criteria is a Yes/No question using TLM.
+
+        Args:
+            criteria: The evaluation criteria text
+
+        Returns:
+            True if criteria is a Yes/No question, False otherwise
+        """
+        tlm = TLM(quality_preset="base")
+
+        prompt = f"""Consider the following statement:
+
+<statement>
+{criteria}
+</statement>
+
+## Instructions
+
+Classify this statement into one of the following options:
+A) This statement is essentially worded as a Yes/No question or implies a Yes/No question.
+B) This statement is not a Yes/No question, since replying to it with either "Yes" or "No" would not be sensible.
+
+Your output must be one choice from either A or B (output only a single letter, no other text)."""
+
+        response = tlm.prompt(prompt, constrain_outputs=["A", "B"])
+        if isinstance(response, list):
+            return False
+        response_text = response.get("response", "")
+        if response_text is None:
+            return False
+        return str(response_text).strip().upper() == "A"
+
+
+    @staticmethod
+    def _check_good_bad_specified(criteria: str) -> bool:
+        """
+        Check if criteria clearly specifies what is Good vs Bad or Desirable vs Undesirable.
+
+        Args:
+            criteria: The evaluation criteria text
+
+        Returns:
+            True if criteria clearly defines good/bad or desirable/undesirable, False otherwise
+        """
+        tlm = TLM(quality_preset="base")
+
+        prompt = f"""Analyze the following evaluation criteria and determine if it clearly specifies what is "good" versus "bad", "desirable" versus "undesirable", "better" versus "worse", or uses similar language to define quality distinctions.
+
+The criteria should make it clear what characteristics or qualities are considered positive/desirable versus negative/undesirable.
+
+Evaluation Criteria:
+{criteria}
+
+Does this criteria clearly specify what is good/desirable versus bad/undesirable? Answer only "Yes" or "No"."""
+
+        response = tlm.prompt(prompt, constrain_outputs=["Yes", "No"])
+        if isinstance(response, list):
+            return False
+        response_text = response.get("response", "")
+        if response_text is None:
+            return False
+        return str(response_text).strip().lower() == "yes"
+
+
+    @staticmethod
+    def _check_numeric_scoring_scheme(criteria: str) -> bool:
+        """
+        Check if criteria contains a specific numeric scoring scheme (e.g., "rate from 1-5", "score 0-100").
+
+        Args:
+            criteria: The evaluation criteria text
+
+        Returns:
+            True if criteria includes a numeric scoring scheme, False otherwise
+        """
+        tlm = TLM(quality_preset="base")
+
+        prompt = f"""Analyze the following evaluation criteria and determine if it contains a specific numeric scoring scheme.
+
+Examples of numeric scoring schemes include:
+- "Rate from 1 to 5"
+- "Score between 0 and 100"
+- "Assign a rating of 1-10"
+- "Give a score from 0 to 1"
+
+Evaluation Criteria:
+{criteria}
+
+Does this criteria specify a numeric scoring scheme? Answer only "Yes" or "No"."""
+
+        response = tlm.prompt(prompt, constrain_outputs=["Yes", "No"])
+        if isinstance(response, list):
+            return False
+        response_text = response.get("response", "")
+        if response_text is None:
+            return False
+        return str(response_text).strip().lower() == "yes"
 
     def __repr__(self) -> str:
         """
@@ -1061,4 +1249,4 @@ class TrustworthyRAGScore(dict[str, EvalMetric]):
         ...
     }
     ```
-    """
+    """
