 `use_self_reflection` = False. When using `get_trustworthiness_score()` on
+"base" preset, a faster self-reflection is employed.
 
 By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base
 `model`, and `max_tokens` is set to 512. You can set custom values for these
@@ -550,11 +550,12 @@ def validate(
 strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
 TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
 
-num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
-The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
-Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+and catches responses that are noticeably incorrect/bad upon further analysis.
 
-similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
+similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
 trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
 Supported similarity measures include - "semantic" (based on natural language inference),
 "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
@@ -573,8 +574,6 @@ def validate(
 - name: Name of the evaluation criteria.
 - criteria: Instructions specifying the evaluation criteria.
 
-use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead.
-
 prompt: The prompt to use for the TLM call. If not provided, the prompt will be
 generated from the messages.
 
@@ -583,9 +582,6 @@ def validate(
 rewritten_question: The re-written query if it was provided by the client to Codex from a user to be
 used instead of the original query.
 
-tools: Tools to use for the LLM call. If not provided, it is assumed no tools were
-provided to the LLM.
-
 extra_headers: Send extra headers
 
 extra_query: Add additional query parameters to the request
 `use_self_reflection` = False. When using `get_trustworthiness_score()` on
+"base" preset, a faster self-reflection is employed.
 
 By default, TLM uses the: "medium" `quality_preset`, "gpt-4.1-mini" base
 `model`, and `max_tokens` is set to 512. You can set custom values for these
@@ -1123,11 +1118,12 @@ async def validate(
 strange prompts or prompts that are too vague/open-ended to receive a clearly defined 'good' response.
 TLM measures consistency via the degree of contradiction between sampled responses that the model considers plausible.
 
-num_self_reflections(int, default = 3): the number of self-reflections to perform where the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
-The maximum number of self-reflections currently supported is 3. Lower values will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
-Reflection helps quantify aleatoric uncertainty associated with challenging prompts and catches responses that are noticeably incorrect/bad upon further analysis.
+use_self_reflection (bool, default = `True`): whether the LLM is asked to reflect on the given response and directly evaluate correctness/confidence.
+Setting this False disables reflection and will reduce runtimes/costs, but potentially also the reliability of trustworthiness scores.
+Reflection helps quantify aleatoric uncertainty associated with challenging prompts
+and catches responses that are noticeably incorrect/bad upon further analysis.
 
-similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "discrepancy"): how the
+similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
 trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
 Supported similarity measures include - "semantic" (based on natural language inference),
 "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
@@ -1146,8 +1142,6 @@ async def validate(
 - name: Name of the evaluation criteria.
 - criteria: Instructions specifying the evaluation criteria.
 
-use_self_reflection (bool, default = `True`): deprecated. Use `num_self_reflections` instead.
-
 prompt: The prompt to use for the TLM call. If not provided, the prompt will be
 generated from the messages.
 
@@ -1156,9 +1150,6 @@ async def validate(
 rewritten_question: The re-written query if it was provided by the client to Codex from a user to be
 used instead of the original query.
 
-tools: Tools to use for the LLM call. If not provided, it is assumed no tools were
-provided to the LLM.
-
 extra_headers: Send extra headers
 
 extra_query: Add additional query parameters to the request
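For illustration, the TLM options documented in this diff (default "medium" `quality_preset`, "gpt-4.1-mini" `model`, `max_tokens` 512, `use_self_reflection` enabled, and the "semantic" `similarity_measure` default after this change) could be collected into an options mapping like the sketch below. This is a minimal, hypothetical helper for merging user overrides onto the documented defaults; `merge_tlm_options` is not part of the library's API, and the keys/values are taken from the docstring text above.

```python
# Documented defaults from the docstring above. The merging/validation
# helper is illustrative only, not part of the actual library.
ALLOWED_SIMILARITY_MEASURES = {
    "semantic", "string", "embedding", "embedding_large", "code", "discrepancy",
}

DEFAULT_TLM_OPTIONS = {
    "quality_preset": "medium",       # documented default preset
    "model": "gpt-4.1-mini",          # documented default base model
    "max_tokens": 512,                # documented default token limit
    "use_self_reflection": True,      # reflection enabled by default
    "similarity_measure": "semantic", # default after this change
}

def merge_tlm_options(overrides=None):
    """Merge user overrides onto the documented defaults, rejecting
    unsupported similarity measures (hypothetical helper)."""
    opts = {**DEFAULT_TLM_OPTIONS, **(overrides or {})}
    if opts["similarity_measure"] not in ALLOWED_SIMILARITY_MEASURES:
        raise ValueError(
            f"unsupported similarity_measure: {opts['similarity_measure']!r}"
        )
    return opts

# Disabling self-reflection trades trustworthiness-score reliability for
# lower runtime/cost, as the docstring notes.
fast_opts = merge_tlm_options({"use_self_reflection": False, "max_tokens": 128})
```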