diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 217ee794ebf..8300db67f1b 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -2765,7 +2765,7 @@ class TorchLlmArgs(BaseLlmArgs):
 
     # PrivateVars
     _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
-    _disable_flash_infer_sampling: bool = PrivateAttr(default=True)
+    _disable_flash_infer_sampling: bool = PrivateAttr(default=False)
     """Unless this is set to False, FlashInfer.sampling is not used, even if available."""
 
     @property
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_guided_decoding.py b/tests/unittest/llmapi/apps/_test_openai_chat_guided_decoding.py
index 222c975b85e..ccfe1d02e22 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_guided_decoding.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_guided_decoding.py
@@ -136,6 +136,7 @@ def test_openai_compatible_json_schema(client: openai.OpenAI, model_name: str):
             "type": "json_schema",
             "json_schema": json_schema
         },
+        temperature=0.0,
     )
     message = chat_completion.choices[0].message
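
For context, a minimal sketch (not part of the patch) of the call pattern the second hunk touches: a JSON-schema-constrained chat completion against an OpenAI-compatible endpoint with `temperature=0.0` so the guided-decoding output is deterministic. The base URL, API key, model name, messages, and schema below are illustrative placeholders, not values taken from the test fixture.

```python
# Sketch of the request shape exercised by the updated test. Only the
# response_format structure and the temperature=0.0 argument mirror the diff;
# everything else (endpoint, model, schema, prompt) is a placeholder assumption.
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

json_schema = {
    "name": "answer",
    "schema": {
        "type": "object",
        "properties": {"answer": {"type": "string"}},
        "required": ["answer"],
    },
}

chat_completion = client.chat.completions.create(
    model="placeholder-model",
    messages=[{
        "role": "user",
        "content": "Reply with a JSON object containing a single 'answer' field."
    }],
    response_format={
        "type": "json_schema",
        "json_schema": json_schema
    },
    temperature=0.0,  # the argument added by the patch: greedy sampling for reproducible output
)
message = chat_completion.choices[0].message
print(message.content)
```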