trishullab · amit9oct · Jan 10, 2026 · Jan 10, 2026 · Jan 10, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,7 @@ packages = ["src/copra"]
 
 [project]
 name = "copra-theorem-prover"
-version = "1.4.0"
+version = "1.5.0"
 authors = [
   { name="Amitayush Thakur", email="amitayush@utexas.edu" },
 ]
@@ -24,7 +24,7 @@ classifiers = [
 ]
 
 dependencies = [
-    "itp-interface>=1.4.0",
+    "itp-interface>=1.5.0",
     "openai>=1.99.1",
     "tiktoken>=0.12.0",  # Updated: 0.4.0 incompatible with Python 3.14t, needs PyO3 0.23+ for free-threading
     "sentencepiece>=0.2.0",  # Updated: 0.1.99 lacks Python 3.14t wheels, 0.2.1 has cp314t support

diff --git a/src/copra/agent/dfs_policy_prompter.py b/src/copra/agent/dfs_policy_prompter.py
@@ -277,15 +277,15 @@ def run_prompt(self, request: CoqGptResponse) -> list:
                         n=self.num_sequences,
                         temperature=temperature,
                         max_tokens=tokens_to_generate,
-                        stop=["[END]"],
+                        stop=["[END]"] if not self._gpt_access.is_reasoning_model() else [],
                         **self._model_params)
                 else:
                     responses, usage = self._gpt_access.complete_chat(
                         messages,
                         n=self.num_sequences,
                         temperature=temperature,
                         max_tokens=tokens_to_generate,
-                        stop=["[END]"])
+                        stop=["[END]"] if not self._gpt_access.is_reasoning_model() else [])
                 request_end_time = time.time()
                 time_taken = request_end_time - request_start_time
                 apporx_output_tokens = usage["total_tokens"] - total_token_count

diff --git a/src/copra/gpts/gpt_access.py b/src/copra/gpts/gpt_access.py
@@ -159,6 +159,25 @@ class GptAccess:
         "claude-3-7-sonnet-20250219": "https://api.anthropic.com/v1"
     }
 
+    @staticmethod
+    def check_if_reasoning_model(model_name: str) -> bool:
+        return model_name in {
+            "o1-mini", "o1",
+            "o3-mini", "o3",
+            "o4-mini",
+            "gpt-5-mini",
+            "claude-3-7-sonnet-20250219",
+            "anthropic.claude-3-7-sonnet-20250219-v1:0",
+            "anthropic.claude-3-5-haiku-20241022-v1:0",
+            "anthropic.claude-3-5-sonnet-20241022-v2:0",
+            "anthropic.claude-3-5-sonnet-20240620-v1:0",
+            "anthropic.claude-3-opus-20240229-v1:0",
+            "anthropic.claude-3-haiku-20240307-v1:0",
+            "anthropic.claude-3-sonnet-20240229-v1:0",
+            "deepseek.r1-v1:0",
+            "vllm:openai/gpt-oss-20b"
+        }
+
     def __init__(self,
                  secret_filepath: str = None,
                  model_name: typing.Optional[str] = None) -> None:
@@ -318,13 +337,15 @@ def complete_chat(self,
             self.get_response_generic(model, messages, max_tokens, stop, temperature, n, reasoning_token_count, reasoning_effort)
         else:
             return_responses, usage, stopping_reasons = \
-            self.get_response_generic(model, messages, max_tokens, stop, temperature, n)
+            self.get_response_generic(model, messages, max_tokens, stop, temperature, n, reasoning_token_count, reasoning_effort)
         usage_dict = {
             "prompt_tokens": usage["prompt_tokens"],
             "completion_tokens": usage["completion_tokens"],
             "total_tokens": usage["total_tokens"],
             "reason": stopping_reasons
         }
+        if "reasoning_tokens" in usage:
+            usage_dict["reasoning_tokens"] = usage["reasoning_tokens"]
         return return_responses, usage_dict
 
     def handle_thinking_messages(self, messages: typing.List[typing.Dict[str, str]]) -> typing.List[typing.Dict[str, str]]:
@@ -374,7 +395,32 @@ def get_thinking_response(self, model, messages, max_tokens, stop : typing.List[
         self.usage["prompt_tokens"] += usage.prompt_tokens
         self.usage["completion_tokens"] += usage.completion_tokens
         self.usage["total_tokens"] += usage.total_tokens
-        return_responses = [{"role": choice.message.role, "content": choice.message.content} for choice in response.choices]
+
+        # Extract reasoning tokens from usage details if available
+        reasoning_tokens = 0
+        if hasattr(usage, 'completion_tokens_details') and usage.completion_tokens_details is not None:
+            if hasattr(usage.completion_tokens_details, 'reasoning_tokens') and usage.completion_tokens_details.reasoning_tokens is not None:
+                reasoning_tokens = usage.completion_tokens_details.reasoning_tokens
+
+        return_responses = []
+        for choice in response.choices:
+            resp_dict = {"role": choice.message.role, "content": choice.message.content}
+            # Check multiple possible locations for reasoning content
+            reasoning = None
+            if hasattr(choice.message, 'reasoning') and choice.message.reasoning is not None:
+                reasoning = choice.message.reasoning
+            elif hasattr(choice.message, 'reasoning_content') and choice.message.reasoning_content is not None:
+                reasoning = choice.message.reasoning_content
+            elif choice.message.model_extra is not None and 'reasoning' in choice.message.model_extra:
+                reasoning = choice.message.model_extra['reasoning']
+            elif choice.message.model_extra is not None and 'reasoning_content' in choice.message.model_extra:
+                reasoning = choice.message.model_extra['reasoning_content']
+
+            if reasoning is not None:
+                resp_dict['reasoning'] = reasoning
+                resp_dict['reasoning_effort'] = reasoning_effort
+
+            return_responses.append(resp_dict)
         for i in range(len(return_responses) - 1):
             return_responses[i]["finish_reason"] = "stop"
         if len(response.choices) > 0:
@@ -385,6 +431,8 @@ def get_thinking_response(self, model, messages, max_tokens, stop : typing.List[
             "completion_tokens": usage.completion_tokens,
             "total_tokens": usage.total_tokens
         }
+        if reasoning_tokens > 0:
+            usage["reasoning_tokens"] = reasoning_tokens
         return return_responses, usage, stopping_reasons
 
     def get_gpt_4_o_response(self, model, messages, max_tokens, stop, temperature, top_p, frequency_penalty, presence_penalty, n) -> typing.Tuple[list, dict, str]:
@@ -403,7 +451,10 @@ def get_gpt_4_o_response(self, model, messages, max_tokens, stop, temperature, t
         self.usage["prompt_tokens"] += usage.prompt_tokens
         self.usage["completion_tokens"] += usage.completion_tokens
         self.usage["total_tokens"] += usage.total_tokens
-        return_responses = [{"role": choice.message.role, "content": choice.message.content} for choice in response.choices]
+        return_responses = []
+        for choice in response.choices:
+            resp_dict = {"role": choice.message.role, "content": choice.message.content}
+            return_responses.append(resp_dict)
         for i in range(len(return_responses) - 1):
             return_responses[i]["finish_reason"] = "stop"
         if len(response.choices) > 0:
@@ -415,7 +466,7 @@ def get_gpt_4_o_response(self, model, messages, max_tokens, stop, temperature, t
             "total_tokens": usage.total_tokens
         }
         return return_responses, usage, stopping_reasons
-    
+
     def complete_chat_bedrock(self,
             messages: typing.List[typing.Dict[str, str]],
             model: typing.Optional[str] = None,
@@ -538,6 +589,12 @@ def get_response_generic(self, model, messages, max_tokens, stop, temperature, n
         )
         usage_obj = response.usage
 
+        # Extract reasoning tokens from usage details if available
+        reasoning_tokens = 0
+        if usage_obj is not None and hasattr(usage_obj, 'completion_tokens_details') and usage_obj.completion_tokens_details is not None:
+            if hasattr(usage_obj.completion_tokens_details, 'reasoning_tokens') and usage_obj.completion_tokens_details.reasoning_tokens is not None:
+                reasoning_tokens = usage_obj.completion_tokens_details.reasoning_tokens
+
         # Handle cases where usage might be None (e.g., vLLM, some GPT models)
         if usage_obj is not None:
             prompt_tokens = getattr(usage_obj, 'prompt_tokens', 0)
@@ -566,13 +623,33 @@ def get_response_generic(self, model, messages, max_tokens, stop, temperature, n
                 "completion_tokens": completion_tokens,
                 "total_tokens": total_tokens
             }
+        return_responses = []
+        for choice in response.choices:
+            resp_dict = {"role": choice.message.role, "content": choice.message.content}
+            # Check multiple possible locations for reasoning content
+            reasoning = None
+            if hasattr(choice.message, 'reasoning') and choice.message.reasoning is not None:
+                reasoning = choice.message.reasoning
+            elif hasattr(choice.message, 'reasoning_content') and choice.message.reasoning_content is not None:
+                reasoning = choice.message.reasoning_content
+            elif choice.message.model_extra is not None and 'reasoning' in choice.message.model_extra:
+                reasoning = choice.message.model_extra['reasoning']
+            elif choice.message.model_extra is not None and 'reasoning_content' in choice.message.model_extra:
+                reasoning = choice.message.model_extra['reasoning_content']
+
+            if reasoning is not None:
+                resp_dict['reasoning'] = reasoning
+                resp_dict['reasoning_effort'] = reasoning_effort
+
+            return_responses.append(resp_dict)
 
-        return_responses = [{"role": choice.message.role, "content": choice.message.content} for choice in response.choices]
         for i in range(len(return_responses) - 1):
             return_responses[i]["finish_reason"] = "stop"
         if len(response.choices) > 0:
             return_responses[-1]["finish_reason"] = response.choices[-1].finish_reason
         stopping_reasons = response.choices[-1].finish_reason if len(response.choices) > 0 else "stop"
+        if reasoning_tokens > 0:
+            usage["reasoning_tokens"] = reasoning_tokens
         return return_responses, usage, stopping_reasons
 
     def num_tokens_from_messages(self, messages, model=None):
@@ -659,6 +736,9 @@ def _load_secret(self) -> None:
     def get_usage(self) -> dict:
         return self.usage
 
+    def is_reasoning_model(self) -> bool:
+        return self.check_if_reasoning_model(self.model_name)
+
 # Integration Test Suite
 class IntegrationTests(unittest.TestCase):
     def setUp(self):
@@ -684,6 +764,10 @@ def _run_chat_test(self, model_name: str, token_count: int):
         self.assertIn("content", responses[0])
         self.assertTrue(len(responses[0]["content"]) > 0, f"No content returned for model {model_name}")
         print(f"Model: {model_name}, Response: {responses[0]['content']}")
+        if 'reasoning' in responses[0]:
+            print(f"Reasoning ({responses[0].get('reasoning_effort', 'N/A')}): {responses[0]['reasoning']}")
+        if 'reasoning_tokens' in usage:
+            print(f"Reasoning tokens: {usage['reasoning_tokens']}")
 
     def test_o3_mini_model(self):
         self._run_chat_test("o3-mini", token_count=100)
@@ -694,9 +778,6 @@ def test_o4_mini_model(self):
     def test_gpt4o_model(self):
         self._run_chat_test("gpt-4o", token_count=100)
 
-    def test_o1_mini_model(self):
-        self._run_chat_test("o1-mini", token_count=600)
-
     def test_o1_model(self):
         self._run_chat_test("o1", token_count=300)
 
@@ -710,7 +791,13 @@ def test_bedrock_deepseek_model(self):
         self._run_chat_test("deepseek.r1-v1:0", token_count=300)
 
     def test_gpt5_mini_model(self):
-        self._run_chat_test("gpt-5-mini", token_count=100)
+        self._run_chat_test("gpt-5-mini", token_count=500)
+
+    def test_vllm_model(self):
+        # This test assumes a local vLLM server is running and accessible
+        os.environ["VLLM_API_KEY"] = "EMPTY"  # vLLM doesn't require a real key
+        os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:48000/v1"
+        self._run_chat_test("vllm:openai/gpt-oss-20b", token_count=100)
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()