diff --git a/pyproject.toml b/pyproject.toml
index 0733f20..a0a9cec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,7 +10,7 @@ packages = ["src/copra"]
 
 [project]
 name = "copra-theorem-prover"
-version = "1.5.0"
+version = "1.6.0"
 authors = [
   { name="Amitayush Thakur", email="amitayush@utexas.edu" },
 ]
diff --git a/src/copra/agent/dfs_policy_prompter.py b/src/copra/agent/dfs_policy_prompter.py
index d97cc5c..4e96301 100644
--- a/src/copra/agent/dfs_policy_prompter.py
+++ b/src/copra/agent/dfs_policy_prompter.py
@@ -10,7 +10,6 @@
 from copra.agent.rate_limiter import InvalidActionException
 from copra.agent.simple_policy_prompter import SimplePolicyPrompter
 from copra.agent.gpt_guided_tree_search_policy import PromptSummary, ProofQInfo, TreeSearchAction, TreeSearchActionType
-from copra.gpts.llama_access import ServiceDownError
 from copra.retrieval.coq_bm25_reranker import CoqBM25TrainingDataRetriever
 from copra.prompt_generator.gpt_request_grammar import CoqGPTRequestGrammar, CoqGptRequest, CoqGptRequestActions
 from copra.prompt_generator.dfs_agent_grammar import DfsAgentGrammar
@@ -327,10 +326,6 @@ def run_prompt(self, request: CoqGptResponse) -> list:
                     # don't change temperature for now
 
                 self._num_api_calls += 1
-            except ServiceDownError as e:
-                self.logger.info("Got a service down error. Will giveup until the docker container is restarted.")
-                self.logger.exception(e)
-                raise
             except Exception as e:
                 self.logger.info("Got an unknown exception. Retrying.")
                 self.logger.exception(e)
diff --git a/src/copra/agent/simple_policy_prompter.py b/src/copra/agent/simple_policy_prompter.py
index a7490c6..6fdc386 100644
--- a/src/copra/agent/simple_policy_prompter.py
+++ b/src/copra/agent/simple_policy_prompter.py
@@ -12,7 +12,6 @@
 import logging
 from copra.agent.rate_limiter import RateLimiter
 from copra.gpts.gpt_access import GptAccess
-from copra.gpts.llama_access import LlamaAccess, ServiceDownError
 from copra.prompt_generator.prompter import PolicyPrompter
 from copra.tools.misc import model_supports_openai_api, is_vllm_model
 
@@ -74,10 +73,7 @@ def __init__(
 
         # Initialize LLM access (GptAccess or LlamaAccess)
         # Note: vLLM models (with "vllm:" prefix) are handled by GptAccess
-        if not model_supports_openai_api(gpt_model_name):
-            self._gpt_access = LlamaAccess(gpt_model_name)
-        else:
-            self._gpt_access = GptAccess(secret_filepath=secret_filepath, model_name=gpt_model_name)
+        self._gpt_access = GptAccess(secret_filepath=secret_filepath, model_name=gpt_model_name)
 
         # Get model configuration
         # For vLLM models, use the generic "vllm" key in model_info
@@ -104,14 +100,11 @@ def __init__(
 
     def __enter__(self):
         """Context manager entry - initialize LLM service if needed."""
-        if isinstance(self._gpt_access, LlamaAccess):
-            self._gpt_access.__enter__()
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
         """Context manager exit - cleanup LLM service if needed."""
-        if isinstance(self._gpt_access, LlamaAccess):
-            self._gpt_access.__exit__(exc_type, exc_value, traceback)
+        pass
 
     def add_to_history(self, message: typing.Dict[str, str]):
         """
diff --git a/src/copra/baselines/gpt4/few_shot_policy_prompter.py b/src/copra/baselines/gpt4/few_shot_policy_prompter.py
index cc57324..4023b01 100644
--- a/src/copra/baselines/gpt4/few_shot_policy_prompter.py
+++ b/src/copra/baselines/gpt4/few_shot_policy_prompter.py
@@ -8,12 +8,10 @@
 from copra.agent.rate_limiter import RateLimiter, InvalidActionException
 from copra.agent.gpt_guided_tree_search_policy import TreeSearchAction
 from copra.gpts.gpt_access import GptAccess
-from copra.gpts.llama_access import LlamaAccess
 from itp_interface.rl.proof_action import ProofAction
 from copra.prompt_generator.prompter import PolicyPrompter
 from copra.prompt_generator.dfs_agent_grammar import DfsAgentGrammar
 from copra.baselines.gpt4.few_shot_grammar import FewShotGptRequest, FewShotGptRequestGrammar, FewShotGptResponse, FewShotGptResponseGrammar
-from copra.tools.misc import model_supports_openai_api
 
 class FewShotGptPolicyPrompter(PolicyPrompter):
     _cache: typing.Dict[str, typing.Any] = {}
@@ -43,10 +41,7 @@ def __init__(self,
         conv_messages = self.agent_grammar.get_openai_conv_messages(example_conv_prompt_path, "system")
         main_message = self.agent_grammar.get_openai_main_message(main_sys_prompt_path, "system")
         self.system_messages = [main_message] + conv_messages
-        if not model_supports_openai_api(gpt_model_name):
-            self._gpt_access = LlamaAccess(gpt_model_name)
-        else:
-            self._gpt_access = GptAccess(secret_filepath=secret_filepath, model_name=gpt_model_name)
+        self._gpt_access = GptAccess(secret_filepath=secret_filepath, model_name=gpt_model_name)
 
         # For vLLM models, use the generic "vllm" key for model info
         model_info_key = "vllm" if gpt_model_name.startswith("vllm:") else gpt_model_name
@@ -83,12 +78,10 @@ def __init__(self,
         pass
 
     def __enter__(self):
-        if isinstance(self._gpt_access, LlamaAccess):
-            self._gpt_access.__enter__()
+        return self  # no service to start anymore; return self so `with prompter as p:` binds the prompter (matches SimplePolicyPrompter)
     
     def __exit__(self, exc_type, exc_value, traceback):
-        if isinstance(self._gpt_access, LlamaAccess):
-            self._gpt_access.__exit__(exc_type, exc_value, traceback)
+        pass
 
     def _init_retriever(self):
         if FewShotGptPolicyPrompter._cache.get(self._training_data_path, None) is not None:
diff --git a/src/copra/baselines/gpt4/hammer_policy_prompter.py b/src/copra/baselines/gpt4/hammer_policy_prompter.py
index 679be23..1ae9ee1 100644
--- a/src/copra/baselines/gpt4/hammer_policy_prompter.py
+++ b/src/copra/baselines/gpt4/hammer_policy_prompter.py
@@ -6,10 +6,7 @@
 from copra.retrieval.coq_bm25_reranker import CoqBM25TrainingDataRetriever
 from copra.prompt_generator.agent_grammar import CoqGPTResponseGrammar
 from copra.prompt_generator.gpt_request_grammar import CoqGPTRequestGrammar, CoqGptRequestActions
-from copra.agent.rate_limiter import RateLimiter
 from copra.agent.gpt_guided_tree_search_policy import TreeSearchAction, TreeSearchActionType
-from copra.gpts.gpt_access import GptAccess
-from copra.gpts.llama_access import LlamaAccess
 from itp_interface.rl.proof_action import ProofAction
 from copra.prompt_generator.prompter import PolicyPrompter
 from copra.prompt_generator.dfs_agent_grammar import DfsAgentGrammar
diff --git a/src/copra/baselines/gpt4/informal_few_shot_policy_prompter.py b/src/copra/baselines/gpt4/informal_few_shot_policy_prompter.py
index 4f5b90f..1059783 100644
--- a/src/copra/baselines/gpt4/informal_few_shot_policy_prompter.py
+++ b/src/copra/baselines/gpt4/informal_few_shot_policy_prompter.py
@@ -8,7 +8,6 @@
 from copra.agent.rate_limiter import RateLimiter, InvalidActionException
 from copra.agent.gpt_guided_tree_search_policy import TreeSearchAction
 from copra.gpts.gpt_access import GptAccess
-from copra.gpts.llama_access import LlamaAccess
 from itp_interface.rl.proof_action import ProofAction
 from copra.prompt_generator.prompter import PolicyPrompter
 from copra.prompt_generator.dfs_agent_grammar import DfsAgentGrammar
@@ -44,10 +43,7 @@ def __init__(self,
         conv_messages = self.agent_grammar.get_openai_conv_messages(example_conv_prompt_path, "system")
         main_message = self.agent_grammar.get_openai_main_message(main_sys_prompt_path, "system")
         self.system_messages = [main_message] + conv_messages
-        if not model_supports_openai_api(gpt_model_name):
-            self._gpt_access = LlamaAccess(gpt_model_name)
-        else:
-            self._gpt_access = GptAccess(secret_filepath=secret_filepath, model_name=gpt_model_name)
+        self._gpt_access = GptAccess(secret_filepath=secret_filepath, model_name=gpt_model_name)
         self._token_limit_per_min = GptAccess.gpt_model_info[gpt_model_name]["token_limit_per_min"]
         self._request_limit_per_min = GptAccess.gpt_model_info[gpt_model_name]["request_limit_per_min"]
         self._max_token_per_prompt = GptAccess.gpt_model_info[gpt_model_name]["max_token_per_prompt"]
@@ -81,12 +77,10 @@ def __init__(self,
         pass
 
     def __enter__(self):
-        if isinstance(self._gpt_access, LlamaAccess):
-            self._gpt_access.__enter__()
+        return self  # no service to start anymore; return self so `with prompter as p:` binds the prompter (matches SimplePolicyPrompter)
     
     def __exit__(self, exc_type, exc_value, traceback):
-        if isinstance(self._gpt_access, LlamaAccess):
-            self._gpt_access.__exit__(exc_type, exc_value, traceback)
+        pass
 
     def _init_retriever(self):
         if InformalFewShotGptPolicyPrompter._cache.get(self._training_data_path, None) is not None:
diff --git a/src/copra/gpts/llama2_chat_format.py b/src/copra/gpts/llama2_chat_format.py
deleted file mode 100644
index 0ed3d1c..0000000
--- a/src/copra/gpts/llama2_chat_format.py
+++ /dev/null
@@ -1,227 +0,0 @@
-#!/usr/bin/env python3
-
-class Llama2FormatChat(object):
-    """
-        prompt  = f"<<SYS>>\\n{system}\\n<</SYS>>\\n\\n{user_1}"
-        prompt += f"<s>[INST] {prompt.strip()} [/INST] {answer_1.strip()} </s>"
-        prompt += f"<s>[INST] {user_2.strip()} [/INST] {answer_2.strip()} </s>"
-        prompt += f"<s>[INST] {user_3.strip()} [/INST]"
-    """
-    start_token = "<s>"
-    end_token = "</s>"
-    sys_start = "<<SYS>>"
-    sys_end = "<</SYS>>"
-    inst_start = "[INST]"
-    inst_end = "[/INST]"
-    def __init__(self):
-        pass
-
-    def __call__(self, messages) -> str:
-        """
-        messages are of the form
-            messages = [
-                {
-                    "role": "system",
-                    "content": "....",
-                },
-                {
-                    "role": "system",
-                    "name": "example_user",
-                    "content": "....",
-                },
-                {
-                    "role": "system",
-                    "name": "example_assistant",
-                    "content": "....",
-                },
-                {
-                    "role": "system",
-                    "name": "example_user",
-                    "content": "....",
-                },
-                {
-                    "role": "system",
-                    "name": "example_assistant",
-                    "content": "....",
-                },
-                {
-                    "role": "user",
-                    "content": "",
-                }
-            ]
-        """
-        # Collect all the system messages
-        system_messages = []
-        user_messages = []
-        assistant_messages = []
-        role_names = set()
-        for message in messages:
-            if message["role"] == "system":
-                system_messages.append(message)
-                if "name" in message:
-                    role_names.add(message["name"])
-            elif message["role"] == "user":
-                user_messages.append(message)
-                if "name" in message:
-                    role_names.add(message["name"])
-            elif message["role"] == "assistant":
-                assistant_messages.append(message)
-                if "name" in message:
-                    role_names.add(message["name"])
-            else:
-                raise ValueError(f"Unknown role: {message['role']}")
-        sys_prompt = self._format_system_messages(system_messages)
-        user_prompt = self._format_user_assistant_messages(user_messages, assistant_messages)
-        prompt = \
-f"""{sys_prompt}
-
-{user_prompt}"""
-        return prompt, role_names
-
-    def _format_system_messages(self, system_message):
-        """
-        system_message is of the form
-        """
-        main_system_messages = [sys_msg["content"] for sys_msg in system_message if "name" not in sys_msg]
-        main_system_message = "\n".join(main_system_messages)
-        example_messages = [(sys_msg["name"], sys_msg["content"]) for sys_msg in system_message if "name" in sys_msg]
-        example_messages = [f"`{name}`:\n{msg}" for name, msg in example_messages]
-        if len(example_messages) > 0:
-            example_message = "\n".join(example_messages)
-        else:
-            example_message = ""
-        if len(example_messages) > 0:
-            system_message = \
-f"""{main_system_message}
-
-An example of user and assistant interaction is as follows:
-{example_message}"""
-        else:
-            system_message = \
-f"""{main_system_message}"""
-        system_prompt = \
-f"""{Llama2FormatChat.start_token}{Llama2FormatChat.inst_start} {Llama2FormatChat.sys_start}
-{system_message}
-{Llama2FormatChat.sys_end}"""
-        return system_prompt
-    
-    def _format_user_assistant_messages(self, user_messages, assistant_messages):
-        """
-        user_messages is of the form
-        """
-        user_messages = [user_msg["content"] for user_msg in user_messages]
-        user_messages = [f"{user_msg} {Llama2FormatChat.inst_end} " for user_msg in user_messages]
-        assistant_messages = [assistant_msg["content"] for assistant_msg in assistant_messages]
-        assistant_messages = [f"{assistant_msg} {Llama2FormatChat.end_token}{Llama2FormatChat.start_token}{Llama2FormatChat.inst_start} " for assistant_msg in assistant_messages]
-        # Combine the messages one after the other
-        messages = []
-        idx = 0
-        while idx < len(user_messages) or idx < len(assistant_messages):
-            if idx < len(user_messages):
-                messages.append(user_messages[idx])
-            if idx < len(assistant_messages):
-                messages.append(assistant_messages[idx])
-            idx += 1
-        message = "".join(messages)
-        return message
-    
-if __name__ == "__main__":
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
-        },
-        {
-            "role": "system",
-            "name": "example_user",
-            "content": "New synergies will help drive top-line growth.",
-        },
-        {
-            "role": "system",
-            "name": "example_assistant",
-            "content": "Things working well together will increase revenue.",
-        },
-        {
-            "role": "system",
-            "name": "example_user",
-            "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage.",
-        },
-        {
-            "role": "system",
-            "name": "example_assistant",
-            "content": "Let's talk later when we're less busy about how to do better.",
-        },
-        {
-            "role": "system",
-            "name": "example_user",
-            "content": "This late pivot means we don't have time to boil the ocean for the client deliverable.",
-        },
-        {
-            "role": "system",
-            "name": "example_assistant", 
-            "content": "Our idea seems to be scooped, don't know how to change direction now."
-        },
-        {
-            "role": "user",
-            "content": "We changed the direction of the project, but we don't have time to do it.",
-        },
-        {
-            "role": "assistant",
-            "content": "Too many changes do not have time to do it.",
-        },
-        {
-            "role": "user",
-            "content": "The pot is boiling, probably the water will spill.",
-        }
-    ]
-    llama2_format_chat = Llama2FormatChat()
-    prompt, role_names = llama2_format_chat(messages)
-    print(prompt)
-    print('='*50)
-    print(role_names)
-    print('-'*100)
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
-        },
-        {
-            "role": "system",
-            "name": "example_user",
-            "content": "New synergies will help drive top-line growth.",
-        },
-        {
-            "role": "system",
-            "name": "example_assistant",
-            "content": "Things working well together will increase revenue.",
-        },
-        {
-            "role": "system",
-            "name": "example_user",
-            "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage.",
-        },
-        {
-            "role": "system",
-            "name": "example_assistant",
-            "content": "Let's talk later when we're less busy about how to do better.",
-        },
-        {
-            "role": "system",
-            "name": "example_user",
-            "content": "This late pivot means we don't have time to boil the ocean for the client deliverable.",
-        },
-        {
-            "role": "system",
-            "name": "example_assistant", 
-            "content": "Our idea seems to be scooped, don't know how to change direction now."
-        },
-        {
-            "role": "user",
-            "content": "We changed the direction of the project, but we don't have time to do it.",
-        }
-    ]
-    llama2_format_chat = Llama2FormatChat()
-    prompt, role_names = llama2_format_chat(messages)
-    print(prompt)
-    print('='*50)
-    print(role_names)
\ No newline at end of file
diff --git a/src/copra/gpts/llama_access.py b/src/copra/gpts/llama_access.py
deleted file mode 100644
index 24311d3..0000000
--- a/src/copra/gpts/llama_access.py
+++ /dev/null
@@ -1,329 +0,0 @@
-#!/usr/bin/env python3
-
-import time
-import typing
-import os
-import random
-import logging
-import threading
-# from litellm import token_counter
-from subprocess import Popen, PIPE, STDOUT
-from copra.gpts.gpt_access import GptAccess
-from copra.gpts.llama2_chat_format import Llama2FormatChat
-from huggingface_hub import InferenceClient
-
-class ServiceDownError(Exception):
-    pass
-
-class LlamaAccess(GptAccess):
-    # Use this https://huggingface.co/blog/codellama#conversational-instructions for formatting instructions
-    """
-    This is not thread safe"""
-    process = None
-    model_name = None
-    debug = False
-    random_suffix = random.randint(0, 10**16)
-    models_supported_name = ['codellama/CodeLlama-7b-Instruct-hf', 'EleutherAI/llemma_7b', 'morph-labs/morph-prover-v0-7b']
-    logger : logging.Logger = None
-    port = 8080
-    docker_exit_signal = False
-    litellm_exit_signal = False
-    docker_logging_thread = None
-    litellm_logging_thread = None
-    def __init__(self, model_name: str | None = None, temperature = 0.0) -> None:
-        assert model_name == LlamaAccess.model_name or model_name is None, "Model name must be initialized before use"
-        assert LlamaAccess.process is not None, "LlamaAccess class must be initialized before use"
-        self.secret_filepath = None
-        self.model_name = model_name if model_name is not None else LlamaAccess.models_supported_name[0]
-        self.temperature = temperature
-        self.usage = {
-            "prompt_tokens": 0,
-            "completion_tokens": 0,
-            "total_tokens": 0
-        }
-        self.is_open_ai_model = False
-        self._llama2_format_chat = Llama2FormatChat()
-
-    def __enter__(self):
-        self.model_name = f"huggingface/{self.model_name}"
-        self.interface = InferenceClient(model=f"http://localhost:{LlamaAccess.port}")
-        return self
-    
-    def __exit__(self, exc_type, exc_value, traceback):
-        # self.kill() # only kill the service if this is the last object
-        pass
-    
-    def _get_docker_container_name(model_name: str) -> str:
-        return model_name.replace("/","-") + f"-{LlamaAccess.random_suffix}"
-
-    def _get_docker_container_id(model_name: str) -> str:
-        return os.popen(f"docker ps -q --filter='NAME={LlamaAccess._get_docker_container_name(model_name)}'").read().strip()
-
-    def _check_if_docker_running() -> bool:
-        try:
-            return LlamaAccess._get_docker_container_id(LlamaAccess.model_name) != ""
-        except:
-            return False
-
-    def class_init(model_name: str = None, temperature = 0.0, port = 8080, debug = False, logger: logging.Logger = None):            
-        if model_name is None:
-            model_name = LlamaAccess.models_supported_name[0]
-        elif model_name is not None:
-            assert model_name in LlamaAccess.models_supported_name, f"Model name {model_name} not supported"
-        # Check if docker is running
-        if LlamaAccess.model_name is None:
-            LlamaAccess.model_name = model_name
-            LlamaAccess.debug = debug
-            LlamaAccess.logger = logger if logger is not None else logging.getLogger(__name__)
-            LlamaAccess.docker_exit_signal = False
-            LlamaAccess.litellm_exit_signal = False
-            LlamaAccess.docker_logging_thread = None
-            LlamaAccess.port = port
-        if not LlamaAccess._check_if_docker_running():
-            LlamaAccess._start_service(model_name, temperature, port, debug)
-        pass
-
-    def class_kill():
-        LlamaAccess.kill()
-    
-    def _docker_service_logs():
-        try:
-            while not LlamaAccess.docker_exit_signal:
-                line = LlamaAccess.process.stdout.readline().strip()
-                if line:
-                    LlamaAccess.logger.info(f'Docker:\n {line}')
-                else:
-                    # sleep for a bit to avoid busy waiting
-                    time.sleep(0.02)
-        except:
-            pass
-
-    def _start_service(model_name: str, temperature = 0.0, port = 8080, debug = False) -> None:
-        # Change the openai.api_key to the llama api key
-        # Start the docker container for llama TGI
-        docker_container_name = LlamaAccess._get_docker_container_name(model_name)
-        cuda_visible_devices = os.popen('echo $CUDA_VISIBLE_DEVICES').read().strip()
-        if cuda_visible_devices == '':
-            cuda_visible_devices = '0'
-        cmd = f'sh src/gpts/start_llama.sh {docker_container_name} {model_name} {port}'
-        LlamaAccess.process = Popen(
-            cmd, 
-            shell = True, 
-            stdin = PIPE, 
-            stdout = PIPE, 
-            stderr = STDOUT,
-            cwd = root_dir, 
-            bufsize = 1, 
-            universal_newlines = True)
-        exit_wait = False
-        start_time = time.time()
-        retry = 3
-        while not exit_wait and retry > 0:
-            line = LlamaAccess.process.stdout.readline().strip()
-            if line:
-                LlamaAccess.logger.info(f'Docker:\n {line}')
-                if "Error" in line or "error" in line:
-                    LlamaAccess.process.kill()
-                    raise Exception(f'Failed to start docker container {docker_container_name}, because of error: \n{line}')
-                if line.endswith('Connected'):
-                    time.sleep(1)
-                    exit_wait = True
-            else:
-                # sleep for a bit to avoid busy waiting
-                time.sleep(0.02)
-            end_time = time.time()
-            if end_time - start_time > 400:
-                LlamaAccess.process.kill()
-                LlamaAccess.kill()
-                LlamaAccess.process = Popen(
-                    cmd, 
-                    shell = True, 
-                    stdin = PIPE, 
-                    stdout = PIPE, 
-                    stderr = STDOUT,
-                    cwd = root_dir, 
-                    bufsize = 1, 
-                    universal_newlines = True)
-                exit_wait = False
-                start_time = time.time()
-                retry -= 1
-        
-        # Start the docker logging thread
-        LlamaAccess.docker_exit_signal = False
-        LlamaAccess.docker_logging_thread = threading.Thread(target=LlamaAccess._docker_service_logs)
-        LlamaAccess.docker_logging_thread.start()
-
-    def token_counter(self, model_name: str, text: typing.List[str]) -> int:
-        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        text = " ".join([message["content"] for message in messages])
-        enc = tokenizer.encode(text)
-        num_tokens = len(enc.ids)
-        return num_tokens
-
-    def num_tokens_from_messages(self, messages, model=None):
-        model = model if model is not None else self.model_name
-        num_tokens = self.token_counter(model, messages=messages)
-        return num_tokens
-    
-    def complete_chat(self,
-        messages: typing.List[str],
-        model: typing.Optional[str] = None,
-        n: int = 1,
-        max_tokens: int = 5,
-        temperature: float = 0.25,
-        top_p: float = 1.0,
-        frequency_penalty: float = 0.0,
-        presence_penalty: float = 0.0,
-        stop: list = ["\n"]) -> typing.Tuple[list, dict]:
-        temperature = None if temperature == 0.0 else temperature
-        top_p = None if top_p == 1.0 else top_p
-        try:
-            outputs = []
-            prompt_tokens = self.num_tokens_from_messages(messages)
-            completion_tokens = 0
-            prompt, role_names = self._llama2_format_chat(messages)
-            # LlamaAccess.logger.debug(f"Prompt Received:\n{prompt}")
-            for i in range(n):
-                output = self.interface.text_generation(
-                    prompt=prompt,
-                    details=True,
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    stop_sequences=stop,
-                    do_sample=i>0,
-                )
-                generated_text = output.generated_text
-                finish_reason = output.details.finish_reason
-                # LlamaAccess.logger.debug(f"Generated Text:\n{generated_text}")
-                if finish_reason.value == "stop_sequence":
-                    finish_reason = "stop"
-                else:
-                    finish_reason = finish_reason.value
-                if finish_reason == "stop":
-                    # Remove the stop token
-                    for stop_token in stop:
-                        if generated_text.endswith(stop_token):
-                            generated_text = generated_text[:generated_text.rfind(stop_token)]
-                            break
-                generated_text = generated_text.strip()
-                for role_name in role_names:
-                    if generated_text.startswith(role_name):
-                        generated_text = generated_text[len(role_name):].strip()
-                        break
-                    elif generated_text.startswith(f"`{role_name}`"):
-                        generated_text = generated_text[len(f"`{role_name}`:"):].strip()
-                        break
-                outputs.append({'role': 'assistant', 'content': generated_text, 'finish_reason': finish_reason})
-                completion_tokens += output.details.generated_tokens
-            total_tokens = prompt_tokens + completion_tokens
-            usage = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": total_tokens,
-                "reason": outputs[-1]["finish_reason"] if len(outputs) > 0 else "stop"
-            }
-            self.usage['prompt_tokens'] += prompt_tokens
-            self.usage['completion_tokens'] += completion_tokens
-            self.usage['total_tokens'] += total_tokens
-            return outputs, usage
-        except:
-            if not LlamaAccess._check_if_docker_running():
-                raise ServiceDownError("Docker is shut down, restart the service")
-            else:
-                raise
-
-    def kill():
-        LlamaAccess.logger.info("Killing the docker processes")
-        docker_container_name = LlamaAccess._get_docker_container_name(LlamaAccess.model_name)
-        if LlamaAccess._check_if_docker_running():
-            docker_name = os.popen(f"docker stop {docker_container_name}").read().strip()
-            assert docker_name == docker_container_name, f"Failed to stop container {docker_container_name}"
-            time.sleep(2)
-            LlamaAccess.logger.info(f"Docker Container {docker_container_name} stopped")
-        else:
-            LlamaAccess.logger.info(f"Docker Container {docker_container_name} already stopped")
-        # Remove the docker container
-        docker_name = os.popen(f"docker rm {docker_container_name}").read().strip()
-        time.sleep(2)
-        if docker_name == '':
-            LlamaAccess.logger.info(f"Docker Container {docker_container_name} already removed")
-        else:
-            assert docker_name == docker_container_name, f"Failed to remove container {LlamaAccess._get_docker_container_name(LlamaAccess.model_name)}"
-            LlamaAccess.logger.info(f"Docker Container {docker_container_name} removed")
-        try:
-            LlamaAccess.process.kill()
-        except:
-            pass
-        time.sleep(2)
-        # Stop logging threads
-        LlamaAccess.docker_exit_signal = True
-        LlamaAccess.litellm_exit_signal = True
-        LlamaAccess.docker_logging_thread.join()
-        LlamaAccess.logger.info("Docker processes killed and logging threads stopped")
-        try:
-            LlamaAccess.process.stdin.close()
-        except:
-            pass
-        time.sleep(2)
-        try:
-            LlamaAccess.process.stdout.close()
-        except:
-            pass
-        time.sleep(2)
-        LlamaAccess.logger.info("Docker stdin and stdout closed")
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger(__name__)
-    logger.log(logging.INFO, "Testing LlamaAccess")
-    LlamaAccess.class_init(port=10005, logger=logger)
-    try:
-        with LlamaAccess() as llama:
-            messages = [
-                {
-                    "role": "system",
-                    "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
-                },
-                {
-                    "role": "system",
-                    "name": "example_user",
-                    "content": "New synergies will help drive top-line growth.",
-                },
-                {
-                    "role": "system",
-                    "name": "example_assistant",
-                    "content": "Things working well together will increase revenue.",
-                },
-                {
-                    "role": "system",
-                    "name": "example_user",
-                    "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage.",
-                },
-                {
-                    "role": "system",
-                    "name": "example_assistant",
-                    "content": "Let's talk later when we're less busy about how to do better.",
-                },
-                {
-                    "role": "system",
-                    "name": "example_user",
-                    "content": "This late pivot means we don't have time to boil the ocean for the client deliverable.",
-                },
-                {
-                    "role": "system",
-                    "name": "example_assistant", 
-                    "content": "Our idea seems to be scooped, don't know how to change direction now."
-                },
-                {
-                    "role": "user",
-                    "content": "We changed the direction of the project, but we don't have time to do it.",
-                }
-            ]
-            messages = [messages[0]] + [messages[1+(i%len(messages[1:-1]))] for i in range(300)] + [messages[-1]]
-            print(llama.num_tokens_from_messages(messages))
-            print("Will call complete_chat soon")
-            #time.sleep(30)
-            print(llama.complete_chat(messages, max_tokens=50, n=2, temperature=0.0, stop=['.']))
-    finally:
-        LlamaAccess.class_kill()
\ No newline at end of file
diff --git a/src/copra/gpts/start_llama.sh b/src/copra/gpts/start_llama.sh
deleted file mode 100644
index 3ae2412..0000000
--- a/src/copra/gpts/start_llama.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-name=codellama_tgi
-model=codellama/CodeLlama-7b-Instruct-hf #tiiuae/falcon-7b-instruct
-port=8080
-cuda="0"
-volume=$PWD/.log/tgi/models
-# Change container name by container name argument
-if [ $# -eq 1 ]; then
-    name=$1
-fi
-# Change model by model name argument
-if [ $# -eq 2 ]; then
-    name=$1
-    model=$2
-fi
-# Change port by port argument
-if [ $# -eq 3 ]; then
-    name=$1
-    model=$2
-    port=$3
-fi
-# Change cuda by cuda argument
-if [ $# -eq 4 ]; then
-    name=$1
-    model=$2
-    port=$3
-    cuda=$4
-fi
-# Change volume by volume argument
-if [ $# -eq 5 ]; then
-    name=$1
-    model=$2
-    port=$3
-    cuda=$4
-    volume=$5
-fi
-# Check if volume exists
-if [ ! -d $volume ]; then
-    # Raise error if volume does not exist
-    echo "Volume $volume does not exist"
-    exit 1
-fi
-echo "Starting $name with model $model and volume $volume"
-docker run --env CUDA_VISIBLE_DEVICES=$cuda --name $name --gpus all --shm-size 1g -p $port:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.1.0 --model-id $model --max-input-length 14000 --max-total-tokens 16384 --max-batch-prefill-tokens 14000
\ No newline at end of file
diff --git a/src/copra/main/eval_benchmark.py b/src/copra/main/eval_benchmark.py
index 7c00f24..07a785d 100644
--- a/src/copra/main/eval_benchmark.py
+++ b/src/copra/main/eval_benchmark.py
@@ -17,7 +17,6 @@
 import math
 import typing
 
-from copra.gpts.llama_access import LlamaAccess, ServiceDownError
 from copra.main.config import (
     EnvSettings, EvalBenchmark, EvalDataset, EvalProofResults, EvalSettings,
     Experiments, EvalRunCheckpointInfo, PromptSettings, parse_config
@@ -118,19 +117,6 @@ def _initialize_services(
             logger.error(f"Failed to start vLLM server: {e}")
             raise
 
-    # Initialize Llama service if using non-OpenAI model (deprecated, prefer vLLM)
-    elif eval_settings.gpt_model_name is not None and \
-       len(eval_settings.gpt_model_name) != 0 and \
-       not model_supports_openai_api(eval_settings.gpt_model_name):
-        llama_logger = setup_logger(
-            __name__ + "_llama",
-            os.path.join(eval_checkpoint_info.logging_dirs[-1], "llama.log"),
-            logging.INFO,
-            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-        )
-        LlamaAccess.class_init(eval_settings.gpt_model_name, eval_settings.temperature,
-                              debug=False, logger=llama_logger)
-
     # Initialize Isabelle service if needed
     if eval_benchmark.language == ProofAction.Language.ISABELLE:
         isabelle_logger = setup_logger(
@@ -174,12 +160,6 @@ def _shutdown_services(
             _vllm_server_process = None
             logging.getLogger(__name__).info("vLLM server stopped")
 
-    # Shutdown Llama service if it was initialized
-    elif eval_settings.gpt_model_name is not None and \
-       len(eval_settings.gpt_model_name) != 0 and \
-       not model_supports_openai_api(eval_settings.gpt_model_name):
-        LlamaAccess.class_kill()
-
     # Shutdown Isabelle service if it was initialized
     if eval_benchmark.language == ProofAction.Language.ISABELLE:
         IsabelleExecutor.stop_server()
@@ -511,18 +491,6 @@ def _process_lemma(
                 _handle_proof_timeout(lemma_name, path, elapsed_time, no_proof_res, checkpoint_manager, attempt_idx)
                 logger.info(f"Dumping proof search result:\nProof FAILED for lemma: {lemma_name}\n")
                 should_retry = False
-            elif return_dict.get("service_down", False):
-                # Retry for service down
-                should_retry = True
-                logger.info("Killing the llama process")
-                LlamaAccess.class_kill()
-                logger.info("Killed the llama process")
-                logger.info("Restarting the llama process")
-                # Get llama logger from logs
-                llama_logger = logging.getLogger(__name__ + "_llama")
-                LlamaAccess.class_init(eval_settings.gpt_model_name,
-                                      eval_settings.temperature, debug=False, logger=llama_logger)
-                logger.info("Restarted the llama process")
 
         else:
             # Success case
diff --git a/src/copra/main/proof_execution.py b/src/copra/main/proof_execution.py
index 306eb6a..c0456ad 100644
--- a/src/copra/main/proof_execution.py
+++ b/src/copra/main/proof_execution.py
@@ -13,7 +13,6 @@
 from typing import Dict, Any
 
 from copra.main.parallel_execution import get_executor
-from copra.gpts.llama_access import ServiceDownError
 from copra.agent.dfs_tree_search_with_stack import DFSTreeSearch
 from copra.agent.gpt_guided_tree_search_policy import GptGuidedTreeSearchPolicy
 from copra.agent.simple_proof_agent import ProofAgent
@@ -178,10 +177,6 @@ def _run_prover_wrapper(
             ret_dict["proof_res"] = proof_res
             ret_dict["attempted_success"] = True
             ret_dict["service_down"] = False
-    except ServiceDownError:
-        subprocess_logger.exception(f"ServiceDownError occurred while proving lemma: {lemma_name} in file {path}")
-        ret_dict["attempted_success"] = False
-        ret_dict["service_down"] = True
     except Exception:
         subprocess_logger.exception(f"Exception occurred while proving lemma: {lemma_name} in file {path}")
         ret_dict["attempted_success"] = False