From ffa6b241be1bd9a499335f5ed541951af5db76fd Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Tue, 28 Oct 2025 22:09:59 +0000
Subject: [PATCH] Add timeouts to queries, clean up Dockerfiles, and make FastVideo Gradio query async

Co-authored-by: rkn <rkn@anyscale.com>
---
 02_service_hello_world/query.py                    |  3 ++-
 03_deploy_llama_3_8b/Dockerfile                    |  5 +++--
 deploy_llama_3_1_70b/Dockerfile                    |  5 +++--
 serve_tensor_parallel/query.py                     |  3 ++-
 skyrl/Dockerfile                                   |  7 ++++---
 video_generation_with_fastvideo/serve_fastvideo.py | 14 ++++----------
 6 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/02_service_hello_world/query.py b/02_service_hello_world/query.py
index 905ea58..185631a 100644
--- a/02_service_hello_world/query.py
+++ b/02_service_hello_world/query.py
@@ -13,6 +13,7 @@
 resp = requests.get(
     urljoin(base_url, "hello"),
     params={"name": "Theodore"},
-    headers={"Authorization": f"Bearer {token}"})
+    headers={"Authorization": f"Bearer {token}"},
+    timeout=10)
 
 print(resp.text)
\ No newline at end of file
diff --git a/03_deploy_llama_3_8b/Dockerfile b/03_deploy_llama_3_8b/Dockerfile
index 7c4782e..21a72dc 100644
--- a/03_deploy_llama_3_8b/Dockerfile
+++ b/03_deploy_llama_3_8b/Dockerfile
@@ -2,7 +2,8 @@ FROM anyscale/ray:2.49.0-slim-py312-cu128
 
 # C compiler for Triton’s runtime build step (vLLM V1 engine)
 # https://github.com/vllm-project/vllm/issues/2997
-RUN sudo apt-get update && \
-    sudo apt-get install -y --no-install-recommends build-essential
+RUN sudo apt-get update && \
+    sudo apt-get install -y --no-install-recommends build-essential && \
+    sudo rm -rf /var/lib/apt/lists/*
 
 RUN pip install vllm==0.10.0
diff --git a/deploy_llama_3_1_70b/Dockerfile b/deploy_llama_3_1_70b/Dockerfile
index ea0023c..ce9f579 100644
--- a/deploy_llama_3_1_70b/Dockerfile
+++ b/deploy_llama_3_1_70b/Dockerfile
@@ -2,7 +2,8 @@ FROM anyscale/ray:2.50.0-slim-py312-cu128
 
 # C compiler for Triton’s runtime build step (vLLM V1 engine)
 # https://github.com/vllm-project/vllm/issues/2997
-RUN sudo apt-get update && \
-    sudo apt-get install -y --no-install-recommends build-essential
+RUN sudo apt-get update && \
+    sudo apt-get install -y --no-install-recommends build-essential && \
+    sudo rm -rf /var/lib/apt/lists/*
 
 RUN pip install vllm==0.11.0
diff --git a/serve_tensor_parallel/query.py b/serve_tensor_parallel/query.py
index 36b9a9a..49f671c 100644
--- a/serve_tensor_parallel/query.py
+++ b/serve_tensor_parallel/query.py
@@ -12,6 +12,7 @@
 resp = requests.get(
     urljoin(base_url, "infer"),
     params={"text": "What is the future of AI? "},
-    headers={"Authorization": f"Bearer {token}"})
+    headers={"Authorization": f"Bearer {token}"},
+    timeout=15)
 
 print(resp.text)
diff --git a/skyrl/Dockerfile b/skyrl/Dockerfile
index 0cd352e..e556dc8 100644
--- a/skyrl/Dockerfile
+++ b/skyrl/Dockerfile
@@ -1,8 +1,9 @@
 FROM anyscale/ray:2.48.0-slim-py312-cu128
 
-RUN sudo apt-get update -y \
-    && sudo apt-get install --no-install-recommends -y build-essential libnuma-dev \
-    && sudo rm -f /etc/apt/sources.list.d/*
+RUN sudo apt-get update -y \
+    && sudo apt-get install --no-install-recommends -y build-essential libnuma-dev \
+    && sudo rm -f /etc/apt/sources.list.d/* \
+    && sudo rm -rf /var/lib/apt/lists/*
 
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh
 
diff --git a/video_generation_with_fastvideo/serve_fastvideo.py b/video_generation_with_fastvideo/serve_fastvideo.py
index c3cd997..ddc742a 100644
--- a/video_generation_with_fastvideo/serve_fastvideo.py
+++ b/video_generation_with_fastvideo/serve_fastvideo.py
@@ -1,4 +1,3 @@
-import asyncio
 from starlette.requests import Request
 from ray import serve
 from ray.serve._private.http_util import ASGIAppReplicaWrapper
@@ -17,13 +16,8 @@
 
 
 def gradio_builder(generator: serve.handle.DeploymentHandle):
-    def query_model(prompt, num_inference_steps):
-
-        async def run_query_model(prompt, num_inference_steps):
-            video_base64 = await generator.generate.remote(prompt, num_inference_steps)
-            return video_base64
-
-        video_base64 = asyncio.run(run_query_model(prompt, num_inference_steps))
+    async def query_model(prompt, num_inference_steps):
+        video_base64 = await generator.generate.remote(prompt, num_inference_steps)
         video_bytes = base64.b64decode(video_base64)
         video_filename = f"{uuid.uuid4()}.mp4"
         video_path = os.path.join(output_dir, video_filename)
@@ -93,7 +87,7 @@ def __init__(self):
             num_gpus=1,  # Adjust based on your hardware
         )
 
-    def generate(self, prompt: str, num_inference_steps: int = 3) -> bytes:
+    def generate(self, prompt: str, num_inference_steps: int = 3) -> str:
         # Generate the video.
         video = self.generator.generate_video(
             prompt,
@@ -110,7 +104,7 @@ def generate(self, prompt: str, num_inference_steps: int = 3) -> bytes:
 
         return video_base64
 
-    async def __call__(self, http_request: Request) -> bytes:
+    async def __call__(self, http_request: Request) -> str:
         data = await http_request.json()
         prompt = data["prompt"]
         num_inference_steps = data["num_inference_steps"]