From 5186e645d29b65d09866e0c908a8f856dea49c45 Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Fri, 29 May 2026 23:52:34 +0800
Subject: [PATCH 1/5] Pin vllm-rs install ref

---
 install.sh | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/install.sh b/install.sh
index 36ff7b5c..11905594 100755
--- a/install.sh
+++ b/install.sh
@@ -13,7 +13,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 EXTRAS="${PARALLAX_EXTRAS:-}"
 PYTHON_VERSION="${PARALLAX_PYTHON_VERSION:-3.12}"
 VENV_DIR="$SCRIPT_DIR/.venv"
-VLLM_REF="${VLLM_REF:-main}"
+VLLM_REF="${VLLM_REF:-v0.22.0}"
 
 show_help() {
     cat <<'EOF'
@@ -30,7 +30,7 @@ Options:
 Environment:
   PARALLAX_EXTRAS         Same as --extras.
   PARALLAX_PYTHON_VERSION Same as --python.
-  VLLM_REF                vLLM git branch/tag to clone. Defaults to main.
+  VLLM_REF                vLLM git branch/tag to clone. Defaults to v0.22.0.
 EOF
 }
 
@@ -218,15 +218,28 @@ build_vllm_rust_frontend() {
     local rust_dir
     local parallax_scripts_dir
     local target_path
+    local target_version_path
+    local existing_version
     local toolchain
 
     parallax_scripts_dir="$(resolve_venv_bin_dir)"
     target_path="$parallax_scripts_dir/vllm-rs"
+    target_version_path="$target_path.version"
 
     if [[ -f "$target_path" ]]; then
-        chmod +x "$target_path"
-        echo "vllm-rs already exists at $target_path, skipping Rust build."
-        return
+        existing_version=""
+        if [[ -f "$target_version_path" ]]; then
+            existing_version="$(<"$target_version_path")"
+        fi
+        if [[ "$existing_version" != "$VLLM_REF" ]]; then
+            echo "Existing vllm-rs version (${existing_version:-unknown}) does not match $VLLM_REF, rebuilding."
+            rm -f "$target_path" "$target_version_path"
+        else
+            chmod +x "$target_path"
+            printf '%s\n' "$VLLM_REF" > "$target_version_path"
+            echo "vllm-rs already exists at $target_path, skipping Rust build."
+            return
+        fi
     fi
 
     CLONE_PARENT="$(mktemp -d "${TMPDIR:-/tmp}/parallax-vllm-rs.XXXXXX")"
@@ -264,6 +277,7 @@ build_vllm_rust_frontend() {
     mkdir -p "$(dirname "$target_path")"
     cp "$rust_dir/target/release/vllm-rs" "$target_path"
     chmod +x "$target_path"
+    printf '%s\n' "$VLLM_REF" > "$target_version_path"
     echo "Installed vllm-rs to $target_path"
     cleanup_clone
     trap - EXIT

From 4a0c9c84f0eb99fd1e4c18f60605733eca5439db Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Fri, 29 May 2026 23:56:00 +0800
Subject: [PATCH 2/5] Report decode step stats

---
 scripts/generate.py | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/scripts/generate.py b/scripts/generate.py
index fefbb46f..0348a205 100644
--- a/scripts/generate.py
+++ b/scripts/generate.py
@@ -22,6 +22,7 @@
 """
 
 import argparse
+import statistics
 import time
 
 import mlx.core as mx
@@ -80,6 +81,21 @@ def build_prompt(messages, tokenizer):
     return full_prompt, prompt_tokens
 
 
+def percentile(values, percentile_value):
+    """Return the linearly interpolated percentile (0-100) of values; 0 if empty."""
+    if not 0 <= percentile_value <= 100:
+        raise ValueError(f"percentile_value must be in [0, 100], got {percentile_value}")
+    if not values:
+        return 0
+    sorted_values = sorted(values)
+    # Fractional rank over indices 0..n-1; one sample yields rank 0, i.e. that sample.
+    rank = (len(sorted_values) - 1) * percentile_value / 100
+    lower = int(rank)
+    upper = min(lower + 1, len(sorted_values) - 1)
+    weight = rank - lower
+    return sorted_values[lower] * (1 - weight) + sorted_values[upper] * weight
+
+
 def main():
     parser = argparse.ArgumentParser(description="Simple offline inference script")
     parser.add_argument(
@@ -226,7 +242,7 @@ def main():
     print_rank(f"Token 1 (Prefill) time: {prefill_time * 1000:.2f} ms")
 
     # 5. Decode Loop
-    total_decode_time = 0
+    decode_step_times = []
     for i in range(args.max_tokens - 1):
         if is_finished:
             break
@@ -260,13 +276,14 @@ def main():
         request.commit_new_token(token_id)
 
         decode_step_time = time.perf_counter() - decode_step_start
-        total_decode_time += decode_step_time
+        decode_step_times.append(decode_step_time)
         print_rank(f"Token {i + 2} time: {decode_step_time * 1000:.2f} ms")
 
     print_rank("\nGenerated Content:")
     print_rank(tokenizer.decode(request.output_ids))
 
     # Summary Statistics
+    total_decode_time = sum(decode_step_times)
     prompt_tps = request.prompt_len / prefill_time
     generation_tps = len(request.output_ids) / total_decode_time if total_decode_time > 0 else 0
     peak_mem = mx.get_peak_memory() / 1024**3
@@ -274,6 +291,22 @@ def main():
     print_rank("-" * 20)
     print_rank(f"Prompt: {request.prompt_len} tokens, {prompt_tps:.3f} tokens-per-sec")
     print_rank(f"Generation: {len(request.output_ids)} tokens, {generation_tps:.3f} tokens-per-sec")
+    if decode_step_times:
+        decode_step_times_ms = [step_time * 1000 for step_time in decode_step_times]
+        decode_step_time_mean = sum(decode_step_times_ms) / len(decode_step_times_ms)
+        print_rank(
+            "Decode step time (ms): "
+            f"min={min(decode_step_times_ms):.2f}, "
+            f"median={percentile(decode_step_times_ms, 50):.2f}, "
+            f"max={max(decode_step_times_ms):.2f}, "
+            f"mean={decode_step_time_mean:.2f}, "
+            f"std={statistics.pstdev(decode_step_times_ms):.2f}, "
+            f"p90={percentile(decode_step_times_ms, 90):.2f}, "
+            f"p95={percentile(decode_step_times_ms, 95):.2f}, "
+            f"p99={percentile(decode_step_times_ms, 99):.2f}"
+        )
+    else:
+        print_rank("Decode step time (ms): n/a")
     print_rank(f"Peak memory: {peak_mem:.3f} GB")
     cache_manager.free_request(request.request_id)
 

From c5bc7e32dbead1c883254a516ac01c187480aa2a Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 30 May 2026 00:04:06 +0800
Subject: [PATCH 3/5] test: skip cuda tests when sglang import fails

---
 tests/test_model_cuda.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/test_model_cuda.py b/tests/test_model_cuda.py
index a79b7d78..16e5715b 100644
--- a/tests/test_model_cuda.py
+++ b/tests/test_model_cuda.py
@@ -16,11 +16,16 @@
 # Delay import of SGLExecutor to avoid import errors when sglang is not available
 # This allows test collection to succeed even if sglang is not installed
 SGLExecutor = None
+SGL_EXECUTOR_IMPORT_ERROR = None
 try:
     from parallax.server.executor.sglang_executor import SGLExecutor
-except ImportError:
+except ImportError as exc:
     # sglang not available, tests will be skipped
-    pass
+    SGL_EXECUTOR_IMPORT_ERROR = exc
+except Exception as exc:
+    if is_cuda_available():
+        raise
+    SGL_EXECUTOR_IMPORT_ERROR = exc
 
 CUDA_MODEL_REPO = "Qwen/Qwen3-0.6B"
 TOTAL_LAYERS = 28
@@ -64,7 +69,10 @@ def test_cuda_shard_prefill(layers_config: List[Tuple[int, int]], ref_model_and_
     but uses parallax's SGLExecutor instead of direct model loading.
     """
     if SGLExecutor is None:
-        pytest.skip("sglang not available (install with 'pip install -e .[gpu]')")
+        pytest.skip(
+            "sglang not available or failed to import "
+            f"({type(SGL_EXECUTOR_IMPORT_ERROR).__name__}: {SGL_EXECUTOR_IMPORT_ERROR})"
+        )
     if not is_cuda_available():
         pytest.skip("CUDA not available")
 
@@ -171,7 +179,10 @@ def test_cuda_executor_pipeline(ref_model_and_tokenizer):
     This test creates a 2-stage pipeline and verifies it can process requests.
     """
     if SGLExecutor is None:
-        pytest.skip("sglang not available (install with 'pip install -e .[gpu]')")
+        pytest.skip(
+            "sglang not available or failed to import "
+            f"({type(SGL_EXECUTOR_IMPORT_ERROR).__name__}: {SGL_EXECUTOR_IMPORT_ERROR})"
+        )
     if not is_cuda_available():
         pytest.skip("CUDA not available")
 

From 9e261235675d669c9595aa5131e2d9211a3e17e5 Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 30 May 2026 00:10:43 +0800
Subject: [PATCH 4/5] Revert "test: skip cuda tests when sglang import fails"

This reverts commit c5bc7e32dbead1c883254a516ac01c187480aa2a.
---
 tests/test_model_cuda.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/tests/test_model_cuda.py b/tests/test_model_cuda.py
index 16e5715b..a79b7d78 100644
--- a/tests/test_model_cuda.py
+++ b/tests/test_model_cuda.py
@@ -16,16 +16,11 @@
 # Delay import of SGLExecutor to avoid import errors when sglang is not available
 # This allows test collection to succeed even if sglang is not installed
 SGLExecutor = None
-SGL_EXECUTOR_IMPORT_ERROR = None
 try:
     from parallax.server.executor.sglang_executor import SGLExecutor
-except ImportError as exc:
+except ImportError:
     # sglang not available, tests will be skipped
-    SGL_EXECUTOR_IMPORT_ERROR = exc
-except Exception as exc:
-    if is_cuda_available():
-        raise
-    SGL_EXECUTOR_IMPORT_ERROR = exc
+    pass
 
 CUDA_MODEL_REPO = "Qwen/Qwen3-0.6B"
 TOTAL_LAYERS = 28
@@ -69,10 +64,7 @@ def test_cuda_shard_prefill(layers_config: List[Tuple[int, int]], ref_model_and_
     but uses parallax's SGLExecutor instead of direct model loading.
     """
     if SGLExecutor is None:
-        pytest.skip(
-            "sglang not available or failed to import "
-            f"({type(SGL_EXECUTOR_IMPORT_ERROR).__name__}: {SGL_EXECUTOR_IMPORT_ERROR})"
-        )
+        pytest.skip("sglang not available (install with 'pip install -e .[gpu]')")
     if not is_cuda_available():
         pytest.skip("CUDA not available")
 
@@ -179,10 +171,7 @@ def test_cuda_executor_pipeline(ref_model_and_tokenizer):
     This test creates a 2-stage pipeline and verifies it can process requests.
     """
     if SGLExecutor is None:
-        pytest.skip(
-            "sglang not available or failed to import "
-            f"({type(SGL_EXECUTOR_IMPORT_ERROR).__name__}: {SGL_EXECUTOR_IMPORT_ERROR})"
-        )
+        pytest.skip("sglang not available (install with 'pip install -e .[gpu]')")
     if not is_cuda_available():
         pytest.skip("CUDA not available")
 

From e34d19f96264e3e400f6dd0937e5cf5e31ad3b4c Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 30 May 2026 00:17:38 +0800
Subject: [PATCH 5/5] fix: constrain kernels for sglang

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index ee8ed28b..10b926b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ mac = [
 
 gpu = [
   "sglang[all]==0.5.12",
+  "kernels<0.15",
   "accelerate",
   "mlx-lm==0.31.3",
   "mlx[cpu]==0.31.2",