lightseekorg · chungen04 · Jun 15, 2026 · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/README.md b/README.md
@@ -64,6 +64,7 @@ TorchSpec streams hidden states from inference engines into training workers.
 |---------|--------------|--------|
 | [vLLM](https://github.com/vllm-project/vllm) | First-class | Available |
 | [TokenSpeed](https://github.com/lightseekorg/tokenspeed) | First-class | In progress |
+| [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) | First-class | Available |
 | [SGLang](https://github.com/sgl-project/sglang) | Best community effort | Available |
 | [HuggingFace Transformers](https://github.com/huggingface/transformers) | Best community effort | Available |
 
@@ -112,7 +113,7 @@ pip install -e ".[fa]"
 **vLLM**
 
 ```bash
-./examples/qwen3-8b-single-node/run.sh --config configs/vllm_qwen3_8b.yaml
+./examples/qwen3-8b-single-node/run.sh configs/vllm_qwen3_8b.yaml
 ```
 
 **SGLang**
@@ -121,7 +122,17 @@ pip install -e ".[fa]"
 ./examples/qwen3-8b-single-node/run.sh
 ```
 
-TorchSpec uses vLLM's **Worker Extension** mechanism to hook into the model forward pass and capture hidden states directly inside worker processes, which avoids RPC serialization overhead during extraction. For SGLang, TorchSpec applies a patch to the existing codebase to enable hidden-state extraction.
+**TensorRT-LLM**
+
+Run inside the TensorRT-LLM image (`docker/trtllm/v1.3.0rc18/Dockerfile`), which ships `tensorrt_llm` pre-patched for Mooncake hidden-state capture:
+
+```bash
+./examples/qwen3-8b-single-node/run.sh configs/trtllm_qwen3_8b.yaml
+```
+
+Single-node tensor parallelism only for now (multi-node TP is not yet wired up).
+
+TorchSpec uses vLLM's **Worker Extension** mechanism to hook into the model forward pass and capture hidden states directly inside worker processes, which avoids RPC serialization overhead during extraction. For SGLang, TorchSpec applies a patch to the existing codebase to enable hidden-state extraction. For TensorRT-LLM, TorchSpec builds on its native **SaveHiddenStates** speculative mode and applies a small patch that redirects the captured aux + final hidden states to Mooncake instead of writing them to disk.
 
 ## Examples
 

diff --git a/configs/trtllm_qwen3_8b.yaml b/configs/trtllm_qwen3_8b.yaml
@@ -0,0 +1,80 @@
+# Configuration for train_entry.py with TensorRT-LLM Engine inference (nested config format)
+#
+# GPU allocation (defaults in this file; run.sh overrides these values on the command line):
+#   - 1 GPU for inference (one engine, tp_size=1)
+#   - 2 GPUs for training (DP/FSDP: model sharded across 2 GPUs)
+#   - Total: 3 GPUs
+#
+# Installation:
+#   Use the TensorRT-LLM docker image (docker/trtllm/v1.3.0rc18/Dockerfile),
+#   which ships tensorrt_llm patched for Mooncake hidden-state capture.
+#
+# Usage (same launcher as the sglang example, for a fair side-by-side):
+#   ./examples/qwen3-8b-single-node/run.sh configs/trtllm_qwen3_8b.yaml
+#
+# Note: Uses TensorRT-LLM's SaveHiddenStates speculative mode; the TorchSpec
+# patch redirects captured aux + final hidden states to Mooncake.
+
+model:
+  target_model_path: Qwen/Qwen3-8B
+  trust_remote_code: true
+
+dataset:
+  train_data_path: ../examples/data/sample_conversations.jsonl
+  # eval_data_path: ../examples/data/eval_conversations.jsonl
+  # eval_interval: 100
+  chat_template: qwen
+  prompt_key: conversations
+
+training:
+  attention_backend: flex_attention
+  # TRT tp-workers bind GPU 0..tp-1; keep inference on those to avoid OOM with training.
+  placement_strategy: inference_first
+  micro_batch_size: 1
+  draft_accumulation_steps: 1
+  learning_rate: 1e-4
+  max_concurrent_batches: 1
+  max_grad_norm: 0.5
+  max_seq_length: 16384
+  num_epochs: 1
+  seed: 42
+  training_num_gpus_per_node: 2
+  training_num_nodes: 1
+  ttt_length: 7
+  save_per_epoch: true
+  warmup_ratio: 0.015
+
+inference:
+  inference_engine_type: trtllm
+  inference_num_gpus: 1
+  inference_num_gpus_per_engine: 1
+  inference_num_gpus_per_node: 1
+  max_sample_pool_size: 64       # Max samples in controller pool
+  inference_buffer_threshold: 32     # Fetch prompts when buffer < threshold
+  inference_batch_size: 8
+  trtllm:
+    tp_size: 1
+    # KV-cache memory fraction. Kept below TRT-LLM's 0.9 default to leave room
+    # for the SaveHiddenStates capture buffer, which the KV profiler does not
+    # account for.
+    mem_fraction_static: 0.7
+    extra_args:
+      # Any extra TensorRT-LLM LLM kwarg; e.g. cap the per-iteration token budget.
+      # Must be >= training.max_seq_length: chunked prefill is off, so prompts longer than max_num_tokens are dropped.
+      max_num_tokens: 16384
+
+mooncake:
+  master_server_address: null
+  metadata_server: null
+  protocol: tcp
+  global_segment_size: 16GB
+  local_buffer_size: 4GB
+
+output_dir: ./outputs/qwen3-8b-single-node
+cache_dir: ./cache/qwen3-8b-single-node
+model_download_dir: null
+
+debug:
+  save_debug_train_data: null
+  debug_train_only: false
+  debug_inference_only: false
diff --git a/configs/trtllm_qwen3_8b_dflash.yaml b/configs/trtllm_qwen3_8b_dflash.yaml
@@ -0,0 +1,93 @@
+# DFlash training config for Qwen3-8B target model — TensorRT-LLM inference backend.
+#
+# Mirrors configs/sglang_qwen3_8b_dflash.yaml; only the inference backend differs
+# (engine type, the trtllm block, and placement_strategy).
+#
+# GPU allocation: mirrors the sglang DFlash layout (4 inference tp=1 + 4 training);
+# adjust inference_num_gpus / training_num_gpus_per_node / tp_size to your budget.
+#
+# Usage:
+#   ./examples/qwen3-8b-single-node/run.sh configs/trtllm_qwen3_8b_dflash.yaml
+
+model:
+  target_model_path: Qwen/Qwen3-8B
+  trust_remote_code: true
+  draft_model_config: torchspec/config/dflash_draft_config.json
+
+dataset:
+  train_data_path: ../examples/data/sample_conversations.jsonl
+  eval_data_path: null
+  eval_interval: 100
+  chat_template: qwen
+  prompt_key: conversations
+  min_loss_tokens: 32
+
+training:
+  attention_backend: flex_attention
+  # TRT tp-workers bind GPU 0..tp-1; keep inference on those to avoid OOM with training.
+  placement_strategy: inference_first
+  micro_batch_size: 1
+  draft_accumulation_steps: 2       # was 4 → 2x more optimizer steps
+  learning_rate: 6e-4
+  min_lr: 6e-5                      # 10% of peak — prevents LR death in later epochs
+  weight_decay: 0.01                # AdamW regularization for better generalization
+  max_concurrent_batches: 1
+  max_grad_norm: 1.0
+  max_seq_length: 2048
+  num_epochs: 3
+  seed: 42
+  training_num_gpus_per_node: 4
+  training_num_nodes: 1
+  ttt_length: 7
+  fsdp_strategy: FULL_SHARD
+  fsdp_reduce_dtype: bfloat16
+  prefetch_depth: 8
+  save_interval: 1000
+  save_per_epoch: true
+  max_checkpoints: 2
+  warmup_ratio: 0.04
+
+  # DFlash-specific parameters
+  dflash_block_size: 16
+  dflash_num_anchors: 512
+  dflash_loss_decay_gamma: 7.0
+  dflash_num_target_layers: 5
+
+inference:
+  inference_engine_type: trtllm
+  store_last_hidden_states: false
+  inference_num_gpus: 4
+  inference_num_gpus_per_engine: 1
+  inference_num_gpus_per_node: 4
+  max_sample_pool_size: 64
+  inference_buffer_threshold: 32
+  inference_batch_size: 8
+  trtllm:
+    tp_size: 1
+    # KV-cache memory fraction. Kept below TRT-LLM's 0.9 default to leave room
+    # for the SaveHiddenStates capture buffer, which the KV profiler does not
+    # account for.
+    mem_fraction_static: 0.7
+    extra_args:
+      # Must be >= training.max_seq_length: chunked prefill is off, so prompts longer than max_num_tokens are dropped.
+      max_num_tokens: 2048
+
+mooncake:
+  master_server_address: null
+  metadata_server: null
+  protocol: tcp
+  global_segment_size: 16GB
+  local_buffer_size: 4GB
+  # Hard-pin: master-side TTL is disabled; we rely on our explicit
+  # batch_remove(force=True) (see mooncake/eagle_store.py). Requires
+  # mooncake-transfer-engine >= 0.3.10.post1.
+  enable_hard_pin: true
+
+output_dir: ./outputs/qwen3-8b-dflash
+cache_dir: ./cache/qwen3-8b-dflash
+model_download_dir: null
+
+debug:
+  save_debug_train_data: null
+  debug_train_only: false
+  debug_inference_only: false
diff --git a/docker/justfile b/docker/justfile
@@ -1,10 +1,11 @@
 BACKEND := env("BACKEND", "sglang")
 SGLANG_VERSION := env("SGLANG_VERSION", "v0.5.8.post1")
 VLLM_VERSION := env("VLLM_VERSION", "v0.22.1")
+TRTLLM_VERSION := env("TRTLLM_VERSION", "v1.3.0rc18")
 IMAGE_REPO := env("IMAGE_REPO", "ghcr.io/torchspec-project/torchspec")
 IMAGE_TAG := env("IMAGE_TAG", "")
 
-_dockerfile := if BACKEND == "vllm" { "docker/vllm/" + VLLM_VERSION + "/Dockerfile" } else { "docker/sglang/" + SGLANG_VERSION + "/Dockerfile" }
+_dockerfile := if BACKEND == "vllm" { "docker/vllm/" + VLLM_VERSION + "/Dockerfile" } else if BACKEND == "trtllm" { "docker/trtllm/" + TRTLLM_VERSION + "/Dockerfile" } else { "docker/sglang/" + SGLANG_VERSION + "/Dockerfile" }
 
 build:
     ARG_TAG_POSTFIX="${ARG_TAG_POSTFIX:-""}" ARG_BUILD_EXTRA_ARGS="" just _build-only

diff --git a/docker/trtllm/v1.3.0rc18/Dockerfile b/docker/trtllm/v1.3.0rc18/Dockerfile
@@ -0,0 +1,30 @@
+ARG TRTLLM_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc18
+FROM ${TRTLLM_IMAGE} AS trtllm
+
+WORKDIR /root/
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends nvtop rsync dnsutils && \
+    rm -rf /var/lib/apt/lists/*
+
+# Patch the installed tensorrt_llm (dist-packages) for Mooncake hidden-state capture.
+# Don't import tensorrt_llm at build time to locate it — its __init__ loads libcuda.so.1,
+# absent until container runtime; patch the path directly. Abort on the first failed patch.
+COPY patches/trtllm/v1.3.0rc18/*.patch /tmp/patches/
+RUN cd /usr/local/lib/python3.12/dist-packages && \
+    for p in /tmp/patches/*.patch; do patch -p1 < "$p" || exit 1; done && \
+    rm -rf /tmp/patches
+
+COPY . /root/torchspec
+RUN cd /root/torchspec && pip install --no-cache-dir -e ".[fa]"
+
+RUN pip uninstall -y mooncake-transfer-engine || true && \
+    pip install --no-cache-dir --no-deps --force-reinstall \
+        mooncake-transfer-engine-cuda13==0.3.11.post1
+
+RUN chmod 755 /usr/local/lib/python3.12/dist-packages/mooncake/mooncake_master || true
+RUN if [ -f /usr/local/lib/python3.12/dist-packages/mooncake/cli.py ]; then \
+      sed -i 's/os.chmod(bin_path, 0o755)/pass/' /usr/local/lib/python3.12/dist-packages/mooncake/cli.py; \
+    fi
+
+WORKDIR /root/torchspec
diff --git a/examples/qwen3-8b-single-node/run.sh b/examples/qwen3-8b-single-node/run.sh
@@ -37,6 +37,14 @@ else
     CONFIG_FILE="$ROOT_DIR/configs/sglang_qwen3_8b.yaml"
 fi
 
+# Derive the tp_size override block from the config's engine type ("sgl" -> "sglang"): first non-comment key line wins.
+# Optional quotes/missing space after ':' are accepted; `|| true` so a sed failure falls back below instead of tripping set -e.
+ENGINE_TYPE=$(sed -nE '/^[[:space:]]*inference_engine_type:/{s/^[^:]*:[[:space:]]*["'\'']?([A-Za-z0-9_]+).*/\1/p;q;}' "$CONFIG_FILE" || true)
+case "$ENGINE_TYPE" in
+    sgl) TP_BLOCK=sglang ;;
+    *)   TP_BLOCK="${ENGINE_TYPE:-sglang}" ;;
+esac
+
 IFS=',' read -ra GPU_ARRAY <<< "$CUDA_VISIBLE_DEVICES"
 TOTAL_GPUS=${#GPU_ARRAY[@]}
 
@@ -56,14 +64,13 @@ echo "Local IP: $LOCAL_IP"
 echo "Extra args: $*"
 echo "=============================================="
 
-# TODO: unify tp_size config across sglang/vllm backends
 python3 -m torchspec.train_entry \
     --config "$CONFIG_FILE" \
     training.training_num_gpus_per_node="$TRAIN_GPUS" \
     inference.inference_num_gpus="$INFERENCE_GPUS" \
     inference.inference_num_gpus_per_engine=2 \
     inference.inference_num_gpus_per_node="$TOTAL_GPUS" \
-    inference.sglang.tp_size=2 \
+    "inference.${TP_BLOCK}.tp_size=2" \
     "$@"
 
 echo "=============================================="