Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions configs/sglang_qwen3_8b_domino_2gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Domino training config for Qwen3-8B on a 2-GPU RunPod.
#
# GPU allocation:
# - 1 GPU for SGLang target inference
# - 1 GPU for Domino training
#
# First-run calibration:
# CUDA_VISIBLE_DEVICES=0,1 python -m torchspec.train_entry \
# --config configs/sglang_qwen3_8b_domino_2gpu.yaml \
# dataset.train_data_path=/path/to/perfectblend.jsonl \
# training.num_train_steps=100 \
# output_dir=./outputs/qwen3-8b-domino-2gpu-100
#
# Review experiment:
# Run matching DFlash and Domino jobs with the same dataset, seed, and step count.

# Target model plus the draft-model architecture file.
model:
  target_model_path: Qwen/Qwen3-8B
  trust_remote_code: true
  draft_model_config: torchspec/config/domino_draft_config.json

dataset:
  train_data_path: ../examples/data/sample_conversations.jsonl
  eval_data_path: null
  eval_interval: 100
  chat_template: qwen
  prompt_key: conversations
  min_loss_tokens: 32

training:
  attention_backend: flex_attention
  micro_batch_size: 1
  draft_accumulation_steps: 16
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `6e-4` to the string "6e-4" instead of a float.
  learning_rate: 6.0e-4
  min_lr: 0.0
  weight_decay: 0.0
  max_concurrent_batches: 1
  max_grad_norm: 1.0
  max_seq_length: 3072
  num_epochs: 3
  num_train_steps: 100
  seed: 42
  training_num_gpus_per_node: 1
  training_num_nodes: 1
  ttt_length: 7
  fsdp_strategy: REPLICATE
  fsdp_reduce_dtype: bfloat16
  prefetch_depth: 4
  save_interval: 500
  save_per_epoch: false
  max_checkpoints: 2
  warmup_ratio: 0.04

  dflash_block_size: 16
  dflash_num_anchors: 256
  dflash_loss_decay_gamma: 7.0
  dflash_num_target_layers: 5

  # null means lambda decays across lr_total_steps / num_train_steps.
  domino_curriculum_steps: null

# One SGLang engine on a single GPU (tp_size 1).
inference:
  inference_engine_type: sgl
  store_last_hidden_states: false
  inference_num_gpus: 1
  inference_num_gpus_per_engine: 1
  inference_num_gpus_per_node: 2
  max_sample_pool_size: 64
  inference_buffer_threshold: 32
  inference_batch_size: 8
  sglang:
    tp_size: 1
    mem_fraction_static: 0.7

mooncake:
  master_server_address: null
  metadata_server: null
  protocol: tcp
  global_segment_size: 16GB
  local_buffer_size: 4GB
  enable_hard_pin: true

output_dir: ./outputs/qwen3-8b-domino-2gpu
cache_dir: ./cache/qwen3-8b-domino-2gpu
model_download_dir: null

debug:
  save_debug_train_data: null
  debug_train_only: false
  debug_inference_only: false
116 changes: 116 additions & 0 deletions examples/qwen3-8b-domino-8h100/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/bin/bash
# Qwen3-8B Domino 8-GPU verification run.
#
# Default GPU allocation:
# - 4 GPUs for SGLang inference, one full model copy per engine
# - 4 GPUs for Domino training with FSDP FULL_SHARD
#
# Usage:
# TRAIN_DATA_PATH=/path/to/perfectblend_10k.jsonl \
# OUTPUT_ROOT=/path/to/durable/output \
# ./examples/qwen3-8b-domino-8h100/run.sh
#
# Optional:
# ./examples/qwen3-8b-domino-8h100/run.sh configs/sglang_qwen3_8b_domino_2gpu.yaml \
# training.num_train_steps=20

# Strict mode (abort on errors, unset variables and failed pipelines) plus
# command tracing.
set -o errexit -o nounset -o pipefail
set -o xtrace

# Runtime environment defaults: a non-empty value supplied by the caller wins,
# otherwise the default below is assigned. All of them are exported.
: "${CUDA_VISIBLE_DEVICES:=0,1,2,3,4,5,6,7}"
: "${TORCHSPEC_LOG_LEVEL:=INFO}"
: "${PYTORCH_CUDA_ALLOC_CONF:=expandable_segments:True}"
: "${PYTORCH_ALLOC_CONF:=expandable_segments:True}"
: "${TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS:=ATEN,TRITON}"
: "${MC_STORE_MEMCPY:=0}"
export CUDA_VISIBLE_DEVICES TORCHSPEC_LOG_LEVEL
export PYTORCH_CUDA_ALLOC_CONF PYTORCH_ALLOC_CONF
export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS MC_STORE_MEMCPY

# The repository root is two directory levels above this script; every later
# relative path is resolved from there.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
EXAMPLES_DIR="$(dirname "$SCRIPT_DIR")"
ROOT_DIR="$(dirname "$EXAMPLES_DIR")"
cd "$ROOT_DIR"

# Resolve the config file from the optional first positional argument.
#   * An existing file (as given, or relative to $ROOT_DIR) is used as the
#     config and removed from "$@".
#   * Anything else (typically a key=value override) stays in "$@" and the
#     default config is used.
#   * A first argument that looks like a YAML path (no '=', ends in .yaml/.yml)
#     but does not exist aborts the run. Previously such a typo silently
#     selected the default config and was forwarded to the trainer as an
#     override.
DEFAULT_CONFIG_FILE="$ROOT_DIR/configs/sglang_qwen3_8b_domino_2gpu.yaml"
CONFIG_FILE="${1:-$DEFAULT_CONFIG_FILE}"
if [[ -f "$CONFIG_FILE" ]]; then
    # `shift` fails when no argument was passed; that case is expected.
    shift 1 || true
elif [[ -f "$ROOT_DIR/$CONFIG_FILE" ]]; then
    CONFIG_FILE="$ROOT_DIR/$CONFIG_FILE"
    shift 1 || true
elif [[ "$CONFIG_FILE" != *=* && ( "$CONFIG_FILE" == *.yaml || "$CONFIG_FILE" == *.yml ) ]]; then
    echo "Config file not found: ${CONFIG_FILE}" >&2
    exit 1
else
    CONFIG_FILE="$DEFAULT_CONFIG_FILE"
fi

# Count the comma-separated entries of CUDA_VISIBLE_DEVICES and refuse to run
# with fewer than 8 of them.
IFS=',' read -r -a GPU_ARRAY <<< "$CUDA_VISIBLE_DEVICES"
TOTAL_GPUS=${#GPU_ARRAY[@]}
if (( TOTAL_GPUS < 8 )); then
    printf '%s\n' "Expected at least 8 visible GPUs, got ${TOTAL_GPUS}: ${CUDA_VISIBLE_DEVICES}" >&2
    exit 1
fi

# Run identity and output locations. Each value keeps a non-empty setting from
# the environment and otherwise takes the default shown.
: "${RUN_NAME:=qwen3-8b-domino-8h100-100}"
: "${OUTPUT_ROOT:=$ROOT_DIR/outputs}"
: "${OUTPUT_DIR:=$OUTPUT_ROOT/$RUN_NAME}"
: "${TRAIN_DATA_PATH:=$ROOT_DIR/data/perfectblend_10k.jsonl}"
: "${NUM_TRAIN_STEPS:=100}"
: "${SAVE_INTERVAL:=100}"
: "${MAX_CHECKPOINTS:=1}"

# GPU split and training-loop knobs forwarded to the trainer as overrides.
: "${TRAIN_GPUS:=4}"
: "${INFERENCE_GPUS:=4}"
: "${TP_SIZE:=1}"
: "${DRAFT_ACCUMULATION_STEPS:=4}"
: "${PREFETCH_DEPTH:=8}"

# Cache and log directories exported for the child processes.
: "${HF_HOME:=$ROOT_DIR/hf-cache}"
: "${TORCHINDUCTOR_CACHE_DIR:=$ROOT_DIR/cache/compiled_kernels}"
: "${TORCHSPEC_LOG_DIR:=$OUTPUT_DIR/rank-logs}"
export HF_HOME TORCHINDUCTOR_CACHE_DIR TORCHSPEC_LOG_DIR

# Fail fast when the dataset is missing.
[[ -f "$TRAIN_DATA_PATH" ]] || {
    echo "Training data not found: ${TRAIN_DATA_PATH}" >&2
    echo "Set TRAIN_DATA_PATH to the JSONL dataset used for the verification run." >&2
    exit 1
}

# Create the output, per-rank log and cache directories in one call.
mkdir -p \
    "$OUTPUT_DIR" \
    "$TORCHSPEC_LOG_DIR" \
    "$ROOT_DIR/cache"

# From here on, stdout and stderr are both shown on the terminal and appended
# to the launcher log.
LOG_FILE="$OUTPUT_DIR/launcher.log"
exec > >(tee -a "$LOG_FILE") 2>&1

# Print the effective run settings, one per line, framed by separator rules.
BANNER_RULE="=============================================="
printf '%s\n' \
    "$BANNER_RULE" \
    "Qwen3-8B Domino 8-GPU verification" \
    "$BANNER_RULE" \
    "Config: $CONFIG_FILE" \
    "Run name: $RUN_NAME" \
    "Output dir: $OUTPUT_DIR" \
    "Training data: $TRAIN_DATA_PATH" \
    "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES" \
    "Training GPUs: $TRAIN_GPUS" \
    "Inference GPUs: $INFERENCE_GPUS" \
    "Inference TP size: $TP_SIZE" \
    "Steps: $NUM_TRAIN_STEPS" \
    "Save interval: $SAVE_INTERVAL" \
    "Max checkpoints: $MAX_CHECKPOINTS" \
    "Extra args: $*" \
    "$BANNER_RULE"

# Overrides passed to the trainer. The caller's remaining arguments ("$@") are
# appended after them on the command line.
TRAIN_ENTRY_ARGS=(
    --config "$CONFIG_FILE"
    dataset.train_data_path="$TRAIN_DATA_PATH"
    training.num_train_steps="$NUM_TRAIN_STEPS"
    training.save_interval="$SAVE_INTERVAL"
    training.max_checkpoints="$MAX_CHECKPOINTS"
    training.training_num_nodes=1
    training.training_num_gpus_per_node="$TRAIN_GPUS"
    training.fsdp_strategy=FULL_SHARD
    training.draft_accumulation_steps="$DRAFT_ACCUMULATION_STEPS"
    training.prefetch_depth="$PREFETCH_DEPTH"
    inference.inference_num_gpus="$INFERENCE_GPUS"
    inference.inference_num_gpus_per_engine="$TP_SIZE"
    inference.inference_num_gpus_per_node="$TOTAL_GPUS"
    inference.sglang.tp_size="$TP_SIZE"
    debug.enable_perf_metrics=true
    output_dir="$OUTPUT_DIR"
    cache_dir="$ROOT_DIR/cache/$RUN_NAME"
)

python3 -m torchspec.train_entry "${TRAIN_ENTRY_ARGS[@]}" "$@"

# Only reached when the trainer exits successfully (errexit is active).
printf '%s\n' \
    "==============================================" \
    "Training completed. Checkpoints: $OUTPUT_DIR/checkpoints" \
    "=============================================="
159 changes: 159 additions & 0 deletions tests/test_domino.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Copyright (c) 2026 LightSeek Foundation
#
# Licensed under the MIT License (see repository LICENSE / file headers).

"""Correctness tests for the Domino draft model (CPU/float32).

Verifies the new logic - the Domino causal-correction head and the base-anchored
curriculum loss - in isolation from flex attention/CUDA. The reused DFlash
backbone/anchor/mask code is covered by test_dflash.py.
"""

import pytest

torch = pytest.importorskip("torch")

from torchspec.models.domino import DominoModel # noqa: E402
from torchspec.models.draft.auto import AutoDraftModelConfig # noqa: E402
from torchspec.models.draft.domino import DominoConfig, DominoDraftModel # noqa: E402

# Every test runs on CPU in float32.
DEV = torch.device("cpu")
DT = torch.float32

# Toy dimensions shared by the tests: batch size, sequence length, the
# block_size handed to DominoModel and its num_anchors.
B = 2
SEQ = 24
BLOCK = 4
NANCH = 4


def _cfg():
    """Return a deliberately tiny ``DominoConfig`` for the CPU tests."""
    backbone = dict(
        hidden_size=32,
        intermediate_size=64,
        num_hidden_layers=1,
        num_attention_heads=4,
        num_key_value_heads=2,
        vocab_size=32,
    )
    target = dict(
        num_target_layers=2,
        target_hidden_size=32,
        target_num_hidden_layers=4,
    )
    head = dict(
        mask_token_id=31,
        gru_hidden_size=16,
        correction_rank=8,
    )
    return DominoConfig(**backbone, **target, **head)


def _build(cfg):
    """Wrap a fresh ``DominoDraftModel`` built from ``cfg`` in a ``DominoModel``.

    Both the draft model and the wrapper are moved to ``DEV`` / ``DT``.
    """
    draft_model = DominoDraftModel(cfg).to(DEV, DT)
    wrapper_kwargs = {
        "draft_model": draft_model,
        "block_size": BLOCK,
        "num_anchors": NANCH,
        "loss_objective": "decay",
        "dpace_alpha": 0.5,
        "loss_decay_gamma": 7.0,
    }
    wrapped = DominoModel(**wrapper_kwargs)
    return wrapped.to(DEV, DT)


def _batch(cfg):
    """Build a deterministic random batch (the RNG is re-seeded with 123).

    Returns a 4-tuple, in this order:
      1. integer tensor of shape (B, SEQ) with values in [0, cfg.vocab_size)
      2. list of cfg.num_target_layers normal tensors, each of shape
         (B, SEQ, cfg.target_hidden_size)
      3. all-ones tensor of shape (B, SEQ)
      4. normal tensor of shape (cfg.vocab_size, cfg.hidden_size) scaled by 0.02

    NOTE(review): these are presumably token ids, per-layer target hidden
    states, a loss mask and embedding/LM-head weights - confirm against
    DominoModel.forward.
    """
    torch.manual_seed(123)
    # The draws below stay in tuple order so the RNG stream is unchanged.
    ids = torch.randint(0, cfg.vocab_size, (B, SEQ), device=DEV)
    per_layer = []
    for _ in range(cfg.num_target_layers):
        per_layer.append(
            torch.randn(B, SEQ, cfg.target_hidden_size, device=DEV, dtype=DT)
        )
    ones = torch.ones(B, SEQ, device=DEV, dtype=DT)
    weights = torch.randn(cfg.vocab_size, cfg.hidden_size, device=DEV, dtype=DT) * 0.02
    return (ids, per_layer, ones, weights)


def _fwd(model, batch, lam):
model.curriculum_lambda = lam
return model(batch[0], batch[1], batch[2], batch[3])


def test_domino_json_resolves_to_domino_config():
    """The shipped Domino JSON must load as a ``DominoConfig``.

    The path is resolved from this test file (one directory below the
    repository root) rather than from the current working directory, so the
    test also passes when pytest is started outside the repository root.
    """
    from pathlib import Path

    repo_root = Path(__file__).resolve().parents[1]
    config_path = repo_root / "torchspec" / "config" / "domino_draft_config.json"
    cfg = AutoDraftModelConfig.from_file(str(config_path))
    assert isinstance(cfg, DominoConfig)


def test_head_present_and_trainable():
    """Each Domino head submodule exists, has parameters, and all require grad."""
    draft = _build(_cfg()).draft_model
    for head_attr in ("causal_gru", "correction_w1", "correction_w2"):
        head_params = list(getattr(draft, head_attr).parameters())
        assert head_params
        assert all(param.requires_grad for param in head_params)


def test_forward_output_contract():
    """Forward returns scalar loss/accuracy, per-position tensors of length
    BLOCK, and exactly the expected set of auxiliary metrics, all finite."""
    cfg = _cfg()
    outputs = _fwd(_build(cfg), _batch(cfg), 0.0)
    loss, acc, loss_pp, acc_pp, count_pp, aux_metrics = outputs

    assert loss.ndim == 0
    assert acc.ndim == 0
    for per_position in (loss_pp, acc_pp, count_pp):
        assert per_position.shape == (BLOCK,)
    assert torch.isfinite(loss)

    expected_keys = {
        "base_loss",
        "final_loss",
        "correction_norm",
        "correction_abs_mean",
    }
    assert set(aux_metrics) == expected_keys
    for value in aux_metrics.values():
        assert torch.isfinite(value)


def test_curriculum_lambda_is_noop_when_correction_zeroed():
    """With ``correction_w2`` zeroed, lambda=1.0 and lambda=0.0 give the same
    loss (within 1e-5) under identical seeds."""
    cfg = _cfg()
    model = _build(cfg)
    batch = _batch(cfg)
    with torch.no_grad():
        model.draft_model.correction_w2.weight.zero_()

    loss_at = {}
    for lam in (1.0, 0.0):
        # Same seed before each pass so any sampling in forward matches.
        torch.manual_seed(7)
        loss_at[lam] = _fwd(model, batch, lam)[0].item()

    assert abs(loss_at[1.0] - loss_at[0.0]) < 1e-5


def test_curriculum_selects_base_vs_final_when_correction_active():
    """With the correction head left as initialised, lambda=1.0 and lambda=0.0
    must give losses that differ by more than 1e-4."""
    cfg = _cfg()
    model = _build(cfg)
    batch = _batch(cfg)

    def seeded_loss(lam):
        # Re-seed so both passes see the same random stream.
        torch.manual_seed(7)
        return _fwd(model, batch, lam)[0].item()

    base = seeded_loss(1.0)
    final = seeded_loss(0.0)
    assert abs(base - final) > 1e-4


def test_gradients_flow_to_domino_head():
    """Backprop of the lambda=0.0 loss leaves a non-zero total gradient norm on
    each Domino head submodule."""
    cfg = _cfg()
    model = _build(cfg)
    model.zero_grad(set_to_none=True)
    loss = _fwd(model, _batch(cfg), 0.0)[0]
    loss.backward()

    for name in ("causal_gru", "correction_w1", "correction_w2"):
        grad_norm = 0
        for param in getattr(model.draft_model, name).parameters():
            # Parameters that received no gradient contribute nothing.
            if param.grad is None:
                continue
            grad_norm += param.grad.norm().item()
        assert grad_norm > 0, name


def test_model_learns_under_curriculum():
    """Overfit one fixed batch for 200 AdamW steps while lambda ramps linearly
    from 1.0 to 0.0 over the first half; the loss must at least halve and the
    accuracy must rise by more than 0.2."""
    torch.manual_seed(0)
    cfg = _cfg()
    model = _build(cfg)
    batch = _batch(cfg)

    trainable = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=3e-3)

    steps = 200
    ramp_steps = steps * 0.5
    history = []  # (loss, acc) recorded at every step
    for step in range(steps):
        lam = max(0.0, 1.0 - step / ramp_steps)
        optimizer.zero_grad(set_to_none=True)
        outputs = _fwd(model, batch, lam)
        loss, acc = outputs[0], outputs[1]
        loss.backward()
        optimizer.step()
        history.append((loss.item(), acc.item()))

    first_loss, first_acc = history[0]
    last_loss, last_acc = history[-1]
    assert last_loss < first_loss * 0.5
    assert last_acc > first_acc + 0.2
21 changes: 21 additions & 0 deletions torchspec/config/domino_draft_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"architectures": ["DominoDraftModel"],
"model_type": "domino",
"hidden_size": 4096,
"intermediate_size": 12288,
"num_hidden_layers": 5,
"num_attention_heads": 32,
"num_key_value_heads": 8,
"vocab_size": 151936,
"rms_norm_eps": 1e-6,
"max_position_embeddings": 40960,
"rope_theta": 1000000.0,
"num_target_layers": 5,
"target_hidden_size": 4096,
"target_num_hidden_layers": 36,
"target_layer_ids": [1, 9, 17, 25, 33],
"mask_token_id": 151669,
"gru_hidden_size": 1024,
"correction_rank": 256,
"tie_word_embeddings": false
}
Loading