lightseekorg · SeaTuKeMa · Jun 23, 2026 · Jun 23, 2026
diff --git a/configs/sglang_qwen3_8b_domino_2gpu.yaml b/configs/sglang_qwen3_8b_domino_2gpu.yaml
@@ -0,0 +1,90 @@
+# Domino training config for Qwen3-8B on a 2-GPU RunPod.
+#
+# GPU allocation:
+#   - 1 GPU for SGLang target inference
+#   - 1 GPU for Domino training
+#
+# First-run calibration:
+#   CUDA_VISIBLE_DEVICES=0,1 python -m torchspec.train_entry \
+#     --config configs/sglang_qwen3_8b_domino_2gpu.yaml \
+#     dataset.train_data_path=/path/to/perfectblend.jsonl \
+#     training.num_train_steps=100 \
+#     output_dir=./outputs/qwen3-8b-domino-2gpu-100
+#
+# Review experiment:
+#   Run matching DFlash and Domino jobs with the same dataset, seed, and step count.
+
+model:
+  target_model_path: Qwen/Qwen3-8B  # target model that the Domino draft is trained against
+  trust_remote_code: true  # NOTE(review): presumably lets model-repo code run at load time; confirm it is needed
+  draft_model_config: torchspec/config/domino_draft_config.json  # draft config with model_type "domino"
+
+dataset:
+  train_data_path: ../examples/data/sample_conversations.jsonl  # sample data; override on the CLI
+  eval_data_path: null  # no eval dataset is configured
+  eval_interval: 100
+  chat_template: qwen
+  prompt_key: conversations
+  min_loss_tokens: 32  # NOTE(review): presumably skips samples with fewer supervised tokens; confirm
+
+training:
+  attention_backend: flex_attention
+  micro_batch_size: 1
+  draft_accumulation_steps: 16  # examples/qwen3-8b-domino-8h100/run.sh overrides this (default 4)
+  learning_rate: 6.0e-4  # keep the decimal point: PyYAML (YAML 1.1) loads a bare 6e-4 as a string
+  min_lr: 0.0
+  weight_decay: 0.0
+  max_concurrent_batches: 1
+  max_grad_norm: 1.0
+  max_seq_length: 3072
+  num_epochs: 3
+  num_train_steps: 100
+  seed: 42
+  training_num_gpus_per_node: 1  # the single training GPU from the allocation in the file header
+  training_num_nodes: 1
+  ttt_length: 7
+  fsdp_strategy: REPLICATE  # the 8-GPU run.sh switches this to FULL_SHARD
+  fsdp_reduce_dtype: bfloat16
+  prefetch_depth: 4
+  save_interval: 500  # NOTE(review): larger than num_train_steps, so no interval save in a default run
+  save_per_epoch: false
+  max_checkpoints: 2
+  warmup_ratio: 0.04
+
+  dflash_block_size: 16
+  dflash_num_anchors: 256
+  dflash_loss_decay_gamma: 7.0
+  dflash_num_target_layers: 5  # equals num_target_layers in the draft config JSON
+
+  # null means lambda decays across lr_total_steps / num_train_steps.
+  domino_curriculum_steps: null
+
+inference:
+  inference_engine_type: sgl
+  store_last_hidden_states: false
+  inference_num_gpus: 1  # the one SGLang GPU from the allocation in the file header
+  inference_num_gpus_per_engine: 1  # run.sh sets this and sglang.tp_size from the same TP_SIZE
+  inference_num_gpus_per_node: 2  # run.sh passes its visible-GPU count for this key
+  max_sample_pool_size: 64
+  inference_buffer_threshold: 32
+  inference_batch_size: 8
+  sglang:
+    tp_size: 1
+    mem_fraction_static: 0.7
+
+mooncake:
+  master_server_address: null  # NOTE(review): presumably resolved at launch when null; confirm
+  metadata_server: null
+  protocol: tcp
+  global_segment_size: 16GB  # unit-suffixed sizes load as plain YAML strings
+  local_buffer_size: 4GB
+  enable_hard_pin: true
+
+output_dir: ./outputs/qwen3-8b-domino-2gpu  # overridden in the header example and by the 8-GPU run.sh
+cache_dir: ./cache/qwen3-8b-domino-2gpu  # the 8-GPU run.sh overrides this as well
+model_download_dir: null
+
+debug:
+  save_debug_train_data: null
+  debug_train_only: false
+  debug_inference_only: false  # NOTE(review): run.sh also passes debug.enable_perf_metrics, not listed here
diff --git a/examples/qwen3-8b-domino-8h100/run.sh b/examples/qwen3-8b-domino-8h100/run.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# Qwen3-8B Domino 8-GPU verification run.
+#
+# Default GPU allocation:
+#   - 4 GPUs for SGLang inference, one full model copy per engine
+#   - 4 GPUs for Domino training with FSDP FULL_SHARD
+#
+# Usage:
+#   TRAIN_DATA_PATH=/path/to/perfectblend_10k.jsonl \
+#   OUTPUT_ROOT=/path/to/durable/output \
+#   ./examples/qwen3-8b-domino-8h100/run.sh
+#
+# Optional:
+#   ./examples/qwen3-8b-domino-8h100/run.sh configs/sglang_qwen3_8b_domino_2gpu.yaml \
+#     training.num_train_steps=20
+
+set -euo pipefail
+set -x
+
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"
+export TORCHSPEC_LOG_LEVEL="${TORCHSPEC_LOG_LEVEL:-INFO}"
+export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
+export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}"
+export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS="${TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS:-ATEN,TRITON}"
+export MC_STORE_MEMCPY="${MC_STORE_MEMCPY:-0}"
+
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+ROOT_DIR="$(dirname "$(dirname "$SCRIPT_DIR")")"
+cd "$ROOT_DIR"
+
+CONFIG_FILE="${1:-$ROOT_DIR/configs/sglang_qwen3_8b_domino_2gpu.yaml}"
+if [[ -f "$CONFIG_FILE" ]]; then
+  shift 1 || true
+elif [[ -f "$ROOT_DIR/$CONFIG_FILE" ]]; then
+  CONFIG_FILE="$ROOT_DIR/$CONFIG_FILE"
+  shift 1 || true
+else
+  CONFIG_FILE="$ROOT_DIR/configs/sglang_qwen3_8b_domino_2gpu.yaml"
+fi
+
+IFS=',' read -ra GPU_ARRAY <<< "$CUDA_VISIBLE_DEVICES"
+TOTAL_GPUS="${#GPU_ARRAY[@]}"
+if [[ "$TOTAL_GPUS" -lt 8 ]]; then
+  echo "Expected at least 8 visible GPUs, got ${TOTAL_GPUS}: ${CUDA_VISIBLE_DEVICES}" >&2
+  exit 1
+fi
+
+RUN_NAME="${RUN_NAME:-qwen3-8b-domino-8h100-100}"
+OUTPUT_ROOT="${OUTPUT_ROOT:-$ROOT_DIR/outputs}"
+OUTPUT_DIR="${OUTPUT_DIR:-$OUTPUT_ROOT/$RUN_NAME}"
+TRAIN_DATA_PATH="${TRAIN_DATA_PATH:-$ROOT_DIR/data/perfectblend_10k.jsonl}"
+NUM_TRAIN_STEPS="${NUM_TRAIN_STEPS:-100}"
+SAVE_INTERVAL="${SAVE_INTERVAL:-100}"
+MAX_CHECKPOINTS="${MAX_CHECKPOINTS:-1}"
+
+TRAIN_GPUS="${TRAIN_GPUS:-4}"
+INFERENCE_GPUS="${INFERENCE_GPUS:-4}"
+TP_SIZE="${TP_SIZE:-1}"
+DRAFT_ACCUMULATION_STEPS="${DRAFT_ACCUMULATION_STEPS:-4}"
+PREFETCH_DEPTH="${PREFETCH_DEPTH:-8}"
+
+export HF_HOME="${HF_HOME:-$ROOT_DIR/hf-cache}"
+export TORCHINDUCTOR_CACHE_DIR="${TORCHINDUCTOR_CACHE_DIR:-$ROOT_DIR/cache/compiled_kernels}"
+export TORCHSPEC_LOG_DIR="${TORCHSPEC_LOG_DIR:-$OUTPUT_DIR/rank-logs}"
+
+if [[ ! -f "$TRAIN_DATA_PATH" ]]; then
+  echo "Training data not found: ${TRAIN_DATA_PATH}" >&2
+  echo "Set TRAIN_DATA_PATH to the JSONL dataset used for the verification run." >&2
+  exit 1
+fi
+
+mkdir -p "$OUTPUT_DIR" "$TORCHSPEC_LOG_DIR" "$ROOT_DIR/cache"
+
+LOG_FILE="$OUTPUT_DIR/launcher.log"
+exec > >(tee -a "$LOG_FILE") 2>&1
+
+echo "=============================================="
+echo "Qwen3-8B Domino 8-GPU verification"
+echo "=============================================="
+echo "Config: $CONFIG_FILE"
+echo "Run name: $RUN_NAME"
+echo "Output dir: $OUTPUT_DIR"
+echo "Training data: $TRAIN_DATA_PATH"
+echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
+echo "Training GPUs: $TRAIN_GPUS"
+echo "Inference GPUs: $INFERENCE_GPUS"
+echo "Inference TP size: $TP_SIZE"
+echo "Steps: $NUM_TRAIN_STEPS"
+echo "Save interval: $SAVE_INTERVAL"
+echo "Max checkpoints: $MAX_CHECKPOINTS"
+echo "Extra args: $*"
+echo "=============================================="
+
+python3 -m torchspec.train_entry \
+  --config "$CONFIG_FILE" \
+  dataset.train_data_path="$TRAIN_DATA_PATH" \
+  training.num_train_steps="$NUM_TRAIN_STEPS" \
+  training.save_interval="$SAVE_INTERVAL" \
+  training.max_checkpoints="$MAX_CHECKPOINTS" \
+  training.training_num_nodes=1 \
+  training.training_num_gpus_per_node="$TRAIN_GPUS" \
+  training.fsdp_strategy=FULL_SHARD \
+  training.draft_accumulation_steps="$DRAFT_ACCUMULATION_STEPS" \
+  training.prefetch_depth="$PREFETCH_DEPTH" \
+  inference.inference_num_gpus="$INFERENCE_GPUS" \
+  inference.inference_num_gpus_per_engine="$TP_SIZE" \
+  inference.inference_num_gpus_per_node="$TOTAL_GPUS" \
+  inference.sglang.tp_size="$TP_SIZE" \
+  debug.enable_perf_metrics=true \
+  output_dir="$OUTPUT_DIR" \
+  cache_dir="$ROOT_DIR/cache/$RUN_NAME" \
+  "$@"
+
+echo "=============================================="
+echo "Training completed. Checkpoints: $OUTPUT_DIR/checkpoints"
+echo "=============================================="
diff --git a/tests/test_domino.py b/tests/test_domino.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2026 LightSeek Foundation
+#
+# Licensed under the MIT License (see repository LICENSE / file headers).
+
+"""Correctness tests for the Domino draft model (CPU/float32).
+
+Verifies the new logic - the Domino causal-correction head and the base-anchored
+curriculum loss - in isolation from flex attention/CUDA. The reused DFlash
+backbone/anchor/mask code is covered by test_dflash.py.
+"""
+
+import pytest
+
+torch = pytest.importorskip("torch")
+
+from torchspec.models.domino import DominoModel  # noqa: E402
+from torchspec.models.draft.auto import AutoDraftModelConfig  # noqa: E402
+from torchspec.models.draft.domino import DominoConfig, DominoDraftModel  # noqa: E402
+
+DEV = torch.device("cpu")
+DT = torch.float32
+B, SEQ, BLOCK, NANCH = 2, 24, 4, 4
+
+
+def _cfg():
+    return DominoConfig(
+        hidden_size=32,
+        intermediate_size=64,
+        num_hidden_layers=1,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        vocab_size=32,
+        num_target_layers=2,
+        target_hidden_size=32,
+        target_num_hidden_layers=4,
+        mask_token_id=31,
+        gru_hidden_size=16,
+        correction_rank=8,
+    )
+
+
+def _build(cfg):
+    draft = DominoDraftModel(cfg).to(DEV, DT)
+    return DominoModel(
+        draft_model=draft,
+        block_size=BLOCK,
+        num_anchors=NANCH,
+        loss_objective="decay",
+        dpace_alpha=0.5,
+        loss_decay_gamma=7.0,
+    ).to(DEV, DT)
+
+
+def _batch(cfg):
+    torch.manual_seed(123)
+    return (
+        torch.randint(0, cfg.vocab_size, (B, SEQ), device=DEV),
+        [
+            torch.randn(B, SEQ, cfg.target_hidden_size, device=DEV, dtype=DT)
+            for _ in range(cfg.num_target_layers)
+        ],
+        torch.ones(B, SEQ, device=DEV, dtype=DT),
+        torch.randn(cfg.vocab_size, cfg.hidden_size, device=DEV, dtype=DT) * 0.02,
+    )
+
+
+def _fwd(model, batch, lam):
+    model.curriculum_lambda = lam
+    return model(batch[0], batch[1], batch[2], batch[3])
+
+
+def test_domino_json_resolves_to_domino_config():
+    cfg = AutoDraftModelConfig.from_file("torchspec/config/domino_draft_config.json")
+    assert isinstance(cfg, DominoConfig)
+
+
+def test_head_present_and_trainable():
+    model = _build(_cfg())
+    for name in ("causal_gru", "correction_w1", "correction_w2"):
+        module = getattr(model.draft_model, name)
+        params = list(module.parameters())
+        assert params and all(p.requires_grad for p in params)
+
+
+def test_forward_output_contract():
+    cfg = _cfg()
+    model = _build(cfg)
+    loss, acc, loss_pp, acc_pp, count_pp, aux_metrics = _fwd(model, _batch(cfg), 0.0)
+    assert loss.ndim == 0 and acc.ndim == 0
+    assert loss_pp.shape == (BLOCK,) and acc_pp.shape == (BLOCK,)
+    assert count_pp.shape == (BLOCK,)
+    assert torch.isfinite(loss)
+    assert set(aux_metrics) == {
+        "base_loss",
+        "final_loss",
+        "correction_norm",
+        "correction_abs_mean",
+    }
+    assert all(torch.isfinite(v) for v in aux_metrics.values())
+
+
+def test_curriculum_lambda_is_noop_when_correction_zeroed():
+    cfg = _cfg()
+    model = _build(cfg)
+    batch = _batch(cfg)
+    with torch.no_grad():
+        model.draft_model.correction_w2.weight.zero_()
+    torch.manual_seed(7)
+    loss_base = _fwd(model, batch, 1.0)[0].item()
+    torch.manual_seed(7)
+    loss_final = _fwd(model, batch, 0.0)[0].item()
+    assert abs(loss_base - loss_final) < 1e-5
+
+
+def test_curriculum_selects_base_vs_final_when_correction_active():
+    cfg = _cfg()
+    model = _build(cfg)
+    batch = _batch(cfg)
+    torch.manual_seed(7)
+    base = _fwd(model, batch, 1.0)[0].item()
+    torch.manual_seed(7)
+    final = _fwd(model, batch, 0.0)[0].item()
+    assert abs(base - final) > 1e-4
+
+
+def test_gradients_flow_to_domino_head():
+    cfg = _cfg()
+    model = _build(cfg)
+    model.zero_grad(set_to_none=True)
+    _fwd(model, _batch(cfg), 0.0)[0].backward()
+    for name in ("causal_gru", "correction_w1", "correction_w2"):
+        module = getattr(model.draft_model, name)
+        grad_norm = sum(p.grad.norm().item() for p in module.parameters() if p.grad is not None)
+        assert grad_norm > 0, name
+
+
+def test_model_learns_under_curriculum():
+    torch.manual_seed(0)
+    cfg = _cfg()
+    model = _build(cfg)
+    batch = _batch(cfg)
+    optimizer = torch.optim.AdamW(
+        [p for p in model.parameters() if p.requires_grad],
+        lr=3e-3,
+    )
+    steps = 200
+    first_loss = last_loss = first_acc = last_acc = None
+    for step in range(steps):
+        lam = max(0.0, 1.0 - step / (steps * 0.5))
+        optimizer.zero_grad(set_to_none=True)
+        loss, acc, *_ = _fwd(model, batch, lam)
+        loss.backward()
+        optimizer.step()
+        if step == 0:
+            first_loss, first_acc = loss.item(), acc.item()
+        last_loss, last_acc = loss.item(), acc.item()
+
+    assert last_loss < first_loss * 0.5
+    assert last_acc > first_acc + 0.2
diff --git a/torchspec/config/domino_draft_config.json b/torchspec/config/domino_draft_config.json
@@ -0,0 +1,21 @@
+{
+    "architectures": ["DominoDraftModel"],
+    "model_type": "domino",
+    "hidden_size": 4096,
+    "intermediate_size": 12288,
+    "num_hidden_layers": 5,
+    "num_attention_heads": 32,
+    "num_key_value_heads": 8,
+    "vocab_size": 151936,
+    "rms_norm_eps": 1e-6,
+    "max_position_embeddings": 40960,
+    "rope_theta": 1000000.0,
+    "num_target_layers": 5,
+    "target_hidden_size": 4096,
+    "target_num_hidden_layers": 36,
+    "target_layer_ids": [1, 9, 17, 25, 33],
+    "mask_token_id": 151669,
+    "gru_hidden_size": 1024,
+    "correction_rank": 256,
+    "tie_word_embeddings": false
+}