Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12485,6 +12485,41 @@ minimaxm3-fp8-b200-vllm:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }

# MiniMax-M3 NVFP4 (nvidia/MiniMax-M3-NVFP4) B200 single-node vLLM — FP4 variant
# of minimaxm3-fp8-b200-vllm, running on the b200-dgxc cluster. MiniMax-M3
# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf
# container image, so no runtime patch is needed. --block-size 128 is mandatory
# (MSA sparse/index cache); weights are pre-staged at /scratch/fsw/models/MiniMax-M3-NVFP4
# (launch_b200-dgxc.sh resolves MODEL_PATH for minimaxm3-fp4).
minimaxm3-fp4-b200-vllm:
# Container image, checkpoint id and runner used for this benchmark entry.
image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
model: nvidia/MiniMax-M3-NVFP4
model-prefix: minimaxm3
runner: b200-dgxc
precision: fp4
framework: vllm
multinode: false
scenarios:
fixed-seq-len:
# Each search-space entry describes one sweep: tp / ep parallel sizes, an
# optional dp-attn switch, and a concurrency range from conc-start to conc-end.
# NOTE(review): presumably these reach the recipe script as TP / EP_SIZE /
# DP_ATTENTION / CONC — confirm against the sweep generator.
# 1k input / 1k output.
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
- { tp: 4, conc-start: 1, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
# 8k input / 1k output; concurrency ceilings are lower than in the 1k/1k sweep.
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256 }
- { tp: 4, conc-start: 1, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-b200-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens, drafter pinned
Expand Down
86 changes: 86 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/usr/bin/env bash
#
# Single-node vLLM benchmark recipe: MiniMax-M3 NVFP4 on B200.
# Mirrors minimaxm3_fp8_b200.sh, swapping in the nvidia/MiniMax-M3-NVFP4
# checkpoint. The perf container image already ships MiniMax-M3 modelopt NVFP4
# support (vllm-project/vllm PR #46380), so nothing is patched at runtime.

source "$(dirname "$0")/../../benchmark_lib.sh"

# Inputs this recipe expects from the caller's environment.
check_env_vars MODEL TP EP_SIZE DP_ATTENTION CONC \
  ISL OSL MAX_MODEL_LEN RANDOM_RANGE_RATIO RESULT_FILENAME

# An absolute MODEL is the pre-downloaded path that launch_b200-dgxc.sh
# substitutes; anything else is a bare HF id (b200-cw / b200-nb runners) and
# has to be fetched first.
case "$MODEL" in
  /*) ;;
  *) hf download "$MODEL" ;;
esac

# Under Slurm, record which job and node this run landed on.
if [ -n "${SLURM_JOB_ID}" ]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

# Dump GPU state into the job log before starting the server.
nvidia-smi

# Server stdout/stderr is captured in this file for the whole run.
SERVER_LOG=/workspace/server.log

# vLLM runtime knobs exported to the server process.
# NOTE(review): presumably a one-hour engine start-up allowance and the
# float32 matmul precision mode — confirm against the vLLM build in the image.
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_FLOAT32_MATMUL_PRECISION=high

# Pick the parallelism layout:
#   - DP attention: tensor-parallel size 1, data-parallel size $TP, expert
#     parallel enabled.
#   - EP_SIZE > 1:  tensor-parallel size $TP with expert parallel enabled.
#   - otherwise:    tensor-parallel size $TP only.
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
elif [ "$EP_SIZE" -gt 1 ]; then
  PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
else
  PARALLEL_ARGS="--tensor-parallel-size=$TP"
fi

# Eval-only runs serve with the eval context length instead of the
# benchmark one.
if [ "${EVAL_ONLY}" = "true" ]; then
  setup_eval_context
  MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
start_gpu_monitor

set -x
# Launch the server in the background, logging to $SERVER_LOG.
# PARALLEL_ARGS is a space-separated flag list and must word-split; every other
# expansion is quoted so that a MODEL or log path containing whitespace or glob
# characters reaches vllm as a single argument.
# --block-size 128 is mandatory for MiniMax-M3 (MSA sparse/index cache).
# NOTE(review): PORT is not in the check_env_vars list — presumably supplied by
# benchmark_lib.sh or the runner; confirm.
# shellcheck disable=SC2086
vllm serve "$MODEL" --port "$PORT" \
  $PARALLEL_ARGS \
  --gpu-memory-utilization 0.90 \
  --max-model-len "$MAX_MODEL_LEN" \
  --block-size 128 \
  --language-model-only \
  --max-cudagraph-capture-size 2048 \
  --max-num-batched-tokens "$((ISL * 2))" \
  --stream-interval 20 --no-enable-prefix-caching \
  --trust-remote-code > "$SERVER_LOG" 2>&1 &

# PID of the backgrounded server, handed to the readiness helper below.
SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Drive the benchmark client against the local vLLM server. The prompt count is
# ten times the requested concurrency, and the result file is written under
# /workspace/ using RESULT_FILENAME.
run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend vllm \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$((CONC * 10))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code

# Optional accuracy pass: only when RUN_EVAL is exactly "true", run lm-eval
# against the same server port and append its summary.
if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

# Stop the GPU monitor started before the server launch, then turn off the
# command tracing enabled by the earlier `set -x`.
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4222,6 +4222,15 @@
- "Initial submission: MiniMax-M3 MXFP4 disagg (prefill/decode) on MI355X with vLLM over the MoRI-IO KV connector (8k/1k)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1914

- config-keys:
- minimaxm3-fp4-b200-vllm
description:
- "Add MiniMax-M3 NVFP4 (nvidia/MiniMax-M3-NVFP4) B200 single-node aggregated vLLM benchmark (no spec decode), runner b200-dgxc"
- "Image vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 (bakes in MiniMax-M3 modelopt NVFP4 support, vllm-project/vllm PR #46380; no runtime patch needed)"
- "Weights pre-staged at /scratch/fsw/models/MiniMax-M3-NVFP4 (added minimaxm3-fp4 MODEL_PATH branch to launch_b200-dgxc.sh); --block-size 128 (MSA), --language-model-only"
- "Sweeps tp 4/8 with and without EP and dp-attn at 1k1k and 8k1k, conc 1-1024"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1932

- config-keys:
- minimaxm3-fp8-gb300-dynamo-vllm
description:
Expand Down
4 changes: 4 additions & 0 deletions runners/launch_b200-dgxc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then
# tree (root-owned); it lives in the sa-shared-writable gharunners tree.
export MODEL_PATH="/lustre/fsw/gharunners/models/MiniMax-M3-MXFP8"
export SRT_SLURM_MODEL_PREFIX="minimax-m3-mxfp8"
elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp4" ]]; then
# NVFP4 checkpoint, pre-staged on the b200-dgxc scratch tree.
export MODEL_PATH="/scratch/fsw/models/MiniMax-M3-NVFP4"
export SRT_SLURM_MODEL_PREFIX="minimax-m3-nvfp4"
else
echo "Unsupported model prefix/precision: $MODEL_PREFIX/$PRECISION"
echo "Available models under /lustre/fsw/models:"
Expand Down