diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a255a9d77f..aaa88c634f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -12035,6 +12035,222 @@ minimaxm3-fp8-b300-dynamo-vllm: ep: 8 dp-attn: false +# MiniMax-M3 NVFP4 disagg sweep on the same B300 topology matrix as the MXFP8 +# baseline above. The image includes vLLM PR #46380, so no runtime patch is +# needed. +minimaxm3-fp4-b300-dynamo-vllm: + image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 + model: nvidia/MiniMax-M3-NVFP4 + model-prefix: minimaxm3 + runner: b300 + precision: fp4 + framework: dynamo-vllm + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [4, 16, 64, 128, 4096] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [2048] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [512, 4096] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [32] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 3 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [256, 512] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [16] + prefill: + num-worker: 2 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml" + decode: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [1, 4, 8, 16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml" + decode: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [4096] + prefill: + num-worker: 4 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16, 32, 64, 128] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [16] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [4] + prefill: + num-worker: 1 + tp: 2 + ep: 2 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + # MiniMax-M3 GB300 disagg sweep — refreshed recipe set (no Marlin variants). # All prefill DEP2 (TP1 DP2 EP, 2 GPU/worker). Decode: DEP4, TEP8, DEP8, TEP4. # 4 GPU/node (GB300 NVL72). kv-cache-dtype=fp8. srun_options mem=0 required. diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 0000000000..486af05573 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4x16x64x128x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml new file mode 100644 index 0000000000..532b78a103 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p1d-dep2-tp4-1k1k.yaml @@ -0,0 +1,82 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tp4-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + UCX_TLS: "cuda_ipc,cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_ipc,cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 4 + enable-expert-parallel: false + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml new file mode 100644 index 0000000000..fde8442a18 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/1p2d-dep2-dep4-1k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp4-dep2-dep4-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 4 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "2048" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml new file mode 100644 index 0000000000..ed3b5f9950 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-dep8-1k1k.yaml @@ -0,0 +1,83 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp4-dep2-dep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "512x4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml new file mode 100644 index 0000000000..0784283b91 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p1d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p1d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 2 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "32" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 0000000000..59c52da00c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/2p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml new file mode 100644 index 0000000000..7e9f7dec31 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/1k1k/3p2d-dep2-tep8-1k1k.yaml @@ -0,0 +1,81 @@ +name: "minimax-m3-vllm-disagg-b300-3p2d-fp4-dep2-tep8-1k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 3 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + decode_environment: + UCX_TLS: "cuda_copy,rc" + VLLM_FLOAT32_MATMUL_PRECISION: "high" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 2048 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 2304 + language-model-only: true + stream-interval: 32 + max-num-seqs: 4096 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 8192 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml new file mode 100644 index 0000000000..be2683d0ca --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p1d-dep2-tp4-8k1k.yaml @@ -0,0 +1,86 @@ +name: "minimax-m3-vllm-disagg-b300-1p1d-fp4-dep2-tp4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 1 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + allow_prefill_decode_colocation: true + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_ipc,cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_ipc,cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 4 + enable-expert-parallel: false + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 2048 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "1x4x8x16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 0000000000..5be198f113 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-1p2d-fp4-dep2-tep4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 1 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml new file mode 100644 index 0000000000..90d688f615 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep4-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp4-dep2-tep4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 512 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16x32x64x128" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml new file mode 100644 index 0000000000..2154742821 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/1p4d-dep2-tep8-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-1p4d-fp4-dep2-tep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 4 + prefill_workers: 1 + decode_workers: 4 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 0000000000..c49fd1ccbf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,87 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-dep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "256x512" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml new file mode 100644 index 0000000000..1b8dfd627f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/2p2d-dep2-tep8-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-2p2d-fp4-dep2-tep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 2 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 8 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "16" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml new file mode 100644 index 0000000000..73473aac94 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-dep8-8k1k.yaml @@ -0,0 +1,87 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp4-dep2-dep8-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 2 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 8 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 8 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 # Per DP rank: 2 workers x DP8 = 16 ranks. + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml new file mode 100644 index 0000000000..23c99d3282 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3/b300-fp4/8k1k/4p2d-dep2-tep4-8k1k.yaml @@ -0,0 +1,85 @@ +name: "minimax-m3-vllm-disagg-b300-4p2d-fp4-dep2-tep4-8k1k" + +model: + path: "nvidia/MiniMax-M3-NVFP4" + container: "vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41" + precision: "fp4" + +resources: + gpu_type: "b300" + gpus_per_node: 8 + prefill_nodes: 1 + decode_nodes: 1 + prefill_workers: 4 + decode_workers: 2 + gpus_per_prefill: 2 + gpus_per_decode: 4 + +dynamo: + install: true + version: 1.3.0.dev20260614 + +frontend: + type: dynamo + enable_multiple_frontends: false + +backend: + type: vllm + connector: null + + prefill_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + decode_environment: + VLLM_FLOAT32_MATMUL_PRECISION: high + UCX_TLS: "cuda_copy,rc" + + vllm_config: + prefill: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 1 + data-parallel-size: 2 + data-parallel-rpc-port: 13345 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-cudagraph-capture-size: 2048 + max-num-batched-tokens: 16384 + + decode: + no-enable-flashinfer-autotune: true + tensor-parallel-size: 4 + enable-expert-parallel: true + trust-remote-code: true + no-enable-prefix-caching: true + kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' + attention-config: '{"backend": "FLASHINFER", "use_trtllm_attention": true, "indexer_kv_dtype": "fp8"}' + kv-cache-dtype: fp8 + block-size: 128 + gpu-memory-utilization: 0.90 + max-model-len: 9472 + language-model-only: true + stream-interval: 32 + max-num-seqs: 1024 + max-num-batched-tokens: 16384 + max-cudagraph-capture-size: 4096 + +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + concurrencies: "4096" + req_rate: "inf" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a112c63498..c318f2a2ac 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4335,3 +4335,11 @@ - "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-256); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 16-128)." - "Serving flags are otherwise unchanged." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946 + +- config-keys: + - minimaxm3-fp4-b300-dynamo-vllm + description: + - "Add MiniMax-M3 NVFP4 B300 disaggregated vLLM benchmarks via Dynamo for 1k1k and 8k1k STP (no MTP)" + - "Use nvidia/MiniMax-M3-NVFP4 from /scratch/models/MiniMax-M3-NVFP4 with vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41, which includes vllm-project/vllm PR #46380; no runtime patch needed" + - "Reuse the existing MXFP8 B300 topology and concurrency matrix across 15 srt-slurm recipes, while dropping the FP8-only Marlin override from TP4 decode" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1931 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 7a7008691a..9f6cf1b07c 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -47,11 +47,14 @@ elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" && $FRAMEWORK == " elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M2.5" export SRT_SLURM_MODEL_PREFIX="minimax-m2.5-fp8" +elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp4" && $FRAMEWORK == "dynamo-vllm" ]]; then + export MODEL_PATH="/scratch/models/MiniMax-M3-NVFP4" + export SRT_SLURM_MODEL_PREFIX="nvidia/MiniMax-M3-NVFP4" elif [[ $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" && $FRAMEWORK == "dynamo-vllm" ]]; then export MODEL_PATH="/data/models/MiniMax-M3-MXFP8" export SRT_SLURM_MODEL_PREFIX="MiniMaxAI/MiniMax-M3-MXFP8" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4 with dynamo-vllm, minimaxm2.5-fp4 with dynamo-vllm, minimaxm2.5-fp8 with dynamo-vllm, minimaxm3-fp4 with dynamo-vllm, minimaxm3-fp8 with dynamo-vllm" exit 1 fi @@ -85,18 +88,22 @@ elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm2.5" && $PRECIS git checkout main mkdir -p recipes/vllm/minimax-m2.5-fp8 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m2.5-b300-fp8" recipes/vllm/minimax-m2.5-fp8 -elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && $PRECISION == "fp8" ]]; then +elif [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "minimaxm3" && ( $PRECISION == "fp4" || $PRECISION == "fp8" ) ]]; then git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1 git checkout sa-submission-q2-2026 mkdir -p recipes/vllm/minimax-m3 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/minimax-m3" recipes/vllm/minimax-m3 - SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" + if [[ $PRECISION == "fp8" ]]; then + SRTCTL_SETUP_SCRIPT="minimax-m3-vllm-fixes.sh" + fi # NVIDIA/srt-slurm#38 git show 22d46ba9971615016d2339c9ffbc7b4597accfad --format= -- src/srtctl/core/ip_utils/get_node_ip.sh | git apply - || exit 1 - cp \ - "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ - "configs/$SRTCTL_SETUP_SCRIPT" + if [[ -n "$SRTCTL_SETUP_SCRIPT" ]]; then + cp \ + "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/configs/$SRTCTL_SETUP_SCRIPT" \ + "configs/$SRTCTL_SETUP_SCRIPT" + fi else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" || exit 1