From f23f216b36320757bb3425ab3d026cd33a468447 Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 01:24:20 -0800
Subject: [PATCH 1/4] set correct lm_head_tp_size_upper_bound

Signed-off-by: Lanyu Liao
---
 tensorrt_llm/_torch/utils.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index 5beff19f710..9c744208f90 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -1,4 +1,5 @@
 import contextlib
+import os
 import threading
 from dataclasses import dataclass
 from enum import Enum, IntEnum
@@ -317,9 +318,14 @@ def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
     # Since token_count=256 will hit the boundary of math-bound problem
     # We use 256 // token_count to determine the lm_head_tp_size
     lm_head_tp_size_raw = 256 // token_count
-    lm_head_tp_size = nearest_in_buckets(lm_head_tp_size_raw,
-                                         [1, mapping.gpus_per_node])
-    assert mapping.tp_size % lm_head_tp_size == 0
+    # TODO: On platforms like GB200, setting lm_head_tp_size_upper_bound to world_size could be more efficient when world_size > gpus_per_node; this needs further investigation.
+    lm_head_tp_size_upper_bound = min(mapping.world_size, mapping.gpus_per_node)
+    lm_head_tp_size = int(
+        os.getenv(
+            'LM_HEAD_TP_SIZE',
+            nearest_in_buckets(lm_head_tp_size_raw,
+                               [1, lm_head_tp_size_upper_bound])))
+    assert mapping.tp_size % lm_head_tp_size == 0, f"mapping.tp_size: {mapping.tp_size}, lm_head_tp_size: {lm_head_tp_size}"
     lm_head_pp_size = mapping.pp_size * mapping.tp_size // lm_head_tp_size
 
     return Mapping(

From d694d5724bdbc4c90a0cb9944e262ad8144fdab3 Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 02:11:27 -0800
Subject: [PATCH 2/4] add lm_head TP integration tests to l0_dgx_b200

Signed-off-by: Lanyu Liao
---
 tests/integration/test_lists/test-db/l0_dgx_b200.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 1c2428bba60..6389514cfe2 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -128,6 +128,8 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:

From 73f94e9f5af60d5fe4bbc3e253049974ddc0a3c7 Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 05:55:23 -0800
Subject: [PATCH 3/4] add test case where world_size < gpus_per_node

Signed-off-by: Lanyu Liao
---
 .../defs/accuracy/test_llm_api_pytorch.py        | 18 +++++++++++++++---
 .../test_lists/test-db/l0_dgx_b200.yml           |  3 +--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 33556590044..201e880a036 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2049,6 +2049,18 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          32,
                          "TRTLLM",
                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(4,
+                         1,
+                         4,
+                         3,
+                         False,
+                         True,
+                         True,
+                         True,
+                         True,
+                         16,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(4)),
             pytest.param(8,
                          1,
                          8,
@@ -2124,9 +2136,9 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
         ],
         ids=[
             "latency", "latency_trtllmgen", "latency_adp_lmtp",
-            "latency_trtllmgen_adp_lmtp", "throughput", "throughput_tp8",
-            "throughput_tp4", "throughput_mtp", "throughput_bs8_mtp",
-            "throughput_pp4_mtp"
+            "latency_trtllmgen_adp_lmtp", "latency_adp_lmtp_tp4", "throughput",
+            "throughput_tp8", "throughput_tp4", "throughput_mtp",
+            "throughput_bs8_mtp", "throughput_pp4_mtp"
         ])
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                               fp8kv, attention_dp, enable_lm_head_tp_in_adp,
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 6389514cfe2..d10ad2260bf 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -59,6 +59,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
 - condition:
     ranges:
       system_gpu_count:
@@ -128,8 +129,6 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:

From ce076d64dfcfa682112961d808e8d2a28025e16b Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 22:03:13 -0800
Subject: [PATCH 4/4] add comment for lm_head_tp_size_raw

Signed-off-by: Lanyu Liao
---
 tensorrt_llm/_torch/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index 9c744208f90..1cc0d313cc6 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -317,6 +317,7 @@ def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
     # We use heuristic to determine the lm_head_tp_size
     # Since token_count=256 will hit the boundary of math-bound problem
     # We use 256 // token_count to determine the lm_head_tp_size
+    # For more details, refer to the blog: https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md#mtp-lm-head-tensor-parallelism
     lm_head_tp_size_raw = 256 // token_count
     # TODO: On platforms like GB200, setting lm_head_tp_size_upper_bound to world_size could be more efficient when world_size > gpus_per_node; this needs further investigation.
     lm_head_tp_size_upper_bound = min(mapping.world_size, mapping.gpus_per_node)
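
A minimal standalone sketch of the sizing heuristic introduced in patch 1, so
the bucketing behavior can be sanity-checked without applying the series.
nearest_in_buckets is approximated here (assumed to snap to the nearest bucket
boundary; the real helper in tensorrt_llm/_torch/utils.py may round or clamp
differently), and pick_lm_head_tp_size is a hypothetical name used only for
this illustration.

    import os

    # Assumption: return the bucket boundary closest to `value`; the real
    # implementation may break ties or clamp differently.
    def nearest_in_buckets(value, buckets):
        return min(buckets, key=lambda b: abs(b - value))

    def pick_lm_head_tp_size(token_count, tp_size, world_size, gpus_per_node):
        # token_count=256 is treated as the math-bound boundary, so fewer
        # tokens map to a proportionally larger lm_head TP group.
        lm_head_tp_size_raw = 256 // token_count
        # Cap the TP group within a single node (see the GB200 TODO in patch 1).
        upper_bound = min(world_size, gpus_per_node)
        # LM_HEAD_TP_SIZE, when set in the environment, overrides the heuristic.
        lm_head_tp_size = int(
            os.getenv('LM_HEAD_TP_SIZE',
                      nearest_in_buckets(lm_head_tp_size_raw,
                                         [1, upper_bound])))
        assert tp_size % lm_head_tp_size == 0
        return lm_head_tp_size

    # The case exercised by patch 3: world_size=4 < gpus_per_node=8,
    # so the upper bound is 4 rather than 8.
    print(pick_lm_head_tp_size(token_count=64, tp_size=4, world_size=4,
                               gpus_per_node=8))    # -> 4
    print(pick_lm_head_tp_size(token_count=256, tp_size=4, world_size=4,
                               gpus_per_node=8))    # -> 1

Because the upper bound is min(world_size, gpus_per_node) rather than
gpus_per_node alone, a 4-rank job on an 8-GPU node caps the LM head TP group
at 4, which is exactly what the new latency_adp_lmtp_tp4 test covers.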