Merged
13 changes: 10 additions & 3 deletions tensorrt_llm/_torch/utils.py
@@ -1,4 +1,5 @@
 import contextlib
+import os
 import threading
 from dataclasses import dataclass
 from enum import Enum, IntEnum
@@ -316,10 +317,16 @@ def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
     # We use heuristic to determine the lm_head_tp_size
     # Since token_count=256 will hit the boundary of math-bound problem
     # We use 256 // token_count to determine the lm_head_tp_size
+    # For more details, refer to the blog: https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md#mtp-lm-head-tensor-parallelism
     lm_head_tp_size_raw = 256 // token_count
-    lm_head_tp_size = nearest_in_buckets(lm_head_tp_size_raw,
-                                         [1, mapping.gpus_per_node])
-    assert mapping.tp_size % lm_head_tp_size == 0
+    # TODO: On platforms like GB200, setting lm_head_tp_size_upper_bound to world_size could be more efficient when world_size > gpus_per_node, we need to do further investigation.
+    lm_head_tp_size_upper_bound = min(mapping.world_size, mapping.gpus_per_node)
+    lm_head_tp_size = int(
+        os.getenv(
+            'LM_HEAD_TP_SIZE',
+            nearest_in_buckets(lm_head_tp_size_raw,
+                               [1, lm_head_tp_size_upper_bound])))
+    assert mapping.tp_size % lm_head_tp_size == 0, f"mapping.tp_size: {mapping.tp_size}, lm_head_tp_size: {lm_head_tp_size}"
     lm_head_pp_size = mapping.pp_size * mapping.tp_size // lm_head_tp_size
 
     return Mapping(
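For readers skimming the diff, here is a self-contained sketch of the new selection logic. It assumes nearest_in_buckets simply clamps a value into the [1, upper_bound] range; the real helper in tensorrt_llm/_torch/utils.py may snap to intermediate buckets, so treat this as an illustration of the heuristic rather than the repository implementation.

import os

def nearest_in_buckets(x: int, buckets: list[int]) -> int:
    # Stand-in with assumed semantics: clamp x into [min(buckets), max(buckets)].
    return max(min(buckets), min(x, max(buckets)))

def pick_lm_head_tp_size(token_count: int, world_size: int,
                         gpus_per_node: int, tp_size: int) -> int:
    # token_count=256 is treated as the math-bound boundary, so the heuristic
    # shrinks the LM-head TP group as the token count grows.
    lm_head_tp_size_raw = 256 // token_count
    upper_bound = min(world_size, gpus_per_node)
    # The LM_HEAD_TP_SIZE environment variable, when set, overrides the heuristic.
    lm_head_tp_size = int(
        os.getenv('LM_HEAD_TP_SIZE',
                  nearest_in_buckets(lm_head_tp_size_raw, [1, upper_bound])))
    assert tp_size % lm_head_tp_size == 0
    return lm_head_tp_size

print(pick_lm_head_tp_size(token_count=16, world_size=8, gpus_per_node=8, tp_size=8))   # -> 8
print(pick_lm_head_tp_size(token_count=256, world_size=8, gpus_per_node=8, tp_size=8))  # -> 1

Note that the strengthened assert in the diff surfaces both values in the failure message, since lm_head_tp_size must divide mapping.tp_size evenly.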
18 changes: 15 additions & 3 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2049,6 +2049,18 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          32,
                          "TRTLLM",
                          marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(4,
+                         1,
+                         4,
+                         3,
+                         False,
+                         True,
+                         True,
+                         True,
+                         True,
+                         16,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(4)),
             pytest.param(8,
                          1,
                          8,
@@ -2124,9 +2136,9 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
         ],
         ids=[
             "latency", "latency_trtllmgen", "latency_adp_lmtp",
-            "latency_trtllmgen_adp_lmtp", "throughput", "throughput_tp8",
-            "throughput_tp4", "throughput_mtp", "throughput_bs8_mtp",
-            "throughput_pp4_mtp"
+            "latency_trtllmgen_adp_lmtp", "latency_adp_lmtp_tp4", "throughput",
+            "throughput_tp8", "throughput_tp4", "throughput_mtp",
+            "throughput_bs8_mtp", "throughput_pp4_mtp"
         ])
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, enable_lm_head_tp_in_adp,
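The new case's id, latency_adp_lmtp_tp4, maps the leading tuple entries positionally onto the visible signature above (tp_size=4, pp_size=1, ep_size=4, mtp_nextn=3, fp8kv=False, attention_dp=True, enable_lm_head_tp_in_adp=True; the remaining values correspond to parameters truncated from this view). Assuming a plain pytest invocation on a node with at least 4 GPUs (the integration harness may wrap this differently), the case can be selected directly:

pytest "tests/integration/defs/accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]"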
1 change: 1 addition & 0 deletions tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -59,6 +59,7 @@ l0_dgx_b200:
       - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
       - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
       - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
+      - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
 - condition:
     ranges:
       system_gpu_count: