33 changes: 26 additions & 7 deletions jenkins/L0_Test.groovy
@@ -1,4 +1,4 @@
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
@Library(['bloom-jenkins-shared-lib@emma/add_spark_for_slurm', 'trtllm-jenkins-shared-lib@main']) _

import java.lang.InterruptedException
import groovy.transform.Field
@@ -100,7 +100,7 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
REQUIRED_OPEN_DRIVER_TYPES = ["b100-ts2", "rtx-5080", "rtx-5090", "rtx-pro-6000", "rtx-pro-6000d"]

// GPU types that don't support dynamic driver flashing
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]

// ENABLE_NGC_DEVEL_IMAGE_TEST is currently disabled in the Jenkins BuildDockerImageSanityTest job config
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
@@ -520,6 +520,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

def slurmJobID = null
def dockerArgs = null
def hasgdrdrv = false

try {
// Run ssh command to start node in desired cluster via SLURM
@@ -642,6 +643,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
echo "--gpus ${gpuCount}"
fi
""", returnStdout: true).trim()
if (fileExists('/dev/gdrdrv')) {
hasgdrdrv = true
}
}

dockerArgs = "${dockerArgs} " +
@@ -657,7 +661,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
if (hasgdrdrv) {
dockerArgs += " --device=/dev/gdrdrv:/dev/gdrdrv"
}
}
echo "Final dockerArgs: ${dockerArgs}"
} else {
Expand Down Expand Up @@ -1253,7 +1259,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu

def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMode = false)
{
def targetCould = "kubernetes-cpu"
def targetCloud = "kubernetes-cpu"
def selectors = """
nvidia.com/node_type: builder
kubernetes.io/arch: ${arch}
@@ -1265,6 +1271,7 @@

def archSuffix = arch == "arm64" ? "arm" : "amd"
def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"
println "Using type: ${type} to create Kubernetes Pod config"

switch(type)
{
@@ -1344,14 +1351,24 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
def gpuType = KubernetesManager.selectGPU(type)
nodeLabelPrefix = type

targetCould = "kubernetes"
targetCloud = "kubernetes"

// The following GPU types don't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
selectors = """
if (type == "gb10x") {
targetCloud = "nvks-sparks-cloud"
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
nvidia.com/gpu.machine: NVIDIA_DGX_Spark
nvidia.com/tenant: blossom_trt"""
}
else {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
nvidia.com/gpu_type: ${gpuType}"""
}
} else if (perfMode && !hasMultipleGPUs) {
// Use single GPU machine with "tensorrt/test_type: perf" for stable perf testing.
// H100 / A100 single GPU machine has this unique label in TensorRT Blossom pool.
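To summarize the branch added above: gb10x (NVIDIA DGX Spark) nodes are scheduled onto a separate "nvks-sparks-cloud" and matched by machine/tenant labels, while the other no-driver-flash types keep the usual nvidia.com/gpu_type selector. A hedged Python sketch of that selection logic follows; the label values come from the diff, but the helper itself and its fallback branch are illustrative only.

```python
# Illustrative only: mirrors the Groovy branch above in plain Python.
REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200", "gb10x"]

def pick_cloud_and_selectors(gpu_type: str, arch: str, resolved_gpu_label: str):
    """Return (target_cloud, node_selector_dict) for a pool that cannot flash drivers."""
    if any(t in gpu_type for t in REQUIRED_NO_DRIVER_TYPES):
        if gpu_type == "gb10x":
            # DGX Spark nodes live in their own cloud and are selected by machine/tenant labels.
            return "nvks-sparks-cloud", {
                "kubernetes.io/arch": arch,
                "kubernetes.io/os": "linux",
                "nvidia.com/gpu.machine": "NVIDIA_DGX_Spark",
                "nvidia.com/tenant": "blossom_trt",
            }
        return "kubernetes", {
            "kubernetes.io/arch": arch,
            "kubernetes.io/os": "linux",
            "nvidia.com/gpu_type": resolved_gpu_label,
        }
    # Simplified fallback; the real pipeline has further perfMode / multi-GPU branches.
    return "kubernetes", {"kubernetes.io/arch": arch, "kubernetes.io/os": "linux"}

print(pick_cloud_and_selectors("gb10x", "arm64", "gb10x"))
```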
@@ -1435,7 +1452,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
}

def podConfig = [
cloud: targetCould,
cloud: targetCloud,
namespace: "sw-tensorrt",
label: nodeLabel,
yaml: """
@@ -2774,12 +2791,14 @@ def launchTestJobs(pipeline, testFilter)
// The total machine time is scaled proportionally according to the number of each GPU.
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
"GB10-PyTorch-1": ["gb10x", "l0_gb10", 1, 1],
]
fullSet += SBSATestConfigs.keySet()

SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB10-TensorRT-Post-Merge-1": ["gb10x", "l0_gb10", 1, 1],
// Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
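The stage tables above map a stage name to a small config list. Reading the entries (e.g. ["gb10x", "l0_gb10", 1, 1] and ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4]), the fields appear to be platform label, test-db list, shard index, shard count, and an optional GPU count, but that interpretation is an assumption rather than something stated in the diff. A hypothetical parser under that assumption:

```python
# Hypothetical helper illustrating how the stage tables above appear to be laid out.
# The field order is inferred from the entries in the diff, not a documented contract.
from dataclasses import dataclass

@dataclass
class StageConfig:
    platform: str
    test_list: str
    shard_index: int
    shard_count: int
    gpu_count: int = 1

def parse_stage(values: list) -> StageConfig:
    # A fifth element, when present, appears to be the GPU count per node.
    return StageConfig(values[0], values[1], values[2], values[3],
                       values[4] if len(values) > 4 else 1)

print(parse_stage(["gb10x", "l0_gb10", 1, 1]))
print(parse_stage(["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4]))
```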
7 changes: 6 additions & 1 deletion tests/integration/defs/sysinfo/get_sysinfo.py
@@ -191,7 +191,12 @@ def construct_gpu_properties(mako_opts, device_index=0):
assert gpu_name != "", "device_product_name is empty after removing substring 'NVIDIA' and leading/trailing whitespaces."

compute_capability = get_compute_capability(device_index)
gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
try:
    gpu_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
except pynvml.NVMLError_NotSupported as e:
    logger.warning("Unable to get GPU memory info: {}".format(e))
    gpu_memory = 8 * 1024**3
# Gather GPU information
mako_opt_dict["gpu"] = gpu_name
mako_opt_dict["gpu_memory"] = gpu_memory
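The change above tolerates devices whose NVML reports "Not Supported" for memory info (the literal fallback in the diff is 8 * 1024**3). Below is a self-contained sketch of the same fallback that keeps everything in MiB for consistency; it assumes pynvml is installed and at least one NVIDIA device is visible, and it is not the harness code itself.

```python
import logging
import pynvml

logger = logging.getLogger(__name__)

def query_gpu_memory_mib(device_index: int = 0) -> float:
    """Return total GPU memory in MiB, falling back when NVML reports Not Supported."""
    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        try:
            # Integrated or unified-memory parts can raise NVMLError_NotSupported here.
            return pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024**2)
        except pynvml.NVMLError_NotSupported as e:
            logger.warning("Unable to get GPU memory info: %s", e)
            return 8 * 1024  # assume 8 GiB, expressed here in MiB
    finally:
        pynvml.nvmlShutdown()

if __name__ == "__main__":
    print(query_gpu_memory_mib())
```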
32 changes: 32 additions & 0 deletions tests/integration/test_lists/test-db/l0_gb10.yml
@@ -0,0 +1,32 @@
version: 0.0.1
l0_gb10:
- condition:
    ranges:
      system_gpu_count:
        gte: 1
        lte: 1
    wildcards:
      gpu:
      - '*gb10*'
      linux_distribution_name: ubuntu*
      cpu: aarch64
    terms:
      stage: post_merge
      backend: tensorrt
  tests:
  - llmapi/test_llm_examples.py::test_llmapi_quickstart_atexit
- condition:
    ranges:
      system_gpu_count:
        gte: 1
        lte: 1
    wildcards:
      gpu:
      - '*gb10*'
      linux_distribution_name: ubuntu*
      cpu: aarch64
    terms:
      stage: pre_merge
      backend: pytorch
  tests:
  - unittest/utils/test_util.py
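For context, each condition block above pairs machine constraints (ranges, wildcards, terms) with a list of tests. The sketch below shows one way such a block could be matched against a machine profile; the matching semantics (fnmatch-style wildcards, inclusive gte/lte ranges, exact-match terms) are assumptions about the test-db format, not taken from its implementation.

```python
# Illustrative matcher for a test-db condition block; semantics are assumed, not
# copied from the real test-db tooling.
from fnmatch import fnmatch

condition = {
    "ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
    "wildcards": {
        "gpu": ["*gb10*"],
        "linux_distribution_name": "ubuntu*",
        "cpu": "aarch64",
    },
    "terms": {"stage": "post_merge", "backend": "tensorrt"},
}

profile = {
    "system_gpu_count": 1,
    "gpu": "gb10",
    "linux_distribution_name": "ubuntu24.04",
    "cpu": "aarch64",
    "stage": "post_merge",
    "backend": "tensorrt",
}

def matches(cond: dict, prof: dict) -> bool:
    # Numeric ranges are treated as inclusive bounds.
    for key, bounds in cond.get("ranges", {}).items():
        value = prof[key]
        if value < bounds.get("gte", value) or value > bounds.get("lte", value):
            return False
    # Wildcards accept either a single pattern or a list of patterns.
    for key, patterns in cond.get("wildcards", {}).items():
        patterns = patterns if isinstance(patterns, list) else [patterns]
        if not any(fnmatch(str(prof[key]), p) for p in patterns):
            return False
    # Terms must match exactly.
    return all(prof.get(k) == v for k, v in cond.get("terms", {}).items())

print(matches(condition, profile))  # True for a single-GPU GB10 Ubuntu aarch64 machine
```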