diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index b6bd6829701..b05f84459a8 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -28,7 +28,7 @@ FROM base AS devel
 
 #
 # NB: PyTorch requires this to be < 1.0
-ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
+ENV PYTORCH_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Copy all installation scripts at once to reduce layers
 COPY docker/common/install.sh \
diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md
index 8b0b89ec885..b7190cb5d13 100644
--- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md
+++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md
@@ -250,7 +250,7 @@ Here is an example response, showing that the TensorRT LLM server returns “New
 ### Troubleshooting Tips
 
 * If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`.
-  * For running input/output sequence lengths of 8K/1K on H200, there is a known CUDA Out-Of-Memory issue caused by the PyTorch CUDA Caching Allocator fragmenting memory. As a workaround, you can set the environment variable `PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:8192`. For more details, please refer to the [PyTorch documentation on optimizing memory usage](https://docs.pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf).
+  * For running input/output sequence lengths of 8K/1K on H200, there is a known CUDA Out-Of-Memory issue caused by the PyTorch CUDA Caching Allocator fragmenting memory. As a workaround, you can set the environment variable `PYTORCH_ALLOC_CONF=max_split_size_mb:8192`. For more details, please refer to the [PyTorch documentation on optimizing memory usage](https://docs.pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf).
 * Ensure your model checkpoints are compatible with the expected format.
 * For performance issues, check GPU utilization with nvidia-smi while the server is running.
 * If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed.
diff --git a/enroot/Makefile b/enroot/Makefile
index 4f15f028140..29f299e57c9 100644
--- a/enroot/Makefile
+++ b/enroot/Makefile
@@ -39,7 +39,7 @@ run_sqsh:
 		--container-image "$(SQSH_PATH)" \
 		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir $(CODE_DIR) \
 		--container-mount-home --container-remap-root \
-		--export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \
+		--export PYTORCH_ALLOC_CONF=garbage_collection_threshold:0.99999 \
 		$(RUN_CMD)
 endif
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index a412b0b8c7f..1bef8381eaf 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -13,7 +13,7 @@
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
 IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511200955-9055
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511200955-9055
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511200955-9055
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511200955-9055
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511271125-9294
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202511271125-9294
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202511271125-9294
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202511271125-9294
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 8da982aba2b..941f4d905a7 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -961,7 +961,7 @@ def _adjust_torch_mem_fraction():
     # torch.cuda._set_allocator_settings (added in PyTorch 2.8.0-rc1)
     # or a similar API is available, the warning below should be removed
     # and the allocator GC threshold be set via the new API instead.
-    torch_allocator_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    torch_allocator_config = os.environ.get("PYTORCH_ALLOC_CONF", "")
     torch_mem_threshold_advised = (
         torch.cuda.get_allocator_backend() == "native"
         and "expandable_segments:True" not in torch_allocator_config)
@@ -969,7 +969,7 @@ def _adjust_torch_mem_fraction():
     if torch_mem_threshold_advised and not torch_mem_threshold_set:
         logger.warning(
             "It is recommended to incl. 'garbage_collection_threshold:0.???' or 'backend:cudaMallocAsync'"
-            " or 'expandable_segments:True' in PYTORCH_CUDA_ALLOC_CONF.")
+            " or 'expandable_segments:True' in PYTORCH_ALLOC_CONF.")
     # NOTE: Even if a memory threshold was not set (cf. warning above), setting a memory
     # fraction < 1.0 is beneficial, because
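For context, here is a minimal standalone sketch of the allocator-config check that `_adjust_torch_mem_fraction()` performs after this rename. It is illustrative only and not part of the diff; in particular, the fallback to the legacy `PYTORCH_CUDA_ALLOC_CONF` name is an assumption for environments still on older PyTorch builds.

```python
import os

import torch

# Read the allocator settings the way the updated _adjust_torch_mem_fraction() does.
# The legacy PYTORCH_CUDA_ALLOC_CONF fallback is an illustrative assumption for
# older PyTorch builds; the diff itself only reads PYTORCH_ALLOC_CONF.
alloc_conf = (os.environ.get("PYTORCH_ALLOC_CONF")
              or os.environ.get("PYTORCH_CUDA_ALLOC_CONF", ""))

# The warning in _util.py only fires for the native caching allocator when neither
# expandable segments nor a GC threshold is configured; mirror that condition here.
needs_hint = (torch.cuda.get_allocator_backend() == "native"
              and "expandable_segments:True" not in alloc_conf
              and "garbage_collection_threshold:" not in alloc_conf)

if needs_hint:
    print("Consider setting PYTORCH_ALLOC_CONF, e.g. "
          "'garbage_collection_threshold:0.99999', before launching the process.")
```

Exporting the variable before launch, as the Dockerfile `ENV` line and the enroot `--export` flag above now do, avoids the hint entirely.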