From d7dc9b2c78aafd585be673a9e592e56cb35386cb Mon Sep 17 00:00:00 2001 From: Jirka B Date: Fri, 25 Jul 2025 14:04:27 +0200 Subject: [PATCH 01/25] ci: split common & distributed --- .github/workflows/ci-tests.yml | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 04f8d027c00..654bc4b6f33 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -45,19 +45,26 @@ jobs: - "2.5.1" - "2.6.0" - "2.7.1" + testing: ["common", "distributed"] include: # cover additional python and PT combinations - - { os: "ubuntu-22.04", python-version: "3.9", pytorch-version: "2.0.1", requires: "oldest" } - - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.6.0" } - - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.7.1" } + - { os: "ubuntu-22.04", python-version: "3.9", pytorch-version: "2.0.1", requires: "oldest", testing: "common" } + - { os: "ubuntu-22.04", python-version: "3.9", pytorch-version: "2.0.1", requires: "oldest", testing: "distributed" } + - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.7.1", testing: "common" } + - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.7.1", testing: "distributed" } # standard mac machine, not the M1 - - { os: "macOS-13", python-version: "3.10", pytorch-version: "2.0.1" } + - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "common" } + - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "distributed" } # using the ARM based M1 machine - - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1" } - - { os: "macOS-14", python-version: "3.12", pytorch-version: "2.7.1" } + - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "common" } + - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "distributed" } + - { os: "macOS-14", python-version: "3.12", pytorch-version: "2.7.1", testing: "common" } + - { os: "macOS-14", python-version: "3.12", pytorch-version: "2.7.1", testing: "distributed" } # some windows - - { os: "windows-2022", python-version: "3.10", pytorch-version: "2.0.1" } - - { os: "windows-2022", python-version: "3.12", pytorch-version: "2.7.1" } + - { os: "windows-2022", python-version: "3.10", pytorch-version: "2.0.1", testing: "common" } + - { os: "windows-2022", python-version: "3.10", pytorch-version: "2.0.1", testing: "distributed" } + - { os: "windows-2022", python-version: "3.12", pytorch-version: "2.7.1", testing: "common" } + - { os: "windows-2022", python-version: "3.12", pytorch-version: "2.7.1", testing: "distributed" } # Future released version #- { os: "ubuntu-22.04", python-version: "3.11", pytorch-version: "2.8.0" } #- { os: "macOS-14", python-version: "3.11", pytorch-version: "2.8.0" } @@ -167,7 +174,7 @@ jobs: - name: Unittests common # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' - if: ${{ env.TEST_DIRS != '' }} + if: ${{ env.TEST_DIRS != '' && matrix.testing == 'common' }} working-directory: ./tests run: | python -m pytest \ @@ -183,7 +190,7 @@ jobs: - name: Unittests DDP # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' - if: ${{ env.TEST_DIRS != '' }} + if: ${{ env.TEST_DIRS != '' && matrix.testing == 'distributed' }} working-directory: ./tests env: USE_PYTEST_POOL: "1" From bf756d3ee93ec5671f3d0ccac14f9ed35c984bb8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Jul 2025 12:06:35 +0000 Subject: [PATCH 02/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/workflows/ci-tests.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 654bc4b6f33..9dca7e4f4ed 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -48,8 +48,20 @@ jobs: testing: ["common", "distributed"] include: # cover additional python and PT combinations - - { os: "ubuntu-22.04", python-version: "3.9", pytorch-version: "2.0.1", requires: "oldest", testing: "common" } - - { os: "ubuntu-22.04", python-version: "3.9", pytorch-version: "2.0.1", requires: "oldest", testing: "distributed" } + - { + os: "ubuntu-22.04", + python-version: "3.9", + pytorch-version: "2.0.1", + requires: "oldest", + testing: "common", + } + - { + os: "ubuntu-22.04", + python-version: "3.9", + pytorch-version: "2.0.1", + requires: "oldest", + testing: "distributed", + } - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.7.1", testing: "common" } - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.7.1", testing: "distributed" } # standard mac machine, not the M1 From dfeee3d261cfa40f85bc8f530f5039f309a875db Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 4 Aug 2025 08:19:01 +0200 Subject: [PATCH 03/25] try random port instead --- tests/unittests/conftest.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 65c086cb9ba..be221996de8 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -56,14 +56,11 @@ def setup_ddp(rank, world_size): world_size: the number of processes """ - global CURRENT_PORT + import random + port = random.randint(10000, 20000) # noqa: S311 os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(CURRENT_PORT) - - CURRENT_PORT += 1 - if CURRENT_PORT > MAX_PORT: - CURRENT_PORT = START_PORT + os.environ["MASTER_PORT"] = str(port) if torch.distributed.group.WORLD is not None: # if already initialized, destroy the process group torch.distributed.destroy_process_group() From c17511db63fbcb5152e5881c28a0c82e6aabf45a Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 4 Aug 2025 09:13:18 +0200 Subject: [PATCH 04/25] try instead a free socket approach --- tests/unittests/conftest.py | 17 ++++++++++------- tests/unittests/image/test_ms_ssim.py | 4 ++-- tests/unittests/image/test_ssim.py | 4 ++-- tests/unittests/utilities/test_utilities.py | 14 -------------- 4 files changed, 14 insertions(+), 25 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index be221996de8..746a3e66c26 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -13,6 +13,7 @@ # limitations under the License. import contextlib import os +import socket import sys import pytest @@ -30,9 +31,6 @@ EXTRA_DIM = 3 THRESHOLD = 0.5 -MAX_PORT = 8100 -START_PORT = 8088 -CURRENT_PORT = START_PORT USE_PYTEST_POOL = os.getenv("USE_PYTEST_POOL", "0") == "1" @@ -44,6 +42,14 @@ def use_deterministic_algorithms(): torch.use_deterministic_algorithms(False) +def get_free_port(): + """Find an available free port on localhost.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) # Bind to a free port provided by the OS + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + def setup_ddp(rank, world_size): """Initialize ddp environment. @@ -56,11 +62,8 @@ def setup_ddp(rank, world_size): world_size: the number of processes """ - import random - - port = random.randint(10000, 20000) # noqa: S311 os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(port) + os.environ["MASTER_PORT"] = str(get_free_port()) if torch.distributed.group.WORLD is not None: # if already initialized, destroy the process group torch.distributed.destroy_process_group() diff --git a/tests/unittests/image/test_ms_ssim.py b/tests/unittests/image/test_ms_ssim.py index a5ba60bf2fd..c7501ff1c4e 100644 --- a/tests/unittests/image/test_ms_ssim.py +++ b/tests/unittests/image/test_ms_ssim.py @@ -24,8 +24,8 @@ from unittests import NUM_BATCHES, _Input from unittests._helpers import seed_all from unittests._helpers.testers import MetricTester +from unittests.conftest import get_free_port from unittests.image import cleanup_ddp, setup_ddp -from unittests.utilities.test_utilities import find_free_port seed_all(42) @@ -137,7 +137,7 @@ def test_ms_ssim_reduction_none_ddp(): """ world_size = 2 - free_port = find_free_port() + free_port = get_free_port() if free_port == -1: pytest.skip("No free port available for DDP test.") mp.spawn(_run_ms_ssim_ddp, args=(world_size, free_port), nprocs=world_size, join=True) diff --git a/tests/unittests/image/test_ssim.py b/tests/unittests/image/test_ssim.py index 6bfc642bd2b..a3e0e1ba299 100644 --- a/tests/unittests/image/test_ssim.py +++ b/tests/unittests/image/test_ssim.py @@ -27,8 +27,8 @@ from unittests import NUM_BATCHES, _Input from unittests._helpers import seed_all from unittests._helpers.testers import MetricTester +from unittests.conftest import get_free_port from unittests.image import cleanup_ddp, setup_ddp -from unittests.utilities.test_utilities import find_free_port seed_all(42) @@ -392,7 +392,7 @@ def test_ssim_reduction_none_ddp(): """ world_size = 2 - free_port = find_free_port() + free_port = get_free_port() if free_port == -1: pytest.skip("No free port available for DDP test.") mp.spawn(_run_ssim_ddp, args=(world_size, free_port), nprocs=world_size, join=True) diff --git a/tests/unittests/utilities/test_utilities.py b/tests/unittests/utilities/test_utilities.py index e34bbda2662..6f9f5d62843 100644 --- a/tests/unittests/utilities/test_utilities.py +++ b/tests/unittests/utilities/test_utilities.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import socket import sys import numpy as np @@ -19,7 +18,6 @@ import torch from lightning_utilities.test.warning import no_warning_call from torch import tensor -from unittests.conftest import MAX_PORT, START_PORT from torchmetrics.regression import MeanSquaredError, PearsonCorrCoef from torchmetrics.utilities import check_forward_full_state_property, rank_zero_debug, rank_zero_info, rank_zero_warn @@ -241,15 +239,3 @@ def test_half_precision_top_k_cpu_raises_error(): x = torch.randn(100, 10, dtype=torch.half) with pytest.raises(RuntimeError, match="\"topk_cpu\" not implemented for 'Half'"): torch.topk(x, k=3, dim=1) - - -def find_free_port(start=START_PORT, end=MAX_PORT): - """Returns an available localhost port in the given range or returns -1 if no port available.""" - for port in range(start, end + 1): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - s.bind(("localhost", port)) - return port - except OSError: - continue - return -1 From e2af42077fc4a60b2a985ce4d01deb2089960ac1 Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Mon, 4 Aug 2025 12:57:12 +0200 Subject: [PATCH 05/25] fix ports not syncing --- tests/unittests/bases/test_ddp.py | 13 +++++++------ tests/unittests/conftest.py | 14 +++++++++++--- tests/unittests/image/__init__.py | 17 ----------------- tests/unittests/image/test_ms_ssim.py | 3 +-- tests/unittests/image/test_ssim.py | 3 +-- 5 files changed, 20 insertions(+), 30 deletions(-) diff --git a/tests/unittests/bases/test_ddp.py b/tests/unittests/bases/test_ddp.py index ad29b0f4c6e..1eecad747d9 100644 --- a/tests/unittests/bases/test_ddp.py +++ b/tests/unittests/bases/test_ddp.py @@ -27,7 +27,7 @@ from unittests import NUM_PROCESSES, USE_PYTEST_POOL from unittests._helpers import seed_all from unittests._helpers.testers import DummyListMetric, DummyMetric, DummyMetricSum -from unittests.conftest import setup_ddp +from unittests.conftest import get_free_port, setup_ddp seed_all(42) @@ -106,9 +106,9 @@ def test_ddp(process): pytest.pool.map(process, range(NUM_PROCESSES)) -def _test_ddp_gather_all_autograd_same_shape(rank: int, worldsize: int = NUM_PROCESSES) -> None: +def _test_ddp_gather_all_autograd_same_shape(rank: int, worldsize: int, port: int) -> None: """Test that ddp gather preserves local rank's autograd graph for same-shaped tensors across ranks.""" - setup_ddp(rank, worldsize) + setup_ddp(rank, worldsize, port) x = (rank + 1) * torch.ones(10, requires_grad=True) # random linear transformation, it should really not matter what we do here @@ -121,9 +121,9 @@ def _test_ddp_gather_all_autograd_same_shape(rank: int, worldsize: int = NUM_PRO assert torch.allclose(grad, a * torch.ones_like(x)) -def _test_ddp_gather_all_autograd_different_shape(rank: int, worldsize: int = NUM_PROCESSES) -> None: +def _test_ddp_gather_all_autograd_different_shape(rank: int, worldsize: int, port: int) -> None: """Test that ddp gather preserves local rank's autograd graph for differently-shaped tensors across ranks.""" - setup_ddp(rank, worldsize) + setup_ddp(rank, worldsize, port) x = (rank + 1) * torch.ones(rank + 1, 2 - rank, requires_grad=True) # random linear transformation, it should really not matter what we do here @@ -144,7 +144,8 @@ def _test_ddp_gather_all_autograd_different_shape(rank: int, worldsize: int = NU ) def test_ddp_autograd(process): """Test ddp functions for autograd compatibility.""" - pytest.pool.map(process, range(NUM_PROCESSES)) + port = get_free_port() + pytest.pool.starmap(process, [(rank, NUM_PROCESSES, port) for rank in range(NUM_PROCESSES)]) def _test_non_contiguous_tensors(rank): diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 746a3e66c26..dde15ca3a90 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -50,7 +50,7 @@ def get_free_port(): return s.getsockname()[1] -def setup_ddp(rank, world_size): +def setup_ddp(rank, world_size, master_port): """Initialize ddp environment. If a particular test relies on the order of the processes in the pool to be [0, 1, 2, ...], then this function @@ -60,10 +60,11 @@ def setup_ddp(rank, world_size): Args: rank: the rank of the process world_size: the number of processes + master_port: the port to use for the master process """ os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(get_free_port()) + os.environ["MASTER_PORT"] = str(master_port) if torch.distributed.group.WORLD is not None: # if already initialized, destroy the process group torch.distributed.destroy_process_group() @@ -72,12 +73,19 @@ def setup_ddp(rank, world_size): torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size) +def cleanup_ddp(): + """Clean up the DDP process group if initialized.""" + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + def pytest_sessionstart(): """Global initialization of multiprocessing pool; runs before any test.""" if not USE_PYTEST_POOL: return + port = get_free_port() pool = Pool(processes=NUM_PROCESSES) - pool.starmap(setup_ddp, [(rank, NUM_PROCESSES) for rank in range(NUM_PROCESSES)]) + pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, port) for rank in range(NUM_PROCESSES)]) pytest.pool = pool diff --git a/tests/unittests/image/__init__.py b/tests/unittests/image/__init__.py index 8eea7d284b8..13ab22f7140 100644 --- a/tests/unittests/image/__init__.py +++ b/tests/unittests/image/__init__.py @@ -13,23 +13,6 @@ # limitations under the License. import os -import torch -import torch.distributed as dist - from unittests import _PATH_ALL_TESTS _SAMPLE_IMAGE = os.path.join(_PATH_ALL_TESTS, "_data", "image", "i01_01_5.bmp") - - -def setup_ddp(rank: int, world_size: int, free_port: int): - """Set up DDP with a free port and assign CUDA device to the given rank.""" - os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(free_port) - dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) - torch.cuda.set_device(rank) - - -def cleanup_ddp(): - """Clean up the DDP process group if initialized.""" - if dist.is_initialized(): - dist.destroy_process_group() diff --git a/tests/unittests/image/test_ms_ssim.py b/tests/unittests/image/test_ms_ssim.py index c7501ff1c4e..8995d8a6b2c 100644 --- a/tests/unittests/image/test_ms_ssim.py +++ b/tests/unittests/image/test_ms_ssim.py @@ -24,8 +24,7 @@ from unittests import NUM_BATCHES, _Input from unittests._helpers import seed_all from unittests._helpers.testers import MetricTester -from unittests.conftest import get_free_port -from unittests.image import cleanup_ddp, setup_ddp +from unittests.conftest import cleanup_ddp, get_free_port, setup_ddp seed_all(42) diff --git a/tests/unittests/image/test_ssim.py b/tests/unittests/image/test_ssim.py index a3e0e1ba299..352b33c5fe1 100644 --- a/tests/unittests/image/test_ssim.py +++ b/tests/unittests/image/test_ssim.py @@ -27,8 +27,7 @@ from unittests import NUM_BATCHES, _Input from unittests._helpers import seed_all from unittests._helpers.testers import MetricTester -from unittests.conftest import get_free_port -from unittests.image import cleanup_ddp, setup_ddp +from unittests.conftest import cleanup_ddp, get_free_port, setup_ddp seed_all(42) From 6c92ed981b5ba70ea375fd470a736360af673ded Mon Sep 17 00:00:00 2001 From: Nicki Skafte Date: Tue, 5 Aug 2025 07:02:27 +0200 Subject: [PATCH 06/25] try limit number of parallel jobs for debugging --- .github/workflows/ci-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 9dca7e4f4ed..cfff7987d7e 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -33,6 +33,7 @@ jobs: needs: check-diff strategy: fail-fast: false + max-parallel: 2 matrix: os: ["ubuntu-22.04"] python-version: ["3.10"] From 86f33bd0fd68553f865826a157d94f11a67a470c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:07:13 +0000 Subject: [PATCH 07/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/unittests/utilities/test_utilities.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unittests/utilities/test_utilities.py b/tests/unittests/utilities/test_utilities.py index dc1d4563a52..00ffa53b3a1 100644 --- a/tests/unittests/utilities/test_utilities.py +++ b/tests/unittests/utilities/test_utilities.py @@ -19,7 +19,6 @@ from lightning_utilities.test.warning import no_warning_call from torch import tensor from unittests._helpers import _IS_WINDOWS -from unittests.conftest import MAX_PORT, START_PORT from torchmetrics.regression import MeanSquaredError, PearsonCorrCoef from torchmetrics.utilities import check_forward_full_state_property, rank_zero_debug, rank_zero_info, rank_zero_warn From 0e502a64c262baf10ddbc185a9d2d97d4ff7234e Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 6 Aug 2025 21:37:55 +0200 Subject: [PATCH 08/25] pytest --- .github/workflows/ci-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index cfff7987d7e..b75a3d32930 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -190,7 +190,7 @@ jobs: if: ${{ env.TEST_DIRS != '' && matrix.testing == 'common' }} working-directory: ./tests run: | - python -m pytest \ + pytest \ $TEST_DIRS \ --cov=torchmetrics \ --durations=50 \ @@ -208,7 +208,7 @@ jobs: env: USE_PYTEST_POOL: "1" run: | - python -m pytest -v \ + pytest -v \ $TEST_DIRS \ --cov=torchmetrics \ --durations=50 \ From 913cea6ed97d37296005623422b591b00c79fef0 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 6 Aug 2025 21:54:47 +0200 Subject: [PATCH 09/25] gpu --- .azure/gpu-unittests.yml | 25 ++++++++++++++++--------- .github/workflows/ci-tests.yml | 1 - 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/.azure/gpu-unittests.yml b/.azure/gpu-unittests.yml index 12b45a0ed53..f6f4dae5648 100644 --- a/.azure/gpu-unittests.yml +++ b/.azure/gpu-unittests.yml @@ -24,11 +24,18 @@ jobs: - job: unitest_GPU strategy: matrix: - "PyTorch | 2.0 oldest": + "PyTorch | 2.0 oldest | commonn": # Torch does not have build wheels with old Torch versions for newer CUDA docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" torch-ver: "2.0" - "PyTorch | 2.X stable": + "PyTorch | 2.0 oldest | DDP": + # Torch does not have build wheels with old Torch versions for newer CUDA + docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" + torch-ver: "2.0" + "PyTorch | 2.X stable | commonn": + docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" + torch-ver: "2.7" + "PyTorch | 2.X stable | DDP": docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" torch-ver: "2.7" #"PyTorch | 2.X future": @@ -123,7 +130,7 @@ jobs: - bash: | python .github/assistant.py set-oldest-versions - condition: eq(variables['torch-ver'], '2.0') + condition: contains(variables['Agent.JobName'], 'oldest') displayName: "Setting oldest versions" - bash: | @@ -185,27 +192,27 @@ jobs: - bash: | du -h --max-depth=1 . - python -m pytest $(TEST_DIRS) \ + pytest $(TEST_DIRS) \ -m "not DDP" --numprocesses=6 --dist=loadfile \ --cov=torchmetrics --timeout=240 --durations=100 \ --reruns 3 --reruns-delay 1 workingDirectory: "tests/" # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' - condition: and(succeeded(), ne(variables['TEST_DIRS'], '')) + condition: and(succeeded(), ne(variables['TEST_DIRS'], ''), contains(variables['Agent.JobName'], 'commonn')) timeoutInMinutes: "120" - displayName: "UnitTesting common" + displayName: "Testing common" - bash: | - python -m pytest $(TEST_DIRS) -v \ + pytest $(TEST_DIRS) -v \ --cov=torchmetrics -m "DDP" \ --timeout=240 --durations=100 env: USE_PYTEST_POOL: "1" workingDirectory: "tests/" # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' - condition: and(succeeded(), ne(variables['TEST_DIRS'], '')) + condition: and(succeeded(), ne(variables['TEST_DIRS'], ''), contains(variables['Agent.JobName'], 'DDP')) timeoutInMinutes: "120" - displayName: "UnitTesting DDP" + displayName: "Testing DDP" - bash: | du -h --max-depth=1 tests/ diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index b75a3d32930..e8e14dd789a 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -33,7 +33,6 @@ jobs: needs: check-diff strategy: fail-fast: false - max-parallel: 2 matrix: os: ["ubuntu-22.04"] python-version: ["3.10"] From 5637d3b836f3888f845b5f70a44446292b3bcbc3 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 6 Aug 2025 22:40:28 +0200 Subject: [PATCH 10/25] doctest --- .azure/gpu-unittests.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.azure/gpu-unittests.yml b/.azure/gpu-unittests.yml index f6f4dae5648..7812224a9e7 100644 --- a/.azure/gpu-unittests.yml +++ b/.azure/gpu-unittests.yml @@ -28,10 +28,17 @@ jobs: # Torch does not have build wheels with old Torch versions for newer CUDA docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" torch-ver: "2.0" + "PyTorch | 2.0 oldest | doctest": + # Torch does not have build wheels with old Torch versions for newer CUDA + docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" + torch-ver: "2.0" "PyTorch | 2.0 oldest | DDP": # Torch does not have build wheels with old Torch versions for newer CUDA docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" torch-ver: "2.0" + "PyTorch | 2.X stable | doctest": + docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" + torch-ver: "2.7" "PyTorch | 2.X stable | commonn": docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" torch-ver: "2.7" @@ -155,6 +162,7 @@ jobs: --timeout=240 --durations=50 \ --reruns 2 --reruns-delay 1 # --numprocesses=5 --dist=loadfile + condition: contains(variables['Agent.JobName'], 'doctest') env: DOCTEST_DOWNLOAD_TIMEOUT: "180" SKIP_SLOW_DOCTEST: "1" From a0ad4885ef90081d37eaee1c192c565abd906cda Mon Sep 17 00:00:00 2001 From: Jirka B Date: Thu, 7 Aug 2025 19:40:08 +0200 Subject: [PATCH 11/25] rev GPU --- .azure/gpu-integrations.yml | 4 ++-- .azure/gpu-unittests.yml | 37 +++++++++++-------------------------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/.azure/gpu-integrations.yml b/.azure/gpu-integrations.yml index fa3bc22c5d7..9d956bc434d 100644 --- a/.azure/gpu-integrations.yml +++ b/.azure/gpu-integrations.yml @@ -22,8 +22,8 @@ jobs: torch-ver: "2.0" requires: "oldest" "torch | 2.x": - docker-image: "pytorch/pytorch:2.7.1-cuda12.6-cudnn9-runtime" - torch-ver: "2.7" + docker-image: "pytorch/pytorch:2.8.0-cuda12.6-cudnn9-runtime" + torch-ver: "2.8" # how long to run the job before automatically cancelling timeoutInMinutes: "40" # how much time to give 'run always even if cancelled tasks' before stopping them diff --git a/.azure/gpu-unittests.yml b/.azure/gpu-unittests.yml index 7812224a9e7..8d287268df1 100644 --- a/.azure/gpu-unittests.yml +++ b/.azure/gpu-unittests.yml @@ -24,27 +24,13 @@ jobs: - job: unitest_GPU strategy: matrix: - "PyTorch | 2.0 oldest | commonn": + "PyTorch | 2.0 oldest": # Torch does not have build wheels with old Torch versions for newer CUDA docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" torch-ver: "2.0" - "PyTorch | 2.0 oldest | doctest": - # Torch does not have build wheels with old Torch versions for newer CUDA - docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" - torch-ver: "2.0" - "PyTorch | 2.0 oldest | DDP": - # Torch does not have build wheels with old Torch versions for newer CUDA - docker-image: "ubuntu22.04-cuda12.1.1-py3.9-torch2.0" - torch-ver: "2.0" - "PyTorch | 2.X stable | doctest": - docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" - torch-ver: "2.7" - "PyTorch | 2.X stable | commonn": - docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" - torch-ver: "2.7" - "PyTorch | 2.X stable | DDP": - docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.7" - torch-ver: "2.7" + "PyTorch | 2.X stable": + docker-image: "ubuntu24.04-cuda12.6.3-py3.12-torch2.8" + torch-ver: "2.8" #"PyTorch | 2.X future": # docker-image: "ubuntu22.04-cuda12.6.3-py3.11-torch2.8" # torch-ver: "2.8" @@ -137,7 +123,7 @@ jobs: - bash: | python .github/assistant.py set-oldest-versions - condition: contains(variables['Agent.JobName'], 'oldest') + condition: eq(variables['torch-ver'], '2.0') displayName: "Setting oldest versions" - bash: | @@ -162,7 +148,6 @@ jobs: --timeout=240 --durations=50 \ --reruns 2 --reruns-delay 1 # --numprocesses=5 --dist=loadfile - condition: contains(variables['Agent.JobName'], 'doctest') env: DOCTEST_DOWNLOAD_TIMEOUT: "180" SKIP_SLOW_DOCTEST: "1" @@ -200,27 +185,27 @@ jobs: - bash: | du -h --max-depth=1 . - pytest $(TEST_DIRS) \ + python -m pytest $(TEST_DIRS) \ -m "not DDP" --numprocesses=6 --dist=loadfile \ --cov=torchmetrics --timeout=240 --durations=100 \ --reruns 3 --reruns-delay 1 workingDirectory: "tests/" # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' - condition: and(succeeded(), ne(variables['TEST_DIRS'], ''), contains(variables['Agent.JobName'], 'commonn')) + condition: and(succeeded(), ne(variables['TEST_DIRS'], '')) timeoutInMinutes: "120" - displayName: "Testing common" + displayName: "UnitTesting common" - bash: | - pytest $(TEST_DIRS) -v \ + python -m pytest $(TEST_DIRS) -v \ --cov=torchmetrics -m "DDP" \ --timeout=240 --durations=100 env: USE_PYTEST_POOL: "1" workingDirectory: "tests/" # skip for PR if there is nothing to test, note that outside PR there is default 'unittests' - condition: and(succeeded(), ne(variables['TEST_DIRS'], ''), contains(variables['Agent.JobName'], 'DDP')) + condition: and(succeeded(), ne(variables['TEST_DIRS'], '')) timeoutInMinutes: "120" - displayName: "Testing DDP" + displayName: "UnitTesting DDP" - bash: | du -h --max-depth=1 tests/ From d10a688bc66f7a6ad39d04e0869fa4a3fbd4c9d4 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Mon, 11 Aug 2025 19:39:48 +0200 Subject: [PATCH 12/25] get_free_port --- tests/unittests/conftest.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index dde15ca3a90..c94474d0fbd 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -42,12 +42,21 @@ def use_deterministic_algorithms(): torch.use_deterministic_algorithms(False) -def get_free_port(): - """Find an available free port on localhost.""" - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) # Bind to a free port provided by the OS - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return s.getsockname()[1] +def get_free_port(max_tries=10): + """Find an available free port on localhost, retrying if necessary.""" + for _ in range(max_tries): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + port = s.getsockname()[1] + # Try to bind again to check if port is still free + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock: + test_sock.bind(("localhost", port)) + return port + except OSError: + continue + raise RuntimeError("Could not find a free port after several attempts") def setup_ddp(rank, world_size, master_port): From 1c93193ff6f433add9ec7a0352278423d440f554 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Mon, 11 Aug 2025 19:52:00 +0200 Subject: [PATCH 13/25] get_free_port --- tests/unittests/conftest.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index c94474d0fbd..c592658b54d 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -42,21 +42,15 @@ def use_deterministic_algorithms(): torch.use_deterministic_algorithms(False) -def get_free_port(max_tries=10): - """Find an available free port on localhost, retrying if necessary.""" - for _ in range(max_tries): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - port = s.getsockname()[1] - # Try to bind again to check if port is still free - try: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_sock: - test_sock.bind(("localhost", port)) - return port - except OSError: - continue - raise RuntimeError("Could not find a free port after several attempts") +def get_free_port(): + """Find an available free port on localhost and keep it reserved.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("localhost", 0)) # Bind to a free port provided by the OS + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + port = s.getsockname()[1] + # Keep socket open longer to prevent immediate reuse + s.listen(1) + return port def setup_ddp(rank, world_size, master_port): From 6f9bec7d5ec721dce5e93da7577a8f8384b474c6 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 00:40:23 +0200 Subject: [PATCH 14/25] get_free_port --- tests/unittests/conftest.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index c592658b54d..aff46720797 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -43,14 +43,30 @@ def use_deterministic_algorithms(): def get_free_port(): - """Find an available free port on localhost and keep it reserved.""" + """Find an available free port on localhost with better reservation.""" + import time + import random + + # Try multiple times with different base ports to avoid conflicts + for _ in range(10): + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + # Use a random port in a higher range to avoid common conflicts + base_port = random.randint(20000, 30000) + s.bind(("localhost", base_port)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + port = s.getsockname()[1] + # Brief delay to reduce race conditions + time.sleep(0.1) + return port + except OSError: + continue + + # Fallback to original method with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) # Bind to a free port provided by the OS + s.bind(("localhost", 0)) s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - port = s.getsockname()[1] - # Keep socket open longer to prevent immediate reuse - s.listen(1) - return port + return s.getsockname()[1] def setup_ddp(rank, world_size, master_port): From 5f81e754a7aa1f436643ce9f6cd1a88676e5d76d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 11 Aug 2025 22:41:49 +0000 Subject: [PATCH 15/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/unittests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index aff46720797..790ad91de90 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -44,8 +44,8 @@ def use_deterministic_algorithms(): def get_free_port(): """Find an available free port on localhost with better reservation.""" - import time import random + import time # Try multiple times with different base ports to avoid conflicts for _ in range(10): From 4b3c37e63957589be04bc2af893bd322e84f0150 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 10:06:49 +0200 Subject: [PATCH 16/25] get_free_port() --- tests/unittests/conftest.py | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 790ad91de90..7afa62e9433 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -42,31 +42,15 @@ def use_deterministic_algorithms(): torch.use_deterministic_algorithms(False) -def get_free_port(): - """Find an available free port on localhost with better reservation.""" - import random - import time - - # Try multiple times with different base ports to avoid conflicts - for _ in range(10): - try: - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - # Use a random port in a higher range to avoid common conflicts - base_port = random.randint(20000, 30000) - s.bind(("localhost", base_port)) - s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - port = s.getsockname()[1] - # Brief delay to reduce race conditions - time.sleep(0.1) - return port - except OSError: - continue - - # Fallback to original method +def get_free_port() -> int: + """Find an available free port on localhost and keep it reserved.""" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("localhost", 0)) + s.bind(("localhost", 0)) # Bind to a free port provided by the OS s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - return s.getsockname()[1] + port = s.getsockname()[1] + # Keep socket open longer to prevent immediate reuse + s.listen(1) + return port def setup_ddp(rank, world_size, master_port): @@ -102,9 +86,8 @@ def pytest_sessionstart(): """Global initialization of multiprocessing pool; runs before any test.""" if not USE_PYTEST_POOL: return - port = get_free_port() pool = Pool(processes=NUM_PROCESSES) - pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, port) for rank in range(NUM_PROCESSES)]) + pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, get_free_port()) for rank in range(NUM_PROCESSES)]) pytest.pool = pool From b5f9ae37f5e482f6d6e38339b9206f35eb1aeacf Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 10:58:17 +0200 Subject: [PATCH 17/25] settimeout --- tests/unittests/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 7afa62e9433..597bb3538f2 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -45,6 +45,7 @@ def use_deterministic_algorithms(): def get_free_port() -> int: """Find an available free port on localhost and keep it reserved.""" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.settimeout(5) # Set a timeout of 5 seconds s.bind(("localhost", 0)) # Bind to a free port provided by the OS s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) port = s.getsockname()[1] From 666cb7fb134d18657667d548a962975a2a058c24 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 11:00:37 +0200 Subject: [PATCH 18/25] setup_ddp --- tests/unittests/conftest.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 597bb3538f2..34104cee3fc 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -54,7 +54,7 @@ def get_free_port() -> int: return port -def setup_ddp(rank, world_size, master_port): +def setup_ddp(rank: int, world_size: int) -> None: """Initialize ddp environment. If a particular test relies on the order of the processes in the pool to be [0, 1, 2, ...], then this function @@ -64,11 +64,10 @@ def setup_ddp(rank, world_size, master_port): Args: rank: the rank of the process world_size: the number of processes - master_port: the port to use for the master process """ os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(master_port) + os.environ["MASTER_PORT"] = str(get_free_port()) if torch.distributed.group.WORLD is not None: # if already initialized, destroy the process group torch.distributed.destroy_process_group() @@ -88,7 +87,7 @@ def pytest_sessionstart(): if not USE_PYTEST_POOL: return pool = Pool(processes=NUM_PROCESSES) - pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, get_free_port()) for rank in range(NUM_PROCESSES)]) + pool.starmap(setup_ddp, [(rank, NUM_PROCESSES) for rank in range(NUM_PROCESSES)]) pytest.pool = pool From 12253e60f387060db99af8e6fd58766cc72aaa0a Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 11:21:10 +0200 Subject: [PATCH 19/25] timeout-minutes: 70 --- .github/workflows/ci-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index a44ddb5e7d8..5075cebb459 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -91,7 +91,7 @@ jobs: # Timeout: https://stackoverflow.com/a/59076067/4521646 # seems that macOS jobs take much more than orger OS - timeout-minutes: 120 + timeout-minutes: 70 steps: - uses: actions/checkout@v4 From e73e1d3078bd2abfb9ec07d17fe8b543f84c5059 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 11:24:56 +0200 Subject: [PATCH 20/25] enumerate --- tests/unittests/conftest.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 34104cee3fc..c17e084d148 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -54,7 +54,7 @@ def get_free_port() -> int: return port -def setup_ddp(rank: int, world_size: int) -> None: +def setup_ddp(rank: int, world_size: int, port: int) -> None: """Initialize ddp environment. If a particular test relies on the order of the processes in the pool to be [0, 1, 2, ...], then this function @@ -67,7 +67,7 @@ def setup_ddp(rank: int, world_size: int) -> None: """ os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str(get_free_port()) + os.environ["MASTER_PORT"] = str(port) if torch.distributed.group.WORLD is not None: # if already initialized, destroy the process group torch.distributed.destroy_process_group() @@ -86,8 +86,9 @@ def pytest_sessionstart(): """Global initialization of multiprocessing pool; runs before any test.""" if not USE_PYTEST_POOL: return + port = get_free_port() pool = Pool(processes=NUM_PROCESSES) - pool.starmap(setup_ddp, [(rank, NUM_PROCESSES) for rank in range(NUM_PROCESSES)]) + pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, port + i) for i, rank in enumerate(range(NUM_PROCESSES))]) pytest.pool = pool From b8eae2c52680209970754a0fd554dafd7a94171b Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 11:36:27 +0200 Subject: [PATCH 21/25] get_free_port --- tests/unittests/conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index c17e084d148..fdec272f9ae 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -45,12 +45,9 @@ def use_deterministic_algorithms(): def get_free_port() -> int: """Find an available free port on localhost and keep it reserved.""" with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.settimeout(5) # Set a timeout of 5 seconds s.bind(("localhost", 0)) # Bind to a free port provided by the OS s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) port = s.getsockname()[1] - # Keep socket open longer to prevent immediate reuse - s.listen(1) return port From 2021ca51e577dfad4d11b34dbc01433c9dab71b0 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 11:38:30 +0200 Subject: [PATCH 22/25] get_free_port --- tests/unittests/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index fdec272f9ae..5ee2d99ca1b 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -48,7 +48,7 @@ def get_free_port() -> int: s.bind(("localhost", 0)) # Bind to a free port provided by the OS s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) port = s.getsockname()[1] - return port + return int(port) def setup_ddp(rank: int, world_size: int, port: int) -> None: @@ -61,6 +61,7 @@ def setup_ddp(rank: int, world_size: int, port: int) -> None: Args: rank: the rank of the process world_size: the number of processes + port: the port to use for communication """ os.environ["MASTER_ADDR"] = "localhost" From f7fd132dead578676701578773ec779c2d17b7ee Mon Sep 17 00:00:00 2001 From: Jirka B Date: Tue, 12 Aug 2025 23:10:05 +0200 Subject: [PATCH 23/25] master port --- tests/unittests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py index 5ee2d99ca1b..dd1ac85ee66 100644 --- a/tests/unittests/conftest.py +++ b/tests/unittests/conftest.py @@ -86,7 +86,7 @@ def pytest_sessionstart(): return port = get_free_port() pool = Pool(processes=NUM_PROCESSES) - pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, port + i) for i, rank in enumerate(range(NUM_PROCESSES))]) + pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, port) for rank in range(NUM_PROCESSES)]) pytest.pool = pool From 154a4c6045e1202f357c4c01b0c160c04102fb92 Mon Sep 17 00:00:00 2001 From: Jirka B Date: Wed, 13 Aug 2025 10:21:06 +0200 Subject: [PATCH 24/25] no rerun --- .github/workflows/ci-tests.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 5075cebb459..94ad52165fc 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -194,8 +194,6 @@ jobs: $TEST_DIRS \ --cov=torchmetrics \ --durations=50 \ - --reruns 3 \ - --reruns-delay 1 \ -m "not DDP" \ -n auto \ --dist=load \ @@ -213,8 +211,6 @@ jobs: --cov=torchmetrics \ --durations=50 \ -m DDP \ - --reruns 3 \ - --reruns-delay 1 \ ${{ env.UNITTEST_TIMEOUT }} - name: Statistics From 774fdd3820e08684f0d829f29be6f034a1e393bf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:01:04 +0000 Subject: [PATCH 25/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fd00bcaa6bd..c77320656fd 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,15 @@ ______________________________________________________________________ # Looking for GPUs? -Over 340,000 developers use [Lightning Cloud](https://lightning.ai/?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) - purpose-built for PyTorch and PyTorch Lightning. -- [GPUs](https://lightning.ai/pricing?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) from $0.19. -- [Clusters](https://lightning.ai/clusters?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): frontier-grade training/inference clusters. + +Over 340,000 developers use [Lightning Cloud](https://lightning.ai/?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) - purpose-built for PyTorch and PyTorch Lightning. + +- [GPUs](https://lightning.ai/pricing?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) from $0.19. +- [Clusters](https://lightning.ai/clusters?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): frontier-grade training/inference clusters. - [AI Studio (vibe train)](https://lightning.ai/studios?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): workspaces where AI helps you debug, tune and vibe train. -- [AI Studio (vibe deploy)](https://lightning.ai/studios?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): workspaces where AI helps you optimize, and deploy models. +- [AI Studio (vibe deploy)](https://lightning.ai/studios?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): workspaces where AI helps you optimize, and deploy models. - [Notebooks](https://lightning.ai/notebooks?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): Persistent GPU workspaces where AI helps you code and analyze. -- [Inference](https://lightning.ai/deploy?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): Deploy models as inference APIs. +- [Inference](https://lightning.ai/deploy?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): Deploy models as inference APIs. # Installation