Skip to content

Commit e1cfc68

Browse files
committed
Add SLURM support for multi-node tests
To make it easier to run on large clusters, Bobber should be able to run on SLURM clusters with Pyxis and Enroot installed. This would replace the need for mpirun and SSH keys/daemons inside the containers, making it easier to run tests without copying images between nodes or synchronizing SSH keys. Signed-off-by: Robert Clark <roclark@nvidia.com>
1 parent 3430c23 commit e1cfc68

14 files changed

Lines changed: 377 additions & 2 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ build/*
44
dist/*
55
env/*
66
nvidia_bobber.egg-info/
7+
*.out

bobber/bobber.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-License-Identifier: MIT
22
import bobber.lib.docker
33
import json
4+
import sys
45
from argparse import ArgumentParser, ArgumentTypeError, Namespace
56
from copy import copy
67
from bobber import __version__
@@ -85,11 +86,20 @@ def parse_args(version: str) -> Namespace:
8586

8687
# More general options which apply to a majority of the running commands
8788
# Note that all arguments prepended with '--' are optional
89+
commands_parent.add_argument('--slurm', help='Run a test on an existing '
90+
'SLURM cluster with Pyxis/Enroot installed',
91+
action='store_true')
92+
commands_parent.add_argument('--storage-path', help='Path at which the '
93+
'filesystem under test is mounted',
94+
required='--slurm' in sys.argv)
8895
commands_parent.add_argument('log_path', metavar='log-path', help='Path '
8996
'used to store log files on the head node')
90-
commands_parent.add_argument('hosts', help='Comma-separated list of '
97+
commands_parent.add_argument('hosts', help='Number of hosts to queue a '
98+
'job for in a SLURM cluster.' if '--slurm'
99+
in sys.argv else 'Comma-separated list of '
91100
'hostnames or IP addresses',
92-
type=unique_hosts)
101+
type=int if '--slurm' in sys.argv
102+
else unique_hosts)
93103
commands_parent.add_argument('--config-path', help='Read a JSON config '
94104
'file with expected parameters and use those '
95105
'values for testing. Ignores all other '
@@ -365,6 +375,12 @@ def execute_command(args: Namespace, version: str) -> NoReturn:
365375
bobber.lib.docker.cast(args.storage_path, args.ignore_gpu, version)
366376
elif args.command == LOAD:
367377
bobber.lib.docker.load(args.filename)
378+
elif args.slurm and args.command == RUN_NCCL:
379+
args = load_settings(args)
380+
bobber.lib.system.slurm.run_nccl(args, version)
381+
elif args.slurm and args.command == RUN_DALI:
382+
args = load_settings(args)
383+
bobber.lib.system.slurm.run_dali(args, version)
368384
else:
369385
# Update the version to be used in filenames
370386
version_underscore = version.replace('.', '_')

bobber/lib/analysis/dali.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,42 @@ def _update_results(image_type_match: dict, results: list) -> dict:
139139
return image_type_match
140140

141141

142+
def _slurm_test_sections(log_contents: str) -> list:
143+
"""
144+
Parse the SLURM log test sections.
145+
146+
The SLURM log files for DALI tests have a different structure to the output
147+
which needs to be special-handled. These sections are parsed by reading
148+
from the beginning of one sub-section (ie. small JPGs) until the first time
149+
the next sub-section is encountered (ie. large JPGs).
150+
151+
Parameters
152+
----------
153+
log_contents : str
154+
A ``string`` of the complete contents from the log file.
155+
156+
Returns
157+
-------
158+
list
159+
Returns a ``list`` of strings where each element is the complete output
160+
from a test subsection.
161+
"""
162+
small_jpg = re.findall('800x600/file_read_pipeline.*'
163+
'?3840x2160/file_read_pipeline',
164+
log_contents, re.DOTALL)
165+
large_jpg = re.findall('3840x2160/file_read_pipeline.*'
166+
'?800x600/tfrecord_pipeline',
167+
log_contents, re.DOTALL)
168+
small_tf = re.findall('800x600/tfrecord_pipeline.*'
169+
'?3840x2160/tfrecord_pipeline',
170+
log_contents, re.DOTALL)
171+
large_tf = re.findall('3840x2160/tfrecord_pipeline.*'
172+
'OK', log_contents, re.DOTALL)
173+
sections = [small_jpg, large_jpg, small_tf, large_tf]
174+
sections = ['\n'.join(section) for section in sections]
175+
return sections
176+
177+
142178
def _result_parsing(log_contents: str, systems: int, image_results: dict,
143179
log_file: str) -> dict:
144180
"""
@@ -188,6 +224,10 @@ def _result_parsing(log_contents: str, systems: int, image_results: dict,
188224
]
189225

190226
test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL)
227+
# The SLURM tests have a different layout and need to be grabbed
228+
# appropriately
229+
if '+ srun --nodes=' in log_contents:
230+
test_sections = _slurm_test_sections(log_contents)
191231
if len(test_sections) != 4:
192232
print(f'Warning: Invalid number of results found in {log_file} log '
193233
'file. Skipping...')

bobber/lib/exit_codes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@
88
CONTAINER_NOT_RUNNING = 32 # Bobber container not running
99
NVIDIA_RUNTIME_ERROR = 33 # NVIDIA container runtime not found
1010
CONTAINER_VERSION_MISMATCH = 34 # Container different from application
11+
SLURM_QUEUE_ERROR = 40 # Error queueing a SLURM job
12+
SBATCH_CALL_ERROR = 41 # Error running sbatch

bobber/lib/system/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
# SPDX-License-Identifier: MIT
2+
from bobber.lib.system import slurm
3+
4+
run_dali = slurm.run_dali
5+
run_nccl = slurm.run_nccl

bobber/lib/system/slurm.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# SPDX-License-Identifier: MIT
import os
import shutil
import subprocess
import sys
from argparse import Namespace
from bobber.lib.exit_codes import SBATCH_CALL_ERROR, SLURM_QUEUE_ERROR
from typing import NoReturn
8+
9+
10+
def _slurm_scripts_path() -> str:
11+
"""
12+
Find the absolute path to the slurm_scripts directory.
13+
14+
The slurm_scripts directory contains several *.sub files which are required
15+
to launch test commands via SLURM. Depending on how and where Bobber is
16+
installed on a system, the absolute path to this directory may change, but
17+
the relative path is easy to find compared to this module. By allowing
18+
Python to determine the absolute path to this module, the absolute path to
19+
slurm_scripts can be found by combining the absolute path of this module
20+
and the relative path to the slurm_scripts directory.
21+
22+
Returns
23+
-------
24+
str
25+
Returns a ``string`` of the absolute path to the slurm_scripts
26+
directory.
27+
"""
28+
directory = os.path.dirname(os.path.realpath(__file__))
29+
directory = os.path.join(directory, '../../slurm_scripts')
30+
return directory
31+
32+
33+
def _sbatch_path() -> str:
    """
    Find the full path to the sbatch executable.

    While launching a Python process without "shell=True" as is done for
    the test commands below, the "sbatch" command may not be resolvable
    because the new process does not get a login shell's PATH handling.
    Resolving the executable to an absolute path ahead of time lets it be
    invoked directly instead of via the alias. If sbatch is not installed
    on the system, the application will exit.

    Returns
    -------
    str
        Returns a ``string`` of the full local path to the sbatch script.
    """
    # shutil.which performs the same PATH lookup as 'which sbatch' but
    # without spawning a shell subprocess or decoding its byte output.
    sbatch = shutil.which('sbatch')
    if sbatch:
        return sbatch
    print('sbatch command not found. Please ensure SLURM is installed and '
          'functional.')
    sys.exit(SBATCH_CALL_ERROR)
56+
57+
58+
def run_nccl(args: Namespace, version: str) -> NoReturn:
    """
    Launch a multi-node NCCL test via SLURM.

    Launch a NCCL test for N-nodes managed by a SLURM cluster. Multiple tests
    are queued-up as sbatch commands which will only launch once the previous
    test has completed.

    Parameters
    ----------
    args : Namespace
        A ``Namespace`` of all settings specified by the user for the test.
    version : str
        A ``string`` of the Bobber version.
    """
    # Update the version to be used in filenames
    version_underscore = version.replace('.', '_')
    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
    # single iteration of tests.
    lower_bound = args.hosts
    if args.sweep:
        lower_bound = 1
    # Loop-invariant lookups hoisted out of the submission loops.
    sbatch = _sbatch_path()
    nccl_path = os.path.join(_slurm_scripts_path(), 'nccl.sub')
    for hosts in range(lower_bound, args.hosts + 1):
        for iteration in range(1, args.iterations + 1):
            nccl_log = os.path.join(args.log_path,
                                    f'nccl_iteration_{iteration}_'
                                    f'gpus_{args.gpus}_'
                                    f'nccl_max_{args.nccl_max}_'
                                    f'gid_{args.compute_gid}_'
                                    f'nccl_tc_{args.nccl_tc}_'
                                    f'systems_{hosts}_'
                                    f'version_{version_underscore}.log')
            # Start from the caller's environment so sbatch still sees
            # PATH, HOME, and any SLURM_* variables, then layer the
            # test-specific settings on top. Passing only the test
            # variables would wipe the environment sbatch needs to run.
            env = dict(os.environ)
            env.update({
                'HOSTS': str(hosts),
                'FS_PATH': args.storage_path,
                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
                'NCCL_MAX': str(args.nccl_max),
                'LOGDIR': args.log_path,
                'LOGPATH': nccl_log,
                'NCCL_IB_HCAS': args.nccl_ib_hcas,
                'COMPUTE_GID': str(args.compute_gid),
                'NCCL_TC': args.nccl_tc or ''
            })
            cmd = [sbatch,
                   '-N',
                   str(hosts),
                   f'--gpus-per-node={args.gpus}',
                   '--wait',
                   '--dependency=singleton',
                   nccl_path]
            try:
                print('Running:', cmd)
                # check=True makes a non-zero sbatch exit raise
                # CalledProcessError; subprocess.Popen never raises it, so
                # the except clause below was previously unreachable.
                # OSError covers a missing or non-executable sbatch binary.
                # Since sbatch is invoked with '--wait', each submission
                # blocks until its job completes before the next is queued.
                subprocess.run(cmd, env=env, check=True)
            except (subprocess.CalledProcessError, OSError):
                print('Error queueing SLURM job for NCCL tests. '
                      'See output for errors.')
                sys.exit(SLURM_QUEUE_ERROR)
117+
118+
119+
def run_dali(args: Namespace, version: str) -> NoReturn:
    """
    Launch a multi-node DALI test via SLURM.

    Launch a DALI test for N-nodes managed by a SLURM cluster. Multiple tests
    are queued-up as sbatch commands which will only launch once the previous
    test has completed.

    Parameters
    ----------
    args : Namespace
        A ``Namespace`` of all settings specified by the user for the test.
    version : str
        A ``string`` of the Bobber version.
    """
    # Update the version to be used in filenames
    version_underscore = version.replace('.', '_')
    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
    # single iteration of tests.
    lower_bound = args.hosts
    if args.sweep:
        lower_bound = 1
    # Loop-invariant lookups hoisted out of the submission loops.
    sbatch = _sbatch_path()
    dali_path = os.path.join(_slurm_scripts_path(), 'dali.sub')
    for hosts in range(lower_bound, args.hosts + 1):
        for iteration in range(1, args.iterations + 1):
            dali_log = os.path.join(args.log_path,
                                    f'dali_iteration_{iteration}_'
                                    f'gpus_{args.gpus}_'
                                    f'batch_size_lg_{args.batch_size_lg}_'
                                    f'batch_size_sm_{args.batch_size_sm}_'
                                    f'systems_{hosts}_'
                                    f'version_{version_underscore}.log')
            # Start from the caller's environment so sbatch still sees
            # PATH, HOME, and any SLURM_* variables, then layer the
            # test-specific settings on top. Passing only the test
            # variables would wipe the environment sbatch needs to run.
            env = dict(os.environ)
            env.update({
                'HOSTS': str(hosts),
                'FS_PATH': args.storage_path,
                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
                'GPUS': str(args.gpus),
                'LOGDIR': args.log_path,
                'LOGPATH': dali_log,
                'BATCH_SIZE_SM': str(args.batch_size_sm),
                'BATCH_SIZE_LG': str(args.batch_size_lg)
            })
            cmd = [sbatch,
                   '-N',
                   str(hosts),
                   f'--gpus-per-node={args.gpus}',
                   '--wait',
                   '--dependency=singleton',
                   dali_path]
            try:
                print('Running:', cmd)
                # check=True makes a non-zero sbatch exit raise
                # CalledProcessError; subprocess.Popen never raises it, so
                # the except clause below was previously unreachable.
                # OSError covers a missing or non-executable sbatch binary.
                # Since sbatch is invoked with '--wait', each submission
                # blocks until its job completes before the next is queued.
                subprocess.run(cmd, env=env, check=True)
            except (subprocess.CalledProcessError, OSError):
                print('Error queueing SLURM job for DALI tests. '
                      'See output for errors.')
                sys.exit(SLURM_QUEUE_ERROR)

bobber/slurm_scripts/dali.sub

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
#SBATCH --job-name bobber_dali
# SPDX-License-Identifier: MIT
set -euxo pipefail

# Required vars (defaults apply when the caller does not export them)
: "${HOSTS:=4}"
: "${FS_PATH:=/mnt/fs}"
: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
: "${LOGDIR:=test_logs/}"
: "${LOGPATH:=test_logs/dali.log}"
: "${BATCH_SIZE_LG:=150}"
: "${BATCH_SIZE_SM:=150}"

mkdir -p "${LOGDIR}"

# The first stage truncates the log; every later stage must append with
# 'tee -a' so all four test sub-sections land in a single log file for the
# analysis code to parse. The plain 'tee' previously used by every stage
# overwrote all earlier results, leaving only the final stage in the log.
srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_setup.sh |& tee "${LOGPATH}"
BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
# Drop the page cache between stages so each test reads from storage.
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_cleanup.sh |& tee -a "${LOGPATH}"

bobber/slurm_scripts/nccl.sub

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
#SBATCH --job-name bobber_nccl
# SPDX-License-Identifier: MIT
set -euxo pipefail

# Required vars (defaults apply when the caller does not export them)
: "${HOSTS:=4}"
: "${FS_PATH:=/mnt/fs}"
: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
: "${NCCL_MAX:=1}"
: "${LOGDIR:=test_logs/}"
: "${LOGPATH:=test_logs/nccl.log}"
: "${NCCL_IB_HCAS:=}"
: "${COMPUTE_GID:=0}"
: "${NCCL_TC:=}"

# Quote paths so directories containing spaces do not word-split.
mkdir -p "${LOGDIR}"

# Eight tasks per node across all requested nodes, run inside the Bobber
# container with the filesystem under test mounted at /mnt/fs_under_test.
NCCL_MAX=${NCCL_MAX} NCCL_IB_HCAS=${NCCL_IB_HCAS} COMPUTE_GID=${COMPUTE_GID} NCCL_TC=${NCCL_TC} srun --nodes=${HOSTS} --ntasks-per-node=8 --mpi=pmix --exclusive --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/nccl_slurm.sh |& tee "${LOGPATH}"
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# SPDX-License-Identifier: MIT
# Per-node DALI test runner launched by dali.sub via srun. Expects GPUS,
# BATCH_SIZE, and DATASET_PATH in the environment (exported by dali.sub).

if [ "x${GPUS:-}" = "x" ]; then
    GPUS=8
fi

# dali.sub exports BATCH_SIZE (not BATCH_SIZE_SM/BATCH_SIZE_LG), so the
# fallback must default BATCH_SIZE itself; the old BATCH_SIZE_SM/LG
# defaults were set but never read by the commands below.
if [ "x${BATCH_SIZE:-}" = "x" ]; then
    BATCH_SIZE=150
fi

# dali.sub passes the dataset location as DATASET_PATH; the script
# previously tested the undefined DATASET variable, so the tfrecord
# branch could never be selected and the pipeline path was empty.
if [[ "$DATASET_PATH" == *tfrecord* ]]; then
    python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET_PATH"
else
    python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET_PATH"
fi
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# SPDX-License-Identifier: MIT
# Remove the generated DALI dataset from the filesystem under test.
# -f keeps the cleanup idempotent when the directory is already gone
# (e.g. a rerun after a partial failure).
rm -rf /mnt/fs_under_test/imageinary_data

0 commit comments

Comments
 (0)