Skip to content

Commit e1cfc68

Browse files
committed
Add SLURM support for multi-node tests
To make it easier to run on large clusters, Bobber should be able to run on SLURM clusters with Pyxis and Enroot installed. This would replace the need for mpirun and SSH keys/daemons inside the containers, making it easier to run tests without copying images between nodes or synchronizing SSH keys. Signed-off-by: Robert Clark <roclark@nvidia.com>
1 parent 3430c23 commit e1cfc68

14 files changed

Lines changed: 377 additions & 2 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ build/*
44
dist/*
55
env/*
66
nvidia_bobber.egg-info/
7+
*.out

bobber/bobber.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# SPDX-License-Identifier: MIT
22
import bobber.lib.docker
33
import json
4+
import sys
45
from argparse import ArgumentParser, ArgumentTypeError, Namespace
56
from copy import copy
67
from bobber import __version__
@@ -85,11 +86,20 @@ def parse_args(version: str) -> Namespace:
8586

8687
# More general options which apply to a majority of the running commands
8788
# Note that all arguments prepended with '--' are optional
89+
commands_parent.add_argument('--slurm', help='Run a test on an existing '
90+
'SLURM cluster with Pyxis/Enroot installed',
91+
action='store_true')
92+
commands_parent.add_argument('--storage-path', help='Path at which the '
93+
'filesystem under test is mounted',
94+
required='--slurm' in sys.argv)
8895
commands_parent.add_argument('log_path', metavar='log-path', help='Path '
8996
'used to store log files on the head node')
90-
commands_parent.add_argument('hosts', help='Comma-separated list of '
97+
commands_parent.add_argument('hosts', help='Number of hosts to queue a '
98+
'job for in a SLURM cluster.' if '--slurm'
99+
in sys.argv else 'Comma-separated list of '
91100
'hostnames or IP addresses',
92-
type=unique_hosts)
101+
type=int if '--slurm' in sys.argv
102+
else unique_hosts)
93103
commands_parent.add_argument('--config-path', help='Read a JSON config '
94104
'file with expected parameters and use those '
95105
'values for testing. Ignores all other '
@@ -365,6 +375,12 @@ def execute_command(args: Namespace, version: str) -> NoReturn:
365375
bobber.lib.docker.cast(args.storage_path, args.ignore_gpu, version)
366376
elif args.command == LOAD:
367377
bobber.lib.docker.load(args.filename)
378+
elif args.slurm and args.command == RUN_NCCL:
379+
args = load_settings(args)
380+
bobber.lib.system.slurm.run_nccl(args, version)
381+
elif args.slurm and args.command == RUN_DALI:
382+
args = load_settings(args)
383+
bobber.lib.system.slurm.run_dali(args, version)
368384
else:
369385
# Update the version to be used in filenames
370386
version_underscore = version.replace('.', '_')

bobber/lib/analysis/dali.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,42 @@ def _update_results(image_type_match: dict, results: list) -> dict:
139139
return image_type_match
140140

141141

142+
def _slurm_test_sections(log_contents: str) -> list:
143+
"""
144+
Parse the SLURM log test sections.
145+
146+
The SLURM log files for DALI tests have a different structure to the output
147+
which needs to be special-handled. These sections are parsed by reading
148+
from the beginning of one sub-section (ie. small JPGs) until the first time
149+
the next sub-section is encountered (ie. large JPGs).
150+
151+
Parameters
152+
----------
153+
log_contents : str
154+
A ``string`` of the complete contents from the log file.
155+
156+
Returns
157+
-------
158+
list
159+
Returns a ``list`` of strings where each element is the complete output
160+
from a test subsection.
161+
"""
162+
small_jpg = re.findall('800x600/file_read_pipeline.*'
163+
'?3840x2160/file_read_pipeline',
164+
log_contents, re.DOTALL)
165+
large_jpg = re.findall('3840x2160/file_read_pipeline.*'
166+
'?800x600/tfrecord_pipeline',
167+
log_contents, re.DOTALL)
168+
small_tf = re.findall('800x600/tfrecord_pipeline.*'
169+
'?3840x2160/tfrecord_pipeline',
170+
log_contents, re.DOTALL)
171+
large_tf = re.findall('3840x2160/tfrecord_pipeline.*'
172+
'OK', log_contents, re.DOTALL)
173+
sections = [small_jpg, large_jpg, small_tf, large_tf]
174+
sections = ['\n'.join(section) for section in sections]
175+
return sections
176+
177+
142178
def _result_parsing(log_contents: str, systems: int, image_results: dict,
143179
log_file: str) -> dict:
144180
"""
@@ -188,6 +224,10 @@ def _result_parsing(log_contents: str, systems: int, image_results: dict,
188224
]
189225

190226
test_sections = re.findall(r'RUN 1/1.*?OK', log_contents, re.DOTALL)
227+
# The SLURM tests have a different layout and need to be grabbed
228+
# appropriately
229+
if '+ srun --nodes=' in log_contents:
230+
test_sections = _slurm_test_sections(log_contents)
191231
if len(test_sections) != 4:
192232
print(f'Warning: Invalid number of results found in {log_file} log '
193233
'file. Skipping...')

bobber/lib/exit_codes.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,5 @@
88
CONTAINER_NOT_RUNNING = 32 # Bobber container not running
99
NVIDIA_RUNTIME_ERROR = 33 # NVIDIA container runtime not found
1010
CONTAINER_VERSION_MISMATCH = 34 # Container different from application
11+
SLURM_QUEUE_ERROR = 40 # Error queueing a SLURM job
12+
SBATCH_CALL_ERROR = 41 # Error running sbatch

bobber/lib/system/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
11
# SPDX-License-Identifier: MIT
2+
from bobber.lib.system import slurm
3+
4+
run_dali = slurm.run_dali
5+
run_nccl = slurm.run_nccl

bobber/lib/system/slurm.py

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# SPDX-License-Identifier: MIT
import os
import shutil
import subprocess
import sys
from argparse import Namespace
from bobber.lib.exit_codes import SBATCH_CALL_ERROR, SLURM_QUEUE_ERROR
from typing import NoReturn
8+
9+
10+
def _slurm_scripts_path() -> str:
11+
"""
12+
Find the absolute path to the slurm_scripts directory.
13+
14+
The slurm_scripts directory contains several *.sub files which are required
15+
to launch test commands via SLURM. Depending on how and where Bobber is
16+
installed on a system, the absolute path to this directory may change, but
17+
the relative path is easy to find compared to this module. By allowing
18+
Python to determine the absolute path to this module, the absolute path to
19+
slurm_scripts can be found by combining the absolute path of this module
20+
and the relative path to the slurm_scripts directory.
21+
22+
Returns
23+
-------
24+
str
25+
Returns a ``string`` of the absolute path to the slurm_scripts
26+
directory.
27+
"""
28+
directory = os.path.dirname(os.path.realpath(__file__))
29+
directory = os.path.join(directory, '../../slurm_scripts')
30+
return directory
31+
32+
33+
def _sbatch_path() -> str:
    """
    Find the full path to the sbatch executable.

    While launching a Python process without "shell=True" as is done for
    the test commands below, the "sbatch" command may not be resolvable
    because the new process does not get a login shell's PATH handling.
    Resolving the executable to an absolute path ahead of time lets it be
    invoked directly instead of via the alias. If sbatch is not installed
    on the system, the application will exit.

    Returns
    -------
    str
        Returns a ``string`` of the full local path to the sbatch script.
    """
    # shutil.which performs the same PATH lookup as 'which sbatch' but
    # without spawning a shell subprocess or decoding its byte output.
    sbatch = shutil.which('sbatch')
    if sbatch:
        return sbatch
    print('sbatch command not found. Please ensure SLURM is installed and '
          'functional.')
    sys.exit(SBATCH_CALL_ERROR)
56+
57+
58+
def run_nccl(args: Namespace, version: str) -> NoReturn:
    """
    Launch a multi-node NCCL test via SLURM.

    Launch a NCCL test for N-nodes managed by a SLURM cluster. Multiple tests
    are queued-up as sbatch commands which will only launch once the previous
    test has completed.

    Parameters
    ----------
    args : Namespace
        A ``Namespace`` of all settings specified by the user for the test.
    version : str
        A ``string`` of the Bobber version.
    """
    # Update the version to be used in filenames
    version_underscore = version.replace('.', '_')
    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
    # single iteration of tests.
    lower_bound = args.hosts
    if args.sweep:
        lower_bound = 1
    # Loop-invariant lookups hoisted out of the submission loops.
    sbatch = _sbatch_path()
    nccl_path = os.path.join(_slurm_scripts_path(), 'nccl.sub')
    for hosts in range(lower_bound, args.hosts + 1):
        for iteration in range(1, args.iterations + 1):
            nccl_log = os.path.join(args.log_path,
                                    f'nccl_iteration_{iteration}_'
                                    f'gpus_{args.gpus}_'
                                    f'nccl_max_{args.nccl_max}_'
                                    f'gid_{args.compute_gid}_'
                                    f'nccl_tc_{args.nccl_tc}_'
                                    f'systems_{hosts}_'
                                    f'version_{version_underscore}.log')
            # Start from the caller's environment so sbatch still sees
            # PATH, HOME, and any SLURM_* variables, then layer the
            # test-specific settings on top. Passing only the test
            # variables would wipe the environment sbatch needs to run.
            env = dict(os.environ)
            env.update({
                'HOSTS': str(hosts),
                'FS_PATH': args.storage_path,
                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
                'NCCL_MAX': str(args.nccl_max),
                'LOGDIR': args.log_path,
                'LOGPATH': nccl_log,
                'NCCL_IB_HCAS': args.nccl_ib_hcas,
                'COMPUTE_GID': str(args.compute_gid),
                'NCCL_TC': args.nccl_tc or ''
            })
            cmd = [sbatch,
                   '-N',
                   str(hosts),
                   f'--gpus-per-node={args.gpus}',
                   '--wait',
                   '--dependency=singleton',
                   nccl_path]
            try:
                print('Running:', cmd)
                # check=True makes a non-zero sbatch exit raise
                # CalledProcessError; subprocess.Popen never raises it, so
                # the except clause below was previously unreachable.
                # OSError covers a missing or non-executable sbatch binary.
                # Since sbatch is invoked with '--wait', each submission
                # blocks until its job completes before the next is queued.
                subprocess.run(cmd, env=env, check=True)
            except (subprocess.CalledProcessError, OSError):
                print('Error queueing SLURM job for NCCL tests. '
                      'See output for errors.')
                sys.exit(SLURM_QUEUE_ERROR)
117+
118+
119+
def run_dali(args: Namespace, version: str) -> NoReturn:
    """
    Launch a multi-node DALI test via SLURM.

    Launch a DALI test for N-nodes managed by a SLURM cluster. Multiple tests
    are queued-up as sbatch commands which will only launch once the previous
    test has completed.

    Parameters
    ----------
    args : Namespace
        A ``Namespace`` of all settings specified by the user for the test.
    version : str
        A ``string`` of the Bobber version.
    """
    # Update the version to be used in filenames
    version_underscore = version.replace('.', '_')
    # If not sweeping, set the range of nodes from N-hosts to N-hosts for a
    # single iteration of tests.
    lower_bound = args.hosts
    if args.sweep:
        lower_bound = 1
    # Loop-invariant lookups hoisted out of the submission loops.
    sbatch = _sbatch_path()
    dali_path = os.path.join(_slurm_scripts_path(), 'dali.sub')
    for hosts in range(lower_bound, args.hosts + 1):
        for iteration in range(1, args.iterations + 1):
            dali_log = os.path.join(args.log_path,
                                    f'dali_iteration_{iteration}_'
                                    f'gpus_{args.gpus}_'
                                    f'batch_size_lg_{args.batch_size_lg}_'
                                    f'batch_size_sm_{args.batch_size_sm}_'
                                    f'systems_{hosts}_'
                                    f'version_{version_underscore}.log')
            # Start from the caller's environment so sbatch still sees
            # PATH, HOME, and any SLURM_* variables, then layer the
            # test-specific settings on top. Passing only the test
            # variables would wipe the environment sbatch needs to run.
            env = dict(os.environ)
            env.update({
                'HOSTS': str(hosts),
                'FS_PATH': args.storage_path,
                'CONT_VERSION': f'nvcr.io/nvidian/bobber:{version}',
                'GPUS': str(args.gpus),
                'LOGDIR': args.log_path,
                'LOGPATH': dali_log,
                'BATCH_SIZE_SM': str(args.batch_size_sm),
                'BATCH_SIZE_LG': str(args.batch_size_lg)
            })
            cmd = [sbatch,
                   '-N',
                   str(hosts),
                   f'--gpus-per-node={args.gpus}',
                   '--wait',
                   '--dependency=singleton',
                   dali_path]
            try:
                print('Running:', cmd)
                # check=True makes a non-zero sbatch exit raise
                # CalledProcessError; subprocess.Popen never raises it, so
                # the except clause below was previously unreachable.
                # OSError covers a missing or non-executable sbatch binary.
                # Since sbatch is invoked with '--wait', each submission
                # blocks until its job completes before the next is queued.
                subprocess.run(cmd, env=env, check=True)
            except (subprocess.CalledProcessError, OSError):
                print('Error queueing SLURM job for DALI tests. '
                      'See output for errors.')
                sys.exit(SLURM_QUEUE_ERROR)

bobber/slurm_scripts/dali.sub

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
#SBATCH --job-name bobber_dali
# SPDX-License-Identifier: MIT
set -euxo pipefail

# Required vars (defaults apply when the caller does not export them)
: "${HOSTS:=4}"
: "${FS_PATH:=/mnt/fs}"
: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
: "${LOGDIR:=test_logs/}"
: "${LOGPATH:=test_logs/dali.log}"
: "${BATCH_SIZE_LG:=150}"
: "${BATCH_SIZE_SM:=150}"

mkdir -p "${LOGDIR}"

# The first stage truncates the log; every later stage must append with
# 'tee -a' so all four test sub-sections land in a single log file for the
# analysis code to parse. The plain 'tee' previously used by every stage
# overwrote all earlier results, leaving only the final stage in the log.
srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_setup.sh |& tee "${LOGPATH}"
BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
# Drop the page cache between stages so each test reads from storage.
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/file_read_pipeline_images" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
BATCH_SIZE=${BATCH_SIZE_SM} DATASET_PATH="/mnt/fs_under_test/imageinary_data/800x600/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
BATCH_SIZE=${BATCH_SIZE_LG} DATASET_PATH="/mnt/fs_under_test/imageinary_data/3840x2160/tfrecord_pipeline/tfrecord-*" srun --nodes=${HOSTS} --ntasks-per-node=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_slurm.sh |& tee -a "${LOGPATH}"
srun --nodes=${HOSTS} --exclusive sudo /sbin/sysctl vm.drop_caches=3
srun --nodes=1 --mpi=pmix --exclusive --gres=gpu:8 --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/dali_cleanup.sh |& tee -a "${LOGPATH}"

bobber/slurm_scripts/nccl.sub

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
#SBATCH --job-name bobber_nccl
# SPDX-License-Identifier: MIT
set -euxo pipefail

# Required vars (defaults apply when the caller does not export them)
: "${HOSTS:=4}"
: "${FS_PATH:=/mnt/fs}"
: "${CONT_VERSION:=nvcr.io/nvidian/bobber:6.1.1}"
: "${NCCL_MAX:=1}"
: "${LOGDIR:=test_logs/}"
: "${LOGPATH:=test_logs/nccl.log}"
: "${NCCL_IB_HCAS:=}"
: "${COMPUTE_GID:=0}"
: "${NCCL_TC:=}"

# Quote paths so directories containing spaces do not word-split.
mkdir -p "${LOGDIR}"

# Eight tasks per node across all requested nodes, run inside the Bobber
# container with the filesystem under test mounted at /mnt/fs_under_test.
NCCL_MAX=${NCCL_MAX} NCCL_IB_HCAS=${NCCL_IB_HCAS} COMPUTE_GID=${COMPUTE_GID} NCCL_TC=${NCCL_TC} srun --nodes=${HOSTS} --ntasks-per-node=8 --mpi=pmix --exclusive --container-image "${CONT_VERSION}" --container-mounts="${FS_PATH}:/mnt/fs_under_test" /tests/nccl_slurm.sh |& tee "${LOGPATH}"
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# SPDX-License-Identifier: MIT
# Per-node DALI test runner launched by dali.sub via srun. Expects GPUS,
# BATCH_SIZE, and DATASET_PATH in the environment (exported by dali.sub).

if [ "x${GPUS:-}" = "x" ]; then
    GPUS=8
fi

# dali.sub exports BATCH_SIZE (not BATCH_SIZE_SM/BATCH_SIZE_LG), so the
# fallback must default BATCH_SIZE itself; the old BATCH_SIZE_SM/LG
# defaults were set but never read by the commands below.
if [ "x${BATCH_SIZE:-}" = "x" ]; then
    BATCH_SIZE=150
fi

# dali.sub passes the dataset location as DATASET_PATH; the script
# previously tested the undefined DATASET variable, so the tfrecord
# branch could never be selected and the pipeline path was empty.
if [[ "$DATASET_PATH" == *tfrecord* ]]; then
    python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --tfrecord_pipeline_paths "$DATASET_PATH"
else
    python3 /dali/dali/test/python/test_RN50_data_pipeline.py -b $BATCH_SIZE --epochs=11 -g $GPUS --remove_default_pipeline_paths --file_read_pipeline_paths "$DATASET_PATH"
fi
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
# SPDX-License-Identifier: MIT
# Remove the generated DALI dataset from the filesystem under test.
# -f keeps the cleanup idempotent when the directory is already gone
# (e.g. a rerun after a partial failure).
rm -rf /mnt/fs_under_test/imageinary_data

0 commit comments

Comments
 (0)