From cc61ed36818e376eb29d2ea30104996173a994fa Mon Sep 17 00:00:00 2001
From: Claudia Watson
Date: Wed, 22 Oct 2025 15:52:43 +0100
Subject: [PATCH 1/2] Adds bandwidth.yml playbook to download, build, and run
 nvbandwidth. bandwidth.yml is run via cudatests.yml

---
 ansible/adhoc/cudatests.yml            |  5 +++
 ansible/roles/cuda/defaults/main.yml   |  3 ++
 ansible/roles/cuda/tasks/bandwidth.yml | 56 ++++++++++++++++++++++++++
 3 files changed, 64 insertions(+)
 create mode 100644 ansible/roles/cuda/tasks/bandwidth.yml

diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml
index f571f8a89..8c325158a 100644
--- a/ansible/adhoc/cudatests.yml
+++ b/ansible/adhoc/cudatests.yml
@@ -7,3 +7,8 @@
     - ansible.builtin.import_role:
         name: cuda
         tasks_from: samples.yml
+
+    - name: Run CUDA bandwidth tasks
+      ansible.builtin.import_role:
+        name: cuda
+        tasks_from: bandwidth.yml
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index 692301d23..14d3d90f7 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -16,3 +16,6 @@ cuda_samples_programs:
   - bandwidthTest
 # cuda_devices: # discovered from deviceQuery run
 cuda_persistenced_state: started
+# variables for nvbandwidth (for bandwidth.yml tasks run in cudatests.yml)
+cuda_bandwidth_path: "/var/lib/{{ ansible_user }}/cuda_bandwidth"
+cuda_bandwidth_release_url: "https://github.com/NVIDIA/nvbandwidth/archive/refs/tags/v0.8.tar.gz"
diff --git a/ansible/roles/cuda/tasks/bandwidth.yml b/ansible/roles/cuda/tasks/bandwidth.yml
new file mode 100644
index 000000000..0d18088f6
--- /dev/null
+++ b/ansible/roles/cuda/tasks/bandwidth.yml
@@ -0,0 +1,56 @@
+---
+- name: Ensure cuda_bandwidth_path exists
+  ansible.builtin.file:
+    state: directory
+    path: "{{ cuda_bandwidth_path }}"
+    owner: "{{ ansible_user }}"
+    group: "{{ ansible_user }}"
+    mode: "0755"
+
+- name: Download CUDA bandwidth test release
+  ansible.builtin.unarchive:
+    remote_src: true
+    src: "{{ cuda_bandwidth_release_url }}"
+    dest: "{{ cuda_bandwidth_path }}"
+    owner: "{{ ansible_user }}"
+    group: "{{ ansible_user }}"
+    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8"
+
+- name: Creates CUDA bandwidth test build directory
+  ansible.builtin.file:
+    state: directory
+    path: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
+    mode: "0755"
+
+- name: Ensure cudatests directory exists
+  ansible.builtin.file:
+    path: "{{ appliances_environment_root }}/cudatests"
+    state: directory
+    mode: '0755'
+  delegate_to: localhost
+
+- name: Build CUDA bandwidth test
+  ansible.builtin.shell:
+    cmd: |
+      source /cvmfs/software.eessi.io/versions/2023.06/init/bash &&
+      module load Boost/1.82.0-GCC-12.3.0 &&
+      . /etc/profile.d/sh.local && cmake .. &&
+      make -j {{ ansible_processor_vcpus }}
+    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
+    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/nvbandwidth"
+
+- name: Run CUDA bandwidth test
+  ansible.builtin.shell: |
+    export LD_LIBRARY_PATH=/cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/GCCcore/12.3.0/lib64:\
+    /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/Boost/1.82.0-GCC-12.3.0/lib
+    ./nvbandwidth
+  args:
+    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/"
+  register: cuda_bandwidth_output
+
+- name: Save CUDA bandwidth output to bandwidth_results.txt
+  ansible.builtin.copy:
+    content: "{{ cuda_bandwidth_output.stdout }}"
+    dest: "{{ appliances_environment_root }}/cudatests/bandwidth_results.txt"
+    mode: '0644'
+  delegate_to: localhost

From 358c47e448670951174aa4aba3926f04828cfe25 Mon Sep 17 00:00:00 2001
From: Claudia Watson
Date: Tue, 28 Oct 2025 13:05:26 +0000
Subject: [PATCH 2/2] Adds bandwidth.yml playbook for NVIDIA nvbandwidth and
 removes samples.yml tasks from adhoc/cudatests.yml

Also derives the unpacked source directory from cuda_bandwidth_version
(new default cuda_bandwidth_source_path) instead of hard-coding
nvbandwidth-0.8 in the task paths, and separates the hostname from the
results file name with an underscore.

---
 ansible/adhoc/cudatests.yml            |  6 +-----
 ansible/roles/cuda/defaults/main.yml   |  5 ++++-
 ansible/roles/cuda/tasks/bandwidth.yml | 23 ++++++++++++-----------
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml
index 8c325158a..0f5cf78a0 100644
--- a/ansible/adhoc/cudatests.yml
+++ b/ansible/adhoc/cudatests.yml
@@ -2,12 +2,8 @@
 - hosts: cuda
   become: true
   gather_facts: true
-  tags: cuda_samples
+  tags: cuda_bandwidth
   tasks:
-    - ansible.builtin.import_role:
-        name: cuda
-        tasks_from: samples.yml
-
     - name: Run CUDA bandwidth tasks
       ansible.builtin.import_role:
         name: cuda
diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml
index 14d3d90f7..5de10d8f8 100644
--- a/ansible/roles/cuda/defaults/main.yml
+++ b/ansible/roles/cuda/defaults/main.yml
@@ -17,5 +17,8 @@ cuda_samples_programs:
 # cuda_devices: # discovered from deviceQuery run
 cuda_persistenced_state: started
 # variables for nvbandwidth (for bandwidth.yml tasks run in cudatests.yml)
+cuda_bandwidth_version: 'v0.8'
 cuda_bandwidth_path: "/var/lib/{{ ansible_user }}/cuda_bandwidth"
-cuda_bandwidth_release_url: "https://github.com/NVIDIA/nvbandwidth/archive/refs/tags/v0.8.tar.gz"
+cuda_bandwidth_release_url: "https://github.com/NVIDIA/nvbandwidth/archive/refs/tags/{{ cuda_bandwidth_version }}.tar.gz"
+# GitHub tag archives unpack to <repo>-<tag without leading "v">, e.g. nvbandwidth-0.8
+cuda_bandwidth_source_path: "{{ cuda_bandwidth_path }}/nvbandwidth-{{ cuda_bandwidth_version | regex_replace('^v', '') }}"
diff --git a/ansible/roles/cuda/tasks/bandwidth.yml b/ansible/roles/cuda/tasks/bandwidth.yml
index 0d18088f6..bef5c5d94 100644
--- a/ansible/roles/cuda/tasks/bandwidth.yml
+++ b/ansible/roles/cuda/tasks/bandwidth.yml
@@ -1,5 +1,5 @@
 ---
-- name: Ensure cuda_bandwidth_path exists
+- name: Ensure CUDA bandwidth path exists
   ansible.builtin.file:
     state: directory
     path: "{{ cuda_bandwidth_path }}"
@@ -14,12 +14,12 @@
     dest: "{{ cuda_bandwidth_path }}"
     owner: "{{ ansible_user }}"
     group: "{{ ansible_user }}"
-    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8"
+    creates: "{{ cuda_bandwidth_source_path }}"
 
 - name: Creates CUDA bandwidth test build directory
   ansible.builtin.file:
     state: directory
-    path: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
+    path: "{{ cuda_bandwidth_source_path }}/build"
     mode: "0755"
 
 - name: Ensure cudatests directory exists
@@ -31,26 +31,27 @@
 
 - name: Build CUDA bandwidth test
   ansible.builtin.shell:
-    cmd: |
+    cmd: >
       source /cvmfs/software.eessi.io/versions/2023.06/init/bash &&
+      module load buildenv/default-foss-2023b &&
       module load Boost/1.82.0-GCC-12.3.0 &&
-      . /etc/profile.d/sh.local && cmake .. &&
+      . /etc/profile.d/sh.local &&
+      cmake .. &&
       make -j {{ ansible_processor_vcpus }}
-    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build"
-    creates: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/nvbandwidth"
+    chdir: "{{ cuda_bandwidth_source_path }}/build"
+    creates: "{{ cuda_bandwidth_source_path }}/build/nvbandwidth"
 
 - name: Run CUDA bandwidth test
   ansible.builtin.shell: |
-    export LD_LIBRARY_PATH=/cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/GCCcore/12.3.0/lib64:\
-    /cvmfs/software.eessi.io/versions/2023.06/software/linux/x86_64/amd/zen4/software/Boost/1.82.0-GCC-12.3.0/lib
     ./nvbandwidth
   args:
-    chdir: "{{ cuda_bandwidth_path }}/nvbandwidth-0.8/build/"
+    chdir: "{{ cuda_bandwidth_source_path }}/build/"
   register: cuda_bandwidth_output
+  changed_when: true
 
 - name: Save CUDA bandwidth output to bandwidth_results.txt
   ansible.builtin.copy:
     content: "{{ cuda_bandwidth_output.stdout }}"
-    dest: "{{ appliances_environment_root }}/cudatests/bandwidth_results.txt"
+    dest: "{{ appliances_environment_root }}/cudatests/{{ inventory_hostname }}_bandwidth_results.txt"
     mode: '0644'
   delegate_to: localhost