From e1c58a33002b8b3721f0a1619f0ee884786c8183 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 09:10:24 +0000 Subject: [PATCH 01/10] added amd-smi interface --- codecarbon/core/gpu.py | 193 ++++++++++++++++++++++++++------ codecarbon/core/util.py | 20 ++++ codecarbon/emissions_tracker.py | 7 +- 3 files changed, 185 insertions(+), 35 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 70a81cabc..97158c4c4 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,10 +1,23 @@ +from collections import namedtuple from dataclasses import dataclass, field -import pynvml - from codecarbon.core.units import Energy, Power, Time +from codecarbon.core.util import is_amd_system, is_nvidia_system from codecarbon.external.logger import logger +USE_AMDSMI = False +USE_PYNVML = False + +if is_nvidia_system(): + import pynvml + + USE_PYNVML = True + +if is_amd_system(): + import amdsmi + + USE_AMDSMI = True + @dataclass class GPUDevice: @@ -92,46 +105,105 @@ def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ - return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + elif USE_AMDSMI: + # returns energy in microjoules (amd-smi metric --energy) + return amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + else: + raise Exception("No GPU interface available") def _get_gpu_name(self): """Returns the name of the GPU device https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481 """ - name = pynvml.nvmlDeviceGetName(self.handle) + if USE_PYNVML: + name = pynvml.nvmlDeviceGetName(self.handle) + elif USE_AMDSMI: + name = 
amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + else: + raise Exception("No GPU interface available") + return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654 """ - uuid = pynvml.nvmlDeviceGetUUID(self.handle) + if USE_PYNVML: + uuid = pynvml.nvmlDeviceGetUUID(self.handle) + elif USE_AMDSMI: + uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + else: + raise Exception("No GPU interface available") + return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8 """ - return pynvml.nvmlDeviceGetMemoryInfo(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetMemoryInfo(self.handle) + elif USE_AMDSMI: + # returns memory in megabytes (amd-smi metric --mem-usage) + memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + return AMDMemory( + total=memory_info["vram_total"] * 1024 * 1024, + used=memory_info["vram_used"] * 1024 * 1024, + free=(memory_info["vram_total"] - memory_info["vram_used"]) + * 1024 + * 1024, + ) + else: + raise Exception("No GPU interface available") def _get_temperature(self): """Returns degrees in the Celsius scale https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - return pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU) + if USE_PYNVML: + return pynvml.nvmlDeviceGetTemperature( + self.handle, + sensor=pynvml.NVML_TEMPERATURE_GPU, + ) + elif USE_AMDSMI: + return amdsmi.amdsmi_dev_get_temp_metric( + self.handle, + sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, + metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + else: + raise 
Exception("No GPU interface available") def _get_power_usage(self): """Returns power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87 """ - return pynvml.nvmlDeviceGetPowerUsage(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetPowerUsage(self.handle) + elif USE_AMDSMI: + # returns power in Watts (amd-smi metric --power) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] + * 1000 + ) + else: + raise Exception("No GPU interface available") def _get_power_limit(self): """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ try: - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + elif USE_AMDSMI: + # returns power limit in Watts (amd-smi static --limit) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + ) except Exception: return None @@ -139,51 +211,100 @@ def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t """ - return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu + if USE_PYNVML: + return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu + elif USE_AMDSMI: + return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + else: + raise Exception("No GPU interface available") def _get_compute_mode(self): """Returns the compute mode of the GPU https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233 """ - return pynvml.nvmlDeviceGetComputeMode(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetComputeMode(self.handle) + elif USE_AMDSMI: + return None + else: 
+ raise Exception("No GPU interface available") def _get_compute_processes(self): - """Returns the list of processes ids having a compute context on the - device with the memory used + """Returns the list of processes ids having a compute context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ try: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: + if USE_PYNVML: + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [ + {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes + ] + elif USE_AMDSMI: + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_info = [ + amdsmi.amdsmi_get_process_info(self.handle, p) + for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_info + ] + except Exception: return [] def _get_graphics_processes(self): - """Returns the list of processes ids having a graphics context on the - device with the memory used + """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ try: - processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: + if USE_PYNVML: + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [ + {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes + ] + elif USE_AMDSMI: + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_info = [ + amdsmi.amdsmi_get_process_info(self.handle, p) + for p in 
processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_info + if p["engine_usage"]["gfx"] > 0 + ] + except Exception: return [] class AllGPUDevices: def __init__(self): if is_gpu_details_available(): - logger.debug("GPU available. Starting setup") - self.device_count = pynvml.nvmlDeviceGetCount() + if USE_PYNVML: + logger.debug("Nvidia GPU available. Starting setup") + pynvml.nvmlInit() + self.device_count = pynvml.nvmlDeviceGetCount() + elif USE_AMDSMI: + logger.debug("AMD GPU available. Starting setup") + amdsmi.amdsmi_init() + self.device_count = len(amdsmi.amdsmi_get_device_handles()) + else: + logger.error("No GPU interface available") + self.device_count = 0 else: logger.error("There is no GPU available") self.device_count = 0 self.devices = [] for i in range(self.device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) + if USE_PYNVML: + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpu_device = GPUDevice(handle=handle, gpu_index=i) + elif USE_AMDSMI: + handle = amdsmi.amdsmi_get_device_handles()[i] + gpu_device = GPUDevice(handle=handle, gpu_index=i) + else: + raise Exception("No GPU interface available") + self.devices.append(gpu_device) def get_gpu_static_info(self): @@ -206,7 +327,7 @@ def get_gpu_static_info(self): devices_static_info.append(gpu_device.get_static_details()) return devices_static_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu static info", exc_info=True) return [] @@ -238,7 +359,7 @@ def get_gpu_details(self): devices_info.append(gpu_device.get_gpu_details()) return devices_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] @@ -261,7 +382,7 @@ def get_delta(self, last_duration: Time): devices_info.append(gpu_device.delta(last_duration)) return devices_info - except pynvml.NVMLError: + except 
Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] @@ -269,8 +390,14 @@ def get_delta(self, last_duration: Time): def is_gpu_details_available(): """Returns True if the GPU details are available.""" try: - pynvml.nvmlInit() - return True + if USE_PYNVML: + pynvml.nvmlInit() + return True + elif USE_AMDSMI: + amdsmi.amdsmi_init() + return True + else: + return False - except pynvml.NVMLError: + except Exception: return False diff --git a/codecarbon/core/util.py b/codecarbon/core/util.py index 7bf66edb3..ef1d7b81b 100644 --- a/codecarbon/core/util.py +++ b/codecarbon/core/util.py @@ -117,3 +117,23 @@ def count_cpus() -> int: num_cpus = num_cpus_matches[0].replace("NumCPUs=", "") logger.debug(f"Detected {num_cpus} cpus available on SLURM.") return int(num_cpus) + + +def is_amd_system(): + """Returns True if the system has an amd-smi interface.""" + try: + # Check if amd-smi is available + subprocess.check_output(["amd-smi", "--help"]) + return True + except subprocess.CalledProcessError: + return False + + +def is_nvidia_system(): + """Returns True if the system has an nvidia-smi interface.""" + try: + # Check if nvidia-smi is available + subprocess.check_output(["nvidia-smi", "--help"]) + return True + except Exception: + return False diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index b16bbf0de..249dd8e9c 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -18,7 +18,7 @@ from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids from codecarbon.core.emissions import Emissions from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import count_cpus, suppress +from codecarbon.core.util import count_cpus, is_amd_system, is_nvidia_system, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata from codecarbon.external.hardware import CPU, GPU, RAM from codecarbon.external.logger import logger, 
set_logger_format, set_logger_level @@ -280,7 +280,10 @@ def __init__( # Hardware detection logger.info("[setup] GPU Tracking...") if gpu.is_gpu_details_available(): - logger.info("Tracking Nvidia GPU via pynvml") + if is_nvidia_system(): + logger.info("Tracking Nvidia GPU via pynvml") + elif is_amd_system(): + logger.info("Tracking AMD GPU via amdsmi") gpu_devices = GPU.from_utils(self._gpu_ids) self._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] From fc93306ee9d8d14265b71efe75b748d4cb91d264 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 09:46:50 +0000 Subject: [PATCH 02/10] fix energy unit --- codecarbon/core/gpu.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 97158c4c4..fe0edc8e7 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -108,8 +108,12 @@ def _get_total_energy_consumption(self): if USE_PYNVML: return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) elif USE_AMDSMI: - # returns energy in microjoules (amd-smi metric --energy) - return amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + # returns energy in "Energy Status Units" which is equivalent to 15.3 microjoules (amd-smi metric --energy) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + * 15.3 + / 1000 + ) else: raise Exception("No GPU interface available") From 0626e4b6a4753ff3ee9b6c684010a94702110444 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 10:04:52 +0000 Subject: [PATCH 03/10] use counter_resolution instead of hard coding it --- codecarbon/core/gpu.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index fe0edc8e7..6fe632a2c 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -108,12 +108,9 @@ def _get_total_energy_consumption(self): if 
USE_PYNVML: return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) elif USE_AMDSMI: - # returns energy in "Energy Status Units" which is equivalent to 15.3 microjoules (amd-smi metric --energy) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] - * 15.3 - / 1000 - ) + # returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules + energy = amdsmi.amdsmi_dev_get_energy_count(self.handle) + return energy["power"] * energy["counter_resolution"] / 1000 else: raise Exception("No GPU interface available") From 37f07ecf9b4ac5781389747fd0a2a92d35ce1197 Mon Sep 17 00:00:00 2001 From: benoit-cty <4-benoit-cty@users.noreply.git.leximpact.dev> Date: Fri, 26 Jan 2024 12:40:28 +0100 Subject: [PATCH 04/10] wip : handle AMD and Nvidia at the same time --- codecarbon/core/gpu.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 6fe632a2c..a69c98086 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -279,34 +279,35 @@ def _get_graphics_processes(self): class AllGPUDevices: + devices = [] + device_count:int = 0 + def __init__(self): + self.devices = [] if is_gpu_details_available(): if USE_PYNVML: logger.debug("Nvidia GPU available. Starting setup") pynvml.nvmlInit() self.device_count = pynvml.nvmlDeviceGetCount() - elif USE_AMDSMI: + for i in range(self.device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpu_device = GPUDevice(handle=handle, gpu_index=i) + self.devices.append(gpu_device) + if USE_AMDSMI: logger.debug("AMD GPU available. 
Starting setup") amdsmi.amdsmi_init() self.device_count = len(amdsmi.amdsmi_get_device_handles()) + for i in range(self.device_count): + handle = amdsmi.amdsmi_get_device_handles()[i] + gpu_device = GPUDevice(handle=handle, gpu_index=i) + self.devices.append(gpu_device) else: logger.error("No GPU interface available") - self.device_count = 0 else: logger.error("There is no GPU available") - self.device_count = 0 - self.devices = [] - for i in range(self.device_count): - if USE_PYNVML: - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) - elif USE_AMDSMI: - handle = amdsmi.amdsmi_get_device_handles()[i] - gpu_device = GPUDevice(handle=handle, gpu_index=i) - else: - raise Exception("No GPU interface available") + self.device_count = len(self.devices) - self.devices.append(gpu_device) + def get_gpu_static_info(self): """Get all GPUs static information. From 0002c2e57681ac3afec8b8c09f32249434deb735 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 29 Jan 2024 14:36:50 +0000 Subject: [PATCH 05/10] added support for amd and nvidia at the same time --- codecarbon/core/gpu.py | 335 +++++++++++++++----------------- codecarbon/core/util.py | 20 -- codecarbon/emissions_tracker.py | 13 +- 3 files changed, 169 insertions(+), 199 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index a69c98086..c846badf9 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,32 +1,66 @@ +import subprocess +from typing import List, Any from collections import namedtuple from dataclasses import dataclass, field + from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import is_amd_system, is_nvidia_system from codecarbon.external.logger import logger -USE_AMDSMI = False -USE_PYNVML = False -if is_nvidia_system(): +def is_rocm_system(): + """Returns True if the system has an rocm-smi interface.""" + try: + # Check if rocm-smi is available + 
subprocess.check_output(["rocm-smi", "--help"]) + return True + except subprocess.CalledProcessError: + return False + + +def is_nvidia_system(): + """Returns True if the system has an nvidia-smi interface.""" + try: + # Check if nvidia-smi is available + subprocess.check_output(["nvidia-smi", "--help"]) + return True + except Exception: + return False + + +try: import pynvml - USE_PYNVML = True + PYNVML_AVAILABLE = True +except ImportError: + if is_nvidia_system(): + logger.warning( + "Nvidia GPU detected but pynvml is not available. " + "Please install pynvml to get GPU metrics." + ) + PYNVML_AVAILABLE = False -if is_amd_system(): +try: import amdsmi - USE_AMDSMI = True + AMDSMI_AVAILABLE = True +except ImportError: + if is_rocm_system(): + logger.warning( + "AMD GPU detected but amdsmi is not available. " + "Please install amdsmi to get GPU metrics." + ) + AMDSMI_AVAILABLE = False @dataclass class GPUDevice: - handle: any + handle: Any gpu_index: int - # Energy consumed in kWh - energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Power based on reading power: Power = field(default_factory=lambda: Power(0)) + # Energy consumed in kWh + energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Last energy reading in kWh last_energy: Energy = field(default_factory=lambda: Energy(0)) @@ -101,213 +135,184 @@ def _to_utf8(self, str_or_bytes): return str_or_bytes + +@dataclass +class NvidiaGPUDevice(GPUDevice): def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) - elif USE_AMDSMI: - # returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules - energy = amdsmi.amdsmi_dev_get_energy_count(self.handle) - return 
energy["power"] * energy["counter_resolution"] / 1000 - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) def _get_gpu_name(self): """Returns the name of the GPU device https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481 """ - if USE_PYNVML: - name = pynvml.nvmlDeviceGetName(self.handle) - elif USE_AMDSMI: - name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] - else: - raise Exception("No GPU interface available") - + name = pynvml.nvmlDeviceGetName(self.handle) return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654 """ - if USE_PYNVML: - uuid = pynvml.nvmlDeviceGetUUID(self.handle) - elif USE_AMDSMI: - uuid = amdsmi.amdsmi_get_device_uuid(self.handle) - else: - raise Exception("No GPU interface available") - + uuid = pynvml.nvmlDeviceGetUUID(self.handle) return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetMemoryInfo(self.handle) - elif USE_AMDSMI: - # returns memory in megabytes (amd-smi metric --mem-usage) - memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) - AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) - return AMDMemory( - total=memory_info["vram_total"] * 1024 * 1024, - used=memory_info["vram_used"] * 1024 * 1024, - free=(memory_info["vram_total"] - memory_info["vram_used"]) - * 1024 - * 1024, - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetMemoryInfo(self.handle) def _get_temperature(self): """Returns degrees in the Celsius scale 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetTemperature( - self.handle, - sensor=pynvml.NVML_TEMPERATURE_GPU, - ) - elif USE_AMDSMI: - return amdsmi.amdsmi_dev_get_temp_metric( - self.handle, - sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, - metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetTemperature( + self.handle, sensor=pynvml.NVML_TEMPERATURE_GPU + ) def _get_power_usage(self): """Returns power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetPowerUsage(self.handle) - elif USE_AMDSMI: - # returns power in Watts (amd-smi metric --power) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] - * 1000 - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetPowerUsage(self.handle) def _get_power_limit(self): """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ - try: - if USE_PYNVML: - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) - elif USE_AMDSMI: - # returns power limit in Watts (amd-smi static --limit) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 - ) - except Exception: - return None + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu - elif USE_AMDSMI: - return 
amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu def _get_compute_mode(self): """Returns the compute mode of the GPU https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetComputeMode(self.handle) - elif USE_AMDSMI: - return None - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetComputeMode(self.handle) def _get_compute_processes(self): """Returns the list of processes ids having a compute context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ - try: - if USE_PYNVML: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - return [ - {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes - ] - elif USE_AMDSMI: - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_info = [ - amdsmi.amdsmi_get_process_info(self.handle, p) - for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_info - ] - except Exception: - return [] + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] def _get_graphics_processes(self): """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ - try: - if USE_PYNVML: - processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) - return [ - {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes - ] - elif USE_AMDSMI: - processes_handles = 
amdsmi.amdsmi_get_process_list(self.handle) - processes_info = [ - amdsmi.amdsmi_get_process_info(self.handle, p) - for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_info - if p["engine_usage"]["gfx"] > 0 - ] - except Exception: - return [] + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] + + +class AMDGPUDevice(GPUDevice): + def _get_total_energy_consumption(self): + """Returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules""" + energy_count = amdsmi.amdsmi_dev_get_energy_count(self.handle) + energy = energy_count["power"] * energy_count["counter_resolution"] / 1000 + return energy + + def _get_gpu_name(self): + """Returns the name of the GPU device""" + name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + return self._to_utf8(name) + + def _get_uuid(self): + """Returns the globally unique GPU device UUID""" + uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + return self._to_utf8(uuid) + + def _get_memory_info(self): + """Returns memory info in bytes""" + memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + return AMDMemory( + total=memory_info["vram_total"] * 1024 * 1024, + used=memory_info["vram_used"] * 1024 * 1024, + free=(memory_info["vram_total"] - memory_info["vram_used"]) * 1024 * 1024, + ) + + def _get_temperature(self): + """Returns degrees in the Celsius scale""" + return amdsmi.amdsmi_dev_get_temp_metric( + self.handle, + sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, + metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + + def _get_power_usage(self): + """Returns power usage in milliwatts""" + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] * 1000 + ) + + def _get_power_limit(self): + """Returns max power usage in 
milliwatts""" + return amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + + def _get_gpu_utilization(self): + """Returns the % of utilization of the kernels during the last sample""" + return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + + def _get_compute_mode(self): + """Returns the compute mode of the GPU""" + return None + + def _get_compute_processes(self): + """Returns the list of processes ids having a compute context on the device with the memory used""" + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_infos = [ + amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_mem"]} + for p in processes_infos + ] + + def _get_graphics_processes(self): + """Returns the list of processes ids having a graphics context on the device with the memory used""" + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_infos = [ + amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_infos + if p["engine_usage"]["gfx"] > 0 + ] class AllGPUDevices: - devices = [] - device_count:int = 0 - + device_count: int + devices: List[GPUDevice] + def __init__(self): self.devices = [] - if is_gpu_details_available(): - if USE_PYNVML: - logger.debug("Nvidia GPU available. Starting setup") - pynvml.nvmlInit() - self.device_count = pynvml.nvmlDeviceGetCount() - for i in range(self.device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) - self.devices.append(gpu_device) - if USE_AMDSMI: - logger.debug("AMD GPU available. 
Starting setup") - amdsmi.amdsmi_init() - self.device_count = len(amdsmi.amdsmi_get_device_handles()) - for i in range(self.device_count): - handle = amdsmi.amdsmi_get_device_handles()[i] - gpu_device = GPUDevice(handle=handle, gpu_index=i) - self.devices.append(gpu_device) - else: - logger.error("No GPU interface available") - else: - logger.error("There is no GPU available") - self.device_count = len(self.devices) - + if is_nvidia_system() and PYNVML_AVAILABLE: + logger.debug("PyNVML available. Starting setup") + pynvml.nvmlInit() + nvidia_devices_count = pynvml.nvmlDeviceGetCount() + for i in range(nvidia_devices_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i) + self.devices.append(nvidia_gpu_device) + + if is_rocm_system() and AMDSMI_AVAILABLE: + logger.debug("AMDSMI available. Starting setup") + amdsmi.amdsmi_init() + amd_devices_handles = amdsmi.amdsmi_get_device_handles() + for i, handle in enumerate(amd_devices_handles): + amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) + self.devices.append(amd_gpu_device) + + self.device_count = len(self.devices) def get_gpu_static_info(self): """Get all GPUs static information. 
@@ -357,7 +362,7 @@ def get_gpu_details(self): try: devices_info = [] for i in range(self.device_count): - gpu_device: GPUDevice = self.devices[i] + gpu_device = self.devices[i] devices_info.append(gpu_device.get_gpu_details()) return devices_info @@ -380,26 +385,10 @@ def get_delta(self, last_duration: Time): try: devices_info = [] for i in range(self.device_count): - gpu_device: GPUDevice = self.devices[i] + gpu_device = self.devices[i] devices_info.append(gpu_device.delta(last_duration)) return devices_info except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] - - -def is_gpu_details_available(): - """Returns True if the GPU details are available.""" - try: - if USE_PYNVML: - pynvml.nvmlInit() - return True - elif USE_AMDSMI: - amdsmi.amdsmi_init() - return True - else: - return False - - except Exception: - return False diff --git a/codecarbon/core/util.py b/codecarbon/core/util.py index ef1d7b81b..7bf66edb3 100644 --- a/codecarbon/core/util.py +++ b/codecarbon/core/util.py @@ -117,23 +117,3 @@ def count_cpus() -> int: num_cpus = num_cpus_matches[0].replace("NumCPUs=", "") logger.debug(f"Detected {num_cpus} cpus available on SLURM.") return int(num_cpus) - - -def is_amd_system(): - """Returns True if the system has an amd-smi interface.""" - try: - # Check if amd-smi is available - subprocess.check_output(["amd-smi", "--help"]) - return True - except subprocess.CalledProcessError: - return False - - -def is_nvidia_system(): - """Returns True if the system has an nvidia-smi interface.""" - try: - # Check if nvidia-smi is available - subprocess.check_output(["nvidia-smi", "--help"]) - return True - except Exception: - return False diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 249dd8e9c..cb8469e37 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -18,7 +18,7 @@ from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids from 
codecarbon.core.emissions import Emissions from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import count_cpus, is_amd_system, is_nvidia_system, suppress +from codecarbon.core.util import count_cpus, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata from codecarbon.external.hardware import CPU, GPU, RAM from codecarbon.external.logger import logger, set_logger_format, set_logger_level @@ -279,11 +279,12 @@ def __init__( # Hardware detection logger.info("[setup] GPU Tracking...") - if gpu.is_gpu_details_available(): - if is_nvidia_system(): - logger.info("Tracking Nvidia GPU via pynvml") - elif is_amd_system(): - logger.info("Tracking AMD GPU via amdsmi") + if gpu.is_nvidia_system() or gpu.is_rocm_system(): + if gpu.is_nvidia_system(): + logger.info("Tracking Nvidia GPUs via PyNVML") + elif gpu.is_rocm_system(): + logger.info("Tracking AMD GPUs via AMDSMI") + gpu_devices = GPU.from_utils(self._gpu_ids) self._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] From c498440ae8237083f930af3c4d1af73018332a59 Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 15:01:45 +0100 Subject: [PATCH 06/10] Fix merge conflict --- codecarbon/core/gpu.py | 19 +++++---- codecarbon/core/resource_tracker.py | 28 +++++++------ codecarbon/emissions_tracker.py | 65 ----------------------------- 3 files changed, 26 insertions(+), 86 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 04cadcd01..d764142c3 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,10 +1,8 @@ import subprocess -from typing import List, Any from collections import namedtuple from dataclasses import dataclass, field from typing import Any, Dict, List, Union - from codecarbon.core.units import Energy, Power, Time from codecarbon.external.logger import logger @@ -15,7 +13,7 @@ def is_rocm_system(): # Check if rocm-smi is available 
subprocess.check_output(["rocm-smi", "--help"]) return True - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, OSError): return False @@ -201,9 +199,7 @@ def _get_temperature(self) -> int: """Returns degrees in the Celsius scale https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - return pynvml.nvmlDeviceGetTemperature( - self.handle, sensor=pynvml.NVML_TEMPERATURE_GPU - ) + return pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU) def _get_power_usage(self) -> int: """Returns power usage in milliwatts @@ -215,7 +211,11 @@ def _get_power_limit(self) -> Union[int, None]: """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + try: + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + except Exception: + logger.warning("Failed to retrieve gpu power limit", exc_info=True) + return None def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample @@ -328,7 +328,8 @@ class AllGPUDevices: devices: List[GPUDevice] def __init__(self) -> None: - if is_gpu_details_available(): + gpu_details_available = is_gpu_details_available() + if gpu_details_available: logger.debug("GPU available. Starting setup") self.device_count = pynvml.nvmlDeviceGetCount() else: @@ -336,7 +337,7 @@ def __init__(self) -> None: self.device_count = 0 self.devices = [] - if is_nvidia_system() and PYNVML_AVAILABLE: + if gpu_details_available and PYNVML_AVAILABLE: logger.debug("PyNVML available. 
Starting setup") pynvml.nvmlInit() nvidia_devices_count = pynvml.nvmlDeviceGetCount() diff --git a/codecarbon/core/resource_tracker.py b/codecarbon/core/resource_tracker.py index 120faf4ef..cbf61de18 100644 --- a/codecarbon/core/resource_tracker.py +++ b/codecarbon/core/resource_tracker.py @@ -209,14 +209,20 @@ def set_CPU_tracking(self): def set_GPU_tracking(self): logger.info("[setup] GPU Tracking...") - if self.tracker._gpu_ids: + if isinstance(self.tracker._gpu_ids, str): self.tracker._gpu_ids = parse_gpu_ids(self.tracker._gpu_ids) - if self.tracker._gpu_ids: - self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids - self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids) - - if gpu.is_gpu_details_available(): - logger.info("Tracking Nvidia GPU via pynvml") + self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids + self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids) + + is_nvidia = gpu.is_nvidia_system() + is_rocm = gpu.is_rocm_system() + if is_nvidia or is_rocm: + if is_nvidia: + logger.info("Tracking Nvidia GPUs via PyNVML") + self.gpu_tracker = "pynvml" + else: + logger.info("Tracking AMD GPUs via AMDSMI") + self.gpu_tracker = "amdsmi" gpu_devices = GPU.from_utils(self.tracker._gpu_ids) self.tracker._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] @@ -224,11 +230,9 @@ def set_GPU_tracking(self): self.tracker._conf["gpu_model"] = "".join( [f"{i} x {name}" for name, i in gpu_names_dict.items()] ) - if self.tracker._conf.get("gpu_count") is None: - self.tracker._conf["gpu_count"] = len( - gpu_devices.devices.get_gpu_static_info() - ) - self.gpu_tracker = "pynvml" + self.tracker._conf["gpu_count"] = len( + gpu_devices.devices.get_gpu_static_info() + ) else: logger.info("No GPU found.") diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index f06c855a1..3fdd4f1b8 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -369,71 +369,6 @@ def 
__init__( self._tasks: Dict[str, Task] = {} self._active_task: Optional[str] = None self._active_task_emissions_at_start: Optional[EmissionsData] = None -# TODO: move this in ResourceTracker() -# if isinstance(self._gpu_ids, str): -# self._gpu_ids: List[int] = parse_gpu_ids(self._gpu_ids) -# self._conf["gpu_ids"] = self._gpu_ids -# self._conf["gpu_count"] = len(self._gpu_ids) - -# logger.info("[setup] RAM Tracking...") -# ram = RAM(tracking_mode=self._tracking_mode) -# self._conf["ram_total_size"] = ram.machine_memory_GB -# self._hardware: List[Union[RAM, CPU, GPU]] = [ram] - -# # Hardware detection -# logger.info("[setup] GPU Tracking...") -# if gpu.is_nvidia_system() or gpu.is_rocm_system(): -# if gpu.is_nvidia_system(): -# logger.info("Tracking Nvidia GPUs via PyNVML") -# elif gpu.is_rocm_system(): -# logger.info("Tracking AMD GPUs via AMDSMI") - -# gpu_devices = GPU.from_utils(self._gpu_ids) -# self._hardware.append(gpu_devices) -# gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] -# gpu_names_dict = Counter(gpu_names) -# self._conf["gpu_model"] = "".join( -# [f"{i} x {name}" for name, i in gpu_names_dict.items()] -# ) -# self._conf["gpu_count"] = len(gpu_devices.devices.get_gpu_static_info()) -# else: -# logger.info("No GPU found.") - -# logger.info("[setup] CPU Tracking...") -# if cpu.is_powergadget_available(): -# logger.info("Tracking Intel CPU via Power Gadget") -# hardware = CPU.from_utils(self._output_dir, "intel_power_gadget") -# self._hardware.append(hardware) -# self._conf["cpu_model"] = hardware.get_model() -# elif cpu.is_rapl_available(): -# logger.info("Tracking Intel CPU via RAPL interface") -# hardware = CPU.from_utils(self._output_dir, "intel_rapl") -# self._hardware.append(hardware) -# self._conf["cpu_model"] = hardware.get_model() -# else: -# logger.warning( -# "No CPU tracking mode found. Falling back on CPU constant mode." 
-# ) -# tdp = cpu.TDP() -# power = tdp.tdp -# model = tdp.model -# if (power is None) and self._default_cpu_power: -# # We haven't been able to calculate CPU power but user has input a default one. We use it -# user_input_power = self._default_cpu_power -# logger.debug(f"Using user input TDP: {user_input_power} W") -# power = user_input_power -# logger.info(f"CPU Model on constant consumption mode: {model}") -# self._conf["cpu_model"] = model -# if tdp: -# hardware = CPU.from_utils(self._output_dir, "constant", model, power) -# self._hardware.append(hardware) -# else: -# logger.warning( -# "Failed to match CPU TDP constant. " -# + "Falling back on a global constant." -# ) -# hardware = CPU.from_utils(self._output_dir, "constant") -# self._hardware.append(hardware) # Tracking mode detection self._hardware = [] resource_tracker = ResourceTracker(self) From 097e674ac274608c650650d86d92c968c251d96b Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 16:30:29 +0100 Subject: [PATCH 07/10] Upgrade AMDSMI entries --- codecarbon/core/gpu.py | 94 +++++++++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index d764142c3..28ddfa0a7 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -50,6 +50,15 @@ def is_nvidia_system(): "Please install amdsmi to get GPU metrics." ) AMDSMI_AVAILABLE = False +except AttributeError as e: + # In some environments, amdsmi may be present but not properly configured, leading to AttributeError when importing + logger.warning( + "AMD GPU detected but amdsmi is not properly configured. " + "Please ensure amdsmi is correctly installed to get GPU metrics." + "Tips : check consistency between Python amdsmi package and ROCm versions, and ensure AMD drivers are up to date." 
+ f" Error: {e}" + ) + AMDSMI_AVAILABLE = False @dataclass @@ -248,52 +257,72 @@ def _get_graphics_processes(self) -> List: class AMDGPUDevice(GPUDevice): def _get_total_energy_consumption(self): - """Returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules""" - energy_count = amdsmi.amdsmi_dev_get_energy_count(self.handle) + """Returns energy in millijoules. Energy Status Units is equivalent to around 15.3 microjoules.""" + energy_count = amdsmi.amdsmi_get_energy_count(self.handle) + # energy_count contains 'power' and 'counter_resolution' + # Result is in uJ (microjoules), convert to mJ energy = energy_count["power"] * energy_count["counter_resolution"] / 1000 return energy def _get_gpu_name(self): """Returns the name of the GPU device""" - name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + try: + asic_info = amdsmi.amdsmi_get_gpu_asic_info(self.handle) + name = asic_info.get("market_name", "Unknown GPU") + except Exception: + name = "Unknown GPU" return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID""" - uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + uuid = amdsmi.amdsmi_get_gpu_device_uuid(self.handle) return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes""" - memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + memory_info = amdsmi.amdsmi_get_gpu_vram_usage(self.handle) AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + # vram_total and vram_used are already in MB + total_mb = memory_info["vram_total"] + used_mb = memory_info["vram_used"] return AMDMemory( - total=memory_info["vram_total"] * 1024 * 1024, - used=memory_info["vram_used"] * 1024 * 1024, - free=(memory_info["vram_total"] - memory_info["vram_used"]) * 1024 * 1024, + total=total_mb * 1024 * 1024, + used=used_mb * 1024 * 1024, + free=(total_mb - used_mb) * 1024 * 1024, ) def _get_temperature(self): - """Returns degrees in the Celsius scale""" - return 
amdsmi.amdsmi_dev_get_temp_metric( + """Returns degrees in the Celsius scale. Returns temperature in millidegrees Celsius.""" + # amdsmi_get_temp_metric returns temperature in millidegrees Celsius + temp_milli_celsius = amdsmi.amdsmi_get_temp_metric( self.handle, sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, ) + # Convert from millidegrees to degrees + return temp_milli_celsius // 1000 def _get_power_usage(self): """Returns power usage in milliwatts""" - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] * 1000 - ) + # amdsmi_get_power_info returns power in watts, convert to milliwatts + power_info = amdsmi.amdsmi_get_power_info(self.handle) + return int(power_info["average_socket_power"] * 1000) def _get_power_limit(self): """Returns max power usage in milliwatts""" - return amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + # Get power cap info which contains power_cap in uW (microwatts) + try: + power_cap_info = amdsmi.amdsmi_get_power_cap_info(self.handle) + # power_cap is in uW, convert to mW + return int(power_cap_info["power_cap"] / 1000) + except Exception: + logger.warning("Failed to retrieve gpu power cap", exc_info=True) + return None def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample""" - return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + activity = amdsmi.amdsmi_get_gpu_activity(self.handle) + return activity["gfx_activity"] def _get_compute_mode(self): """Returns the compute mode of the GPU""" @@ -301,26 +330,25 @@ def _get_compute_mode(self): def _get_compute_processes(self): """Returns the list of processes ids having a compute context on the device with the memory used""" - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_infos = [ - amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": 
p["memory_usage"]["vram_mem"]} - for p in processes_infos - ] + try: + processes = amdsmi.amdsmi_get_gpu_process_list(self.handle) + return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes] + except Exception: + logger.warning("Failed to retrieve gpu compute processes", exc_info=True) + return [] def _get_graphics_processes(self): """Returns the list of processes ids having a graphics context on the device with the memory used""" - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_infos = [ - amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_infos - if p["engine_usage"]["gfx"] > 0 - ] + try: + processes = amdsmi.amdsmi_get_gpu_process_list(self.handle) + return [ + {"pid": p["pid"], "used_memory": p["mem"]} + for p in processes + if p["engine_usage"].get("gfx", 0) > 0 + ] + except Exception: + logger.warning("Failed to retrieve gpu graphics processes", exc_info=True) + return [] class AllGPUDevices: @@ -349,7 +377,7 @@ def __init__(self) -> None: if is_rocm_system() and AMDSMI_AVAILABLE: logger.debug("AMDSMI available. 
Starting setup") amdsmi.amdsmi_init() - amd_devices_handles = amdsmi.amdsmi_get_device_handles() + amd_devices_handles = amdsmi.amdsmi_get_processor_handles() for i, handle in enumerate(amd_devices_handles): amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) self.devices.append(amd_gpu_device) From a24d2462b90538b13c4e69ef951ac34aab990bba Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 16:44:43 +0100 Subject: [PATCH 08/10] Remove warning for amdsmi.amdsmi_get_gpu_process_list --- codecarbon/core/gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 28ddfa0a7..854d978c9 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -334,7 +334,7 @@ def _get_compute_processes(self): processes = amdsmi.amdsmi_get_gpu_process_list(self.handle) return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes] except Exception: - logger.warning("Failed to retrieve gpu compute processes", exc_info=True) + # logger.warning("Failed to retrieve gpu compute processes", exc_info=True) return [] def _get_graphics_processes(self): @@ -347,7 +347,7 @@ def _get_graphics_processes(self): if p["engine_usage"].get("gfx", 0) > 0 ] except Exception: - logger.warning("Failed to retrieve gpu graphics processes", exc_info=True) + # logger.warning("Failed to retrieve gpu graphics processes", exc_info=True) return [] From dd178e809e774b2d2cf9b3f85eb14ed6f2ddb50e Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 17:00:48 +0100 Subject: [PATCH 09/10] Debug detection --- codecarbon/core/gpu.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 854d978c9..42a1d4572 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -30,6 +30,7 @@ def is_nvidia_system(): try: import pynvml + pynvml.nvmlInit() PYNVML_AVAILABLE = True except ImportError: if 
is_nvidia_system(): @@ -38,6 +39,13 @@ def is_nvidia_system(): "Please install pynvml to get GPU metrics." ) PYNVML_AVAILABLE = False +except Exception: + if is_nvidia_system(): + logger.warning( + "Nvidia GPU detected but pynvml initialization failed. " + "Please ensure NVIDIA drivers are properly installed." + ) + PYNVML_AVAILABLE = False try: import amdsmi @@ -365,7 +373,7 @@ def __init__(self) -> None: self.device_count = 0 self.devices = [] - if gpu_details_available and PYNVML_AVAILABLE: + if PYNVML_AVAILABLE: logger.debug("PyNVML available. Starting setup") pynvml.nvmlInit() nvidia_devices_count = pynvml.nvmlDeviceGetCount() @@ -374,14 +382,24 @@ def __init__(self) -> None: nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i) self.devices.append(nvidia_gpu_device) - if is_rocm_system() and AMDSMI_AVAILABLE: + if AMDSMI_AVAILABLE: logger.debug("AMDSMI available. Starting setup") - amdsmi.amdsmi_init() - amd_devices_handles = amdsmi.amdsmi_get_device_handles() - for i, handle in enumerate(amd_devices_handles): - amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) - self.devices.append(amd_gpu_device) - + try: + amdsmi.amdsmi_init() + amd_devices_handles = amdsmi.amdsmi_get_processor_handles() + if len(amd_devices_handles) == 0: + print( + "No AMD GPUs found on machine with amdsmi_get_processor_handles() !" 
+ ) + else: + for i, handle in enumerate(amd_devices_handles): + logger.debug( + f"Found AMD GPU device with handle {handle} and index {i} : {amdsmi.amdsmi_get_gpu_device_uuid(handle)}" + ) + amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) + self.devices.append(amd_gpu_device) + except amdsmi.AmdSmiException as e: + logger.warning(f"Failed to initialize AMDSMI: {e}", exc_info=True) self.device_count = len(self.devices) def get_gpu_static_info(self) -> List: @@ -466,9 +484,4 @@ def get_delta(self, last_duration: Time) -> List: def is_gpu_details_available() -> bool: """Returns True if the GPU details are available.""" - try: - pynvml.nvmlInit() - return True - - except pynvml.NVMLError: - return False + return PYNVML_AVAILABLE or AMDSMI_AVAILABLE From 333afc48f3a93992a2542ffd5a642e9e28999be0 Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 17:03:40 +0100 Subject: [PATCH 10/10] Fix Uninitialized --- codecarbon/core/gpu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 42a1d4572..f2bf16d9f 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -367,10 +367,8 @@ def __init__(self) -> None: gpu_details_available = is_gpu_details_available() if gpu_details_available: logger.debug("GPU available. Starting setup") - self.device_count = pynvml.nvmlDeviceGetCount() else: logger.error("There is no GPU available") - self.device_count = 0 self.devices = [] if PYNVML_AVAILABLE: