From e1c58a33002b8b3721f0a1619f0ee884786c8183 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 09:10:24 +0000 Subject: [PATCH 01/10] added amd-smi interface --- codecarbon/core/gpu.py | 193 ++++++++++++++++++++++++++------ codecarbon/core/util.py | 20 ++++ codecarbon/emissions_tracker.py | 7 +- 3 files changed, 185 insertions(+), 35 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 70a81cabc..97158c4c4 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,10 +1,23 @@ +from collections import namedtuple from dataclasses import dataclass, field -import pynvml - from codecarbon.core.units import Energy, Power, Time +from codecarbon.core.util import is_amd_system, is_nvidia_system from codecarbon.external.logger import logger +USE_AMDSMI = False +USE_PYNVML = False + +if is_nvidia_system(): + import pynvml + + USE_PYNVML = True + +if is_amd_system(): + import amdsmi + + USE_AMDSMI = True + @dataclass class GPUDevice: @@ -92,46 +105,105 @@ def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ - return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) + elif USE_AMDSMI: + # returns energy in microjoules (amd-smi metric --energy) + return amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + else: + raise Exception("No GPU interface available") def _get_gpu_name(self): """Returns the name of the GPU device https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481 """ - name = pynvml.nvmlDeviceGetName(self.handle) + if USE_PYNVML: + name = pynvml.nvmlDeviceGetName(self.handle) + elif USE_AMDSMI: + name = 
amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + else: + raise Exception("No GPU interface available") + return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654 """ - uuid = pynvml.nvmlDeviceGetUUID(self.handle) + if USE_PYNVML: + uuid = pynvml.nvmlDeviceGetUUID(self.handle) + elif USE_AMDSMI: + uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + else: + raise Exception("No GPU interface available") + return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8 """ - return pynvml.nvmlDeviceGetMemoryInfo(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetMemoryInfo(self.handle) + elif USE_AMDSMI: + # returns memory in megabytes (amd-smi metric --mem-usage) + memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + return AMDMemory( + total=memory_info["vram_total"] * 1024 * 1024, + used=memory_info["vram_used"] * 1024 * 1024, + free=(memory_info["vram_total"] - memory_info["vram_used"]) + * 1024 + * 1024, + ) + else: + raise Exception("No GPU interface available") def _get_temperature(self): """Returns degrees in the Celsius scale https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - return pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU) + if USE_PYNVML: + return pynvml.nvmlDeviceGetTemperature( + self.handle, + sensor=pynvml.NVML_TEMPERATURE_GPU, + ) + elif USE_AMDSMI: + return amdsmi.amdsmi_dev_get_temp_metric( + self.handle, + sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, + metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + else: + raise 
Exception("No GPU interface available") def _get_power_usage(self): """Returns power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87 """ - return pynvml.nvmlDeviceGetPowerUsage(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetPowerUsage(self.handle) + elif USE_AMDSMI: + # returns power in Watts (amd-smi metric --power) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] + * 1000 + ) + else: + raise Exception("No GPU interface available") def _get_power_limit(self): """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ try: - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + elif USE_AMDSMI: + # returns power limit in Watts (amd-smi static --limit) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + ) except Exception: return None @@ -139,51 +211,100 @@ def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t """ - return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu + if USE_PYNVML: + return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu + elif USE_AMDSMI: + return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + else: + raise Exception("No GPU interface available") def _get_compute_mode(self): """Returns the compute mode of the GPU https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233 """ - return pynvml.nvmlDeviceGetComputeMode(self.handle) + if USE_PYNVML: + return pynvml.nvmlDeviceGetComputeMode(self.handle) + elif USE_AMDSMI: + return None + else: 
+ raise Exception("No GPU interface available") def _get_compute_processes(self): - """Returns the list of processes ids having a compute context on the - device with the memory used + """Returns the list of processes ids having a compute context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ try: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: + if USE_PYNVML: + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [ + {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes + ] + elif USE_AMDSMI: + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_info = [ + amdsmi.amdsmi_get_process_info(self.handle, p) + for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_info + ] + except Exception: return [] def _get_graphics_processes(self): - """Returns the list of processes ids having a graphics context on the - device with the memory used + """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ try: - processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) - - return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] - except pynvml.NVMLError: + if USE_PYNVML: + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [ + {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes + ] + elif USE_AMDSMI: + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_info = [ + amdsmi.amdsmi_get_process_info(self.handle, p) + for p in 
processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_info + if p["engine_usage"]["gfx"] > 0 + ] + except Exception: return [] class AllGPUDevices: def __init__(self): if is_gpu_details_available(): - logger.debug("GPU available. Starting setup") - self.device_count = pynvml.nvmlDeviceGetCount() + if USE_PYNVML: + logger.debug("Nvidia GPU available. Starting setup") + pynvml.nvmlInit() + self.device_count = pynvml.nvmlDeviceGetCount() + elif USE_AMDSMI: + logger.debug("AMD GPU available. Starting setup") + amdsmi.amdsmi_init() + self.device_count = len(amdsmi.amdsmi_get_device_handles()) + else: + logger.error("No GPU interface available") + self.device_count = 0 else: logger.error("There is no GPU available") self.device_count = 0 self.devices = [] for i in range(self.device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) + if USE_PYNVML: + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpu_device = GPUDevice(handle=handle, gpu_index=i) + elif USE_AMDSMI: + handle = amdsmi.amdsmi_get_device_handles()[i] + gpu_device = GPUDevice(handle=handle, gpu_index=i) + else: + raise Exception("No GPU interface available") + self.devices.append(gpu_device) def get_gpu_static_info(self): @@ -206,7 +327,7 @@ def get_gpu_static_info(self): devices_static_info.append(gpu_device.get_static_details()) return devices_static_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu static info", exc_info=True) return [] @@ -238,7 +359,7 @@ def get_gpu_details(self): devices_info.append(gpu_device.get_gpu_details()) return devices_info - except pynvml.NVMLError: + except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] @@ -261,7 +382,7 @@ def get_delta(self, last_duration: Time): devices_info.append(gpu_device.delta(last_duration)) return devices_info - except pynvml.NVMLError: + except 
Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] @@ -269,8 +390,14 @@ def get_delta(self, last_duration: Time): def is_gpu_details_available(): """Returns True if the GPU details are available.""" try: - pynvml.nvmlInit() - return True + if USE_PYNVML: + pynvml.nvmlInit() + return True + elif USE_AMDSMI: + amdsmi.amdsmi_init() + return True + else: + return False - except pynvml.NVMLError: + except Exception: return False diff --git a/codecarbon/core/util.py b/codecarbon/core/util.py index 7bf66edb3..ef1d7b81b 100644 --- a/codecarbon/core/util.py +++ b/codecarbon/core/util.py @@ -117,3 +117,23 @@ def count_cpus() -> int: num_cpus = num_cpus_matches[0].replace("NumCPUs=", "") logger.debug(f"Detected {num_cpus} cpus available on SLURM.") return int(num_cpus) + + +def is_amd_system(): + """Returns True if the system has an amd-smi interface.""" + try: + # Check if amd-smi is available + subprocess.check_output(["amd-smi", "--help"]) + return True + except subprocess.CalledProcessError: + return False + + +def is_nvidia_system(): + """Returns True if the system has an nvidia-smi interface.""" + try: + # Check if nvidia-smi is available + subprocess.check_output(["nvidia-smi", "--help"]) + return True + except Exception: + return False diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index b16bbf0de..249dd8e9c 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -18,7 +18,7 @@ from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids from codecarbon.core.emissions import Emissions from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import count_cpus, suppress +from codecarbon.core.util import count_cpus, is_amd_system, is_nvidia_system, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata from codecarbon.external.hardware import CPU, GPU, RAM from codecarbon.external.logger import logger, 
set_logger_format, set_logger_level @@ -280,7 +280,10 @@ def __init__( # Hardware detection logger.info("[setup] GPU Tracking...") if gpu.is_gpu_details_available(): - logger.info("Tracking Nvidia GPU via pynvml") + if is_nvidia_system(): + logger.info("Tracking Nvidia GPU via pynvml") + elif is_amd_system(): + logger.info("Tracking AMD GPU via amdsmi") gpu_devices = GPU.from_utils(self._gpu_ids) self._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] From fc93306ee9d8d14265b71efe75b748d4cb91d264 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 09:46:50 +0000 Subject: [PATCH 02/10] fix energy unit --- codecarbon/core/gpu.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 97158c4c4..fe0edc8e7 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -108,8 +108,12 @@ def _get_total_energy_consumption(self): if USE_PYNVML: return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) elif USE_AMDSMI: - # returns energy in microjoules (amd-smi metric --energy) - return amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + # returns energy in "Energy Status Units" which is equivalent to 15.3 microjoules (amd-smi metric --energy) + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] + * 15.3 + / 1000 + ) else: raise Exception("No GPU interface available") From 0626e4b6a4753ff3ee9b6c684010a94702110444 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Tue, 16 Jan 2024 10:04:52 +0000 Subject: [PATCH 03/10] use counter_resolution instead of hard coding it --- codecarbon/core/gpu.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index fe0edc8e7..6fe632a2c 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -108,12 +108,9 @@ def _get_total_energy_consumption(self): if 
USE_PYNVML: return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) elif USE_AMDSMI: - # returns energy in "Energy Status Units" which is equivalent to 15.3 microjoules (amd-smi metric --energy) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["energy_accumulator"] - * 15.3 - / 1000 - ) + # returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules + energy = amdsmi.amdsmi_dev_get_energy_count(self.handle) + return energy["power"] * energy["counter_resolution"] / 1000 else: raise Exception("No GPU interface available") From 37f07ecf9b4ac5781389747fd0a2a92d35ce1197 Mon Sep 17 00:00:00 2001 From: benoit-cty <4-benoit-cty@users.noreply.git.leximpact.dev> Date: Fri, 26 Jan 2024 12:40:28 +0100 Subject: [PATCH 04/10] wip : handle AMD and Nvidia at the same time --- codecarbon/core/gpu.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 6fe632a2c..a69c98086 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -279,34 +279,35 @@ def _get_graphics_processes(self): class AllGPUDevices: + devices = [] + device_count:int = 0 + def __init__(self): + self.devices = [] if is_gpu_details_available(): if USE_PYNVML: logger.debug("Nvidia GPU available. Starting setup") pynvml.nvmlInit() self.device_count = pynvml.nvmlDeviceGetCount() - elif USE_AMDSMI: + for i in range(self.device_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + gpu_device = GPUDevice(handle=handle, gpu_index=i) + self.devices.append(gpu_device) + if USE_AMDSMI: logger.debug("AMD GPU available. 
Starting setup") amdsmi.amdsmi_init() self.device_count = len(amdsmi.amdsmi_get_device_handles()) + for i in range(self.device_count): + handle = amdsmi.amdsmi_get_device_handles()[i] + gpu_device = GPUDevice(handle=handle, gpu_index=i) + self.devices.append(gpu_device) else: logger.error("No GPU interface available") - self.device_count = 0 else: logger.error("There is no GPU available") - self.device_count = 0 - self.devices = [] - for i in range(self.device_count): - if USE_PYNVML: - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) - elif USE_AMDSMI: - handle = amdsmi.amdsmi_get_device_handles()[i] - gpu_device = GPUDevice(handle=handle, gpu_index=i) - else: - raise Exception("No GPU interface available") + self.device_count = len(self.devices) - self.devices.append(gpu_device) + def get_gpu_static_info(self): """Get all GPUs static information. From 0002c2e57681ac3afec8b8c09f32249434deb735 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 29 Jan 2024 14:36:50 +0000 Subject: [PATCH 05/10] added support for amd and nvidia at the same time --- codecarbon/core/gpu.py | 335 +++++++++++++++----------------- codecarbon/core/util.py | 20 -- codecarbon/emissions_tracker.py | 13 +- 3 files changed, 169 insertions(+), 199 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index a69c98086..c846badf9 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,32 +1,66 @@ +import subprocess +from typing import List, Any from collections import namedtuple from dataclasses import dataclass, field + from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import is_amd_system, is_nvidia_system from codecarbon.external.logger import logger -USE_AMDSMI = False -USE_PYNVML = False -if is_nvidia_system(): +def is_rocm_system(): + """Returns True if the system has an rocm-smi interface.""" + try: + # Check if rocm-smi is available + 
subprocess.check_output(["rocm-smi", "--help"]) + return True + except subprocess.CalledProcessError: + return False + + +def is_nvidia_system(): + """Returns True if the system has an nvidia-smi interface.""" + try: + # Check if nvidia-smi is available + subprocess.check_output(["nvidia-smi", "--help"]) + return True + except Exception: + return False + + +try: import pynvml - USE_PYNVML = True + PYNVML_AVAILABLE = True +except ImportError: + if is_nvidia_system(): + logger.warning( + "Nvidia GPU detected but pynvml is not available. " + "Please install pynvml to get GPU metrics." + ) + PYNVML_AVAILABLE = False -if is_amd_system(): +try: import amdsmi - USE_AMDSMI = True + AMDSMI_AVAILABLE = True +except ImportError: + if is_rocm_system(): + logger.warning( + "AMD GPU detected but amdsmi is not available. " + "Please install amdsmi to get GPU metrics." + ) + AMDSMI_AVAILABLE = False @dataclass class GPUDevice: - handle: any + handle: Any gpu_index: int - # Energy consumed in kWh - energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Power based on reading power: Power = field(default_factory=lambda: Power(0)) + # Energy consumed in kWh + energy_delta: Energy = field(default_factory=lambda: Energy(0)) # Last energy reading in kWh last_energy: Energy = field(default_factory=lambda: Energy(0)) @@ -101,213 +135,184 @@ def _to_utf8(self, str_or_bytes): return str_or_bytes + +@dataclass +class NvidiaGPUDevice(GPUDevice): def _get_total_energy_consumption(self): """Returns total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g732ab899b5bd18ac4bfb93c02de4900a """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) - elif USE_AMDSMI: - # returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules - energy = amdsmi.amdsmi_dev_get_energy_count(self.handle) - return 
energy["power"] * energy["counter_resolution"] / 1000 - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle) def _get_gpu_name(self): """Returns the name of the GPU device https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1ga5361803e044c6fdf3b08523fb6d1481 """ - if USE_PYNVML: - name = pynvml.nvmlDeviceGetName(self.handle) - elif USE_AMDSMI: - name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] - else: - raise Exception("No GPU interface available") - + name = pynvml.nvmlDeviceGetName(self.handle) return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654 """ - if USE_PYNVML: - uuid = pynvml.nvmlDeviceGetUUID(self.handle) - elif USE_AMDSMI: - uuid = amdsmi.amdsmi_get_device_uuid(self.handle) - else: - raise Exception("No GPU interface available") - + uuid = pynvml.nvmlDeviceGetUUID(self.handle) return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g2dfeb1db82aa1de91aa6edf941c85ca8 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetMemoryInfo(self.handle) - elif USE_AMDSMI: - # returns memory in megabytes (amd-smi metric --mem-usage) - memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) - AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) - return AMDMemory( - total=memory_info["vram_total"] * 1024 * 1024, - used=memory_info["vram_used"] * 1024 * 1024, - free=(memory_info["vram_total"] - memory_info["vram_used"]) - * 1024 - * 1024, - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetMemoryInfo(self.handle) def _get_temperature(self): """Returns degrees in the Celsius scale 
https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetTemperature( - self.handle, - sensor=pynvml.NVML_TEMPERATURE_GPU, - ) - elif USE_AMDSMI: - return amdsmi.amdsmi_dev_get_temp_metric( - self.handle, - sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, - metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetTemperature( + self.handle, sensor=pynvml.NVML_TEMPERATURE_GPU + ) def _get_power_usage(self): """Returns power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7ef7dff0ff14238d08a19ad7fb23fc87 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetPowerUsage(self.handle) - elif USE_AMDSMI: - # returns power in Watts (amd-smi metric --power) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] - * 1000 - ) - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetPowerUsage(self.handle) def _get_power_limit(self): """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ - try: - if USE_PYNVML: - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) - elif USE_AMDSMI: - # returns power limit in Watts (amd-smi static --limit) - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 - ) - except Exception: - return None + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html#structnvmlUtilization__t """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu - elif USE_AMDSMI: - return 
amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetUtilizationRates(self.handle).gpu def _get_compute_mode(self): """Returns the compute mode of the GPU https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceEnumvs.html#group__nvmlDeviceEnumvs_1gbed1b88f2e3ba39070d31d1db4340233 """ - if USE_PYNVML: - return pynvml.nvmlDeviceGetComputeMode(self.handle) - elif USE_AMDSMI: - return None - else: - raise Exception("No GPU interface available") + return pynvml.nvmlDeviceGetComputeMode(self.handle) def _get_compute_processes(self): """Returns the list of processes ids having a compute context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g46ceaea624d5c96e098e03c453419d68 """ - try: - if USE_PYNVML: - processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) - return [ - {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes - ] - elif USE_AMDSMI: - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_info = [ - amdsmi.amdsmi_get_process_info(self.handle, p) - for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_info - ] - except Exception: - return [] + processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] def _get_graphics_processes(self): """Returns the list of processes ids having a graphics context on the device with the memory used https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e """ - try: - if USE_PYNVML: - processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) - return [ - {"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes - ] - elif USE_AMDSMI: - processes_handles = 
amdsmi.amdsmi_get_process_list(self.handle) - processes_info = [ - amdsmi.amdsmi_get_process_info(self.handle, p) - for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_info - if p["engine_usage"]["gfx"] > 0 - ] - except Exception: - return [] + processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.handle) + return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes] + + +class AMDGPUDevice(GPUDevice): + def _get_total_energy_consumption(self): + """Returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules""" + energy_count = amdsmi.amdsmi_dev_get_energy_count(self.handle) + energy = energy_count["power"] * energy_count["counter_resolution"] / 1000 + return energy + + def _get_gpu_name(self): + """Returns the name of the GPU device""" + name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + return self._to_utf8(name) + + def _get_uuid(self): + """Returns the globally unique GPU device UUID""" + uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + return self._to_utf8(uuid) + + def _get_memory_info(self): + """Returns memory info in bytes""" + memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + return AMDMemory( + total=memory_info["vram_total"] * 1024 * 1024, + used=memory_info["vram_used"] * 1024 * 1024, + free=(memory_info["vram_total"] - memory_info["vram_used"]) * 1024 * 1024, + ) + + def _get_temperature(self): + """Returns degrees in the Celsius scale""" + return amdsmi.amdsmi_dev_get_temp_metric( + self.handle, + sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, + metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, + ) + + def _get_power_usage(self): + """Returns power usage in milliwatts""" + return ( + amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] * 1000 + ) + + def _get_power_limit(self): + """Returns max power usage in 
milliwatts""" + return amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + + def _get_gpu_utilization(self): + """Returns the % of utilization of the kernels during the last sample""" + return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + + def _get_compute_mode(self): + """Returns the compute mode of the GPU""" + return None + + def _get_compute_processes(self): + """Returns the list of processes ids having a compute context on the device with the memory used""" + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_infos = [ + amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_mem"]} + for p in processes_infos + ] + + def _get_graphics_processes(self): + """Returns the list of processes ids having a graphics context on the device with the memory used""" + processes_handles = amdsmi.amdsmi_get_process_list(self.handle) + processes_infos = [ + amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles + ] + return [ + {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} + for p in processes_infos + if p["engine_usage"]["gfx"] > 0 + ] class AllGPUDevices: - devices = [] - device_count:int = 0 - + device_count: int + devices: List[GPUDevice] + def __init__(self): self.devices = [] - if is_gpu_details_available(): - if USE_PYNVML: - logger.debug("Nvidia GPU available. Starting setup") - pynvml.nvmlInit() - self.device_count = pynvml.nvmlDeviceGetCount() - for i in range(self.device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(i) - gpu_device = GPUDevice(handle=handle, gpu_index=i) - self.devices.append(gpu_device) - if USE_AMDSMI: - logger.debug("AMD GPU available. 
Starting setup") - amdsmi.amdsmi_init() - self.device_count = len(amdsmi.amdsmi_get_device_handles()) - for i in range(self.device_count): - handle = amdsmi.amdsmi_get_device_handles()[i] - gpu_device = GPUDevice(handle=handle, gpu_index=i) - self.devices.append(gpu_device) - else: - logger.error("No GPU interface available") - else: - logger.error("There is no GPU available") - self.device_count = len(self.devices) - + if is_nvidia_system() and PYNVML_AVAILABLE: + logger.debug("PyNVML available. Starting setup") + pynvml.nvmlInit() + nvidia_devices_count = pynvml.nvmlDeviceGetCount() + for i in range(nvidia_devices_count): + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i) + self.devices.append(nvidia_gpu_device) + + if is_rocm_system() and AMDSMI_AVAILABLE: + logger.debug("AMDSMI available. Starting setup") + amdsmi.amdsmi_init() + amd_devices_handles = amdsmi.amdsmi_get_device_handles() + for i, handle in enumerate(amd_devices_handles): + amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) + self.devices.append(amd_gpu_device) + + self.device_count = len(self.devices) def get_gpu_static_info(self): """Get all GPUs static information. 
@@ -357,7 +362,7 @@ def get_gpu_details(self): try: devices_info = [] for i in range(self.device_count): - gpu_device: GPUDevice = self.devices[i] + gpu_device = self.devices[i] devices_info.append(gpu_device.get_gpu_details()) return devices_info @@ -380,26 +385,10 @@ def get_delta(self, last_duration: Time): try: devices_info = [] for i in range(self.device_count): - gpu_device: GPUDevice = self.devices[i] + gpu_device = self.devices[i] devices_info.append(gpu_device.delta(last_duration)) return devices_info except Exception: logger.warning("Failed to retrieve gpu information", exc_info=True) return [] - - -def is_gpu_details_available(): - """Returns True if the GPU details are available.""" - try: - if USE_PYNVML: - pynvml.nvmlInit() - return True - elif USE_AMDSMI: - amdsmi.amdsmi_init() - return True - else: - return False - - except Exception: - return False diff --git a/codecarbon/core/util.py b/codecarbon/core/util.py index ef1d7b81b..7bf66edb3 100644 --- a/codecarbon/core/util.py +++ b/codecarbon/core/util.py @@ -117,23 +117,3 @@ def count_cpus() -> int: num_cpus = num_cpus_matches[0].replace("NumCPUs=", "") logger.debug(f"Detected {num_cpus} cpus available on SLURM.") return int(num_cpus) - - -def is_amd_system(): - """Returns True if the system has an amd-smi interface.""" - try: - # Check if amd-smi is available - subprocess.check_output(["amd-smi", "--help"]) - return True - except subprocess.CalledProcessError: - return False - - -def is_nvidia_system(): - """Returns True if the system has an nvidia-smi interface.""" - try: - # Check if nvidia-smi is available - subprocess.check_output(["nvidia-smi", "--help"]) - return True - except Exception: - return False diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index 249dd8e9c..cb8469e37 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -18,7 +18,7 @@ from codecarbon.core.config import get_hierarchical_config, parse_gpu_ids from 
codecarbon.core.emissions import Emissions from codecarbon.core.units import Energy, Power, Time -from codecarbon.core.util import count_cpus, is_amd_system, is_nvidia_system, suppress +from codecarbon.core.util import count_cpus, suppress from codecarbon.external.geography import CloudMetadata, GeoMetadata from codecarbon.external.hardware import CPU, GPU, RAM from codecarbon.external.logger import logger, set_logger_format, set_logger_level @@ -279,11 +279,12 @@ def __init__( # Hardware detection logger.info("[setup] GPU Tracking...") - if gpu.is_gpu_details_available(): - if is_nvidia_system(): - logger.info("Tracking Nvidia GPU via pynvml") - elif is_amd_system(): - logger.info("Tracking AMD GPU via amdsmi") + if gpu.is_nvidia_system() or gpu.is_rocm_system(): + if gpu.is_nvidia_system(): + logger.info("Tracking Nvidia GPUs via PyNVML") + elif gpu.is_rocm_system(): + logger.info("Tracking AMD GPUs via AMDSMI") + gpu_devices = GPU.from_utils(self._gpu_ids) self._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] From c498440ae8237083f930af3c4d1af73018332a59 Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 15:01:45 +0100 Subject: [PATCH 06/10] Fix merge conflict --- codecarbon/core/gpu.py | 19 +++++---- codecarbon/core/resource_tracker.py | 28 +++++++------ codecarbon/emissions_tracker.py | 65 ----------------------------- 3 files changed, 26 insertions(+), 86 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 04cadcd01..d764142c3 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -1,10 +1,8 @@ import subprocess -from typing import List, Any from collections import namedtuple from dataclasses import dataclass, field from typing import Any, Dict, List, Union - from codecarbon.core.units import Energy, Power, Time from codecarbon.external.logger import logger @@ -15,7 +13,7 @@ def is_rocm_system(): # Check if rocm-smi is available 
subprocess.check_output(["rocm-smi", "--help"]) return True - except subprocess.CalledProcessError: + except (subprocess.CalledProcessError, OSError): return False @@ -201,9 +199,7 @@ def _get_temperature(self) -> int: """Returns degrees in the Celsius scale https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g92d1c5182a14dd4be7090e3c1480b121 """ - return pynvml.nvmlDeviceGetTemperature( - self.handle, sensor=pynvml.NVML_TEMPERATURE_GPU - ) + return pynvml.nvmlDeviceGetTemperature(self.handle, pynvml.NVML_TEMPERATURE_GPU) def _get_power_usage(self) -> int: """Returns power usage in milliwatts @@ -215,7 +211,11 @@ def _get_power_limit(self) -> Union[int, None]: """Returns max power usage in milliwatts https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g263b5bf552d5ec7fcd29a088264d10ad """ - return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + try: + return pynvml.nvmlDeviceGetEnforcedPowerLimit(self.handle) + except Exception: + logger.warning("Failed to retrieve gpu power limit", exc_info=True) + return None def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample @@ -328,7 +328,8 @@ class AllGPUDevices: devices: List[GPUDevice] def __init__(self) -> None: - if is_gpu_details_available(): + gpu_details_available = is_gpu_details_available() + if gpu_details_available: logger.debug("GPU available. Starting setup") self.device_count = pynvml.nvmlDeviceGetCount() else: @@ -336,7 +337,7 @@ def __init__(self) -> None: self.device_count = 0 self.devices = [] - if is_nvidia_system() and PYNVML_AVAILABLE: + if gpu_details_available and PYNVML_AVAILABLE: logger.debug("PyNVML available. 
Starting setup") pynvml.nvmlInit() nvidia_devices_count = pynvml.nvmlDeviceGetCount() diff --git a/codecarbon/core/resource_tracker.py b/codecarbon/core/resource_tracker.py index 120faf4ef..cbf61de18 100644 --- a/codecarbon/core/resource_tracker.py +++ b/codecarbon/core/resource_tracker.py @@ -209,14 +209,20 @@ def set_CPU_tracking(self): def set_GPU_tracking(self): logger.info("[setup] GPU Tracking...") - if self.tracker._gpu_ids: + if isinstance(self.tracker._gpu_ids, str): self.tracker._gpu_ids = parse_gpu_ids(self.tracker._gpu_ids) - if self.tracker._gpu_ids: - self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids - self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids) - - if gpu.is_gpu_details_available(): - logger.info("Tracking Nvidia GPU via pynvml") + self.tracker._conf["gpu_ids"] = self.tracker._gpu_ids + self.tracker._conf["gpu_count"] = len(self.tracker._gpu_ids) + + is_nvidia = gpu.is_nvidia_system() + is_rocm = gpu.is_rocm_system() + if is_nvidia or is_rocm: + if is_nvidia: + logger.info("Tracking Nvidia GPUs via PyNVML") + self.gpu_tracker = "pynvml" + else: + logger.info("Tracking AMD GPUs via AMDSMI") + self.gpu_tracker = "amdsmi" gpu_devices = GPU.from_utils(self.tracker._gpu_ids) self.tracker._hardware.append(gpu_devices) gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] @@ -224,11 +230,9 @@ def set_GPU_tracking(self): self.tracker._conf["gpu_model"] = "".join( [f"{i} x {name}" for name, i in gpu_names_dict.items()] ) - if self.tracker._conf.get("gpu_count") is None: - self.tracker._conf["gpu_count"] = len( - gpu_devices.devices.get_gpu_static_info() - ) - self.gpu_tracker = "pynvml" + self.tracker._conf["gpu_count"] = len( + gpu_devices.devices.get_gpu_static_info() + ) else: logger.info("No GPU found.") diff --git a/codecarbon/emissions_tracker.py b/codecarbon/emissions_tracker.py index f06c855a1..3fdd4f1b8 100644 --- a/codecarbon/emissions_tracker.py +++ b/codecarbon/emissions_tracker.py @@ -369,71 +369,6 @@ def 
__init__( self._tasks: Dict[str, Task] = {} self._active_task: Optional[str] = None self._active_task_emissions_at_start: Optional[EmissionsData] = None -# TODO: move this in ResourceTracker() -# if isinstance(self._gpu_ids, str): -# self._gpu_ids: List[int] = parse_gpu_ids(self._gpu_ids) -# self._conf["gpu_ids"] = self._gpu_ids -# self._conf["gpu_count"] = len(self._gpu_ids) - -# logger.info("[setup] RAM Tracking...") -# ram = RAM(tracking_mode=self._tracking_mode) -# self._conf["ram_total_size"] = ram.machine_memory_GB -# self._hardware: List[Union[RAM, CPU, GPU]] = [ram] - -# # Hardware detection -# logger.info("[setup] GPU Tracking...") -# if gpu.is_nvidia_system() or gpu.is_rocm_system(): -# if gpu.is_nvidia_system(): -# logger.info("Tracking Nvidia GPUs via PyNVML") -# elif gpu.is_rocm_system(): -# logger.info("Tracking AMD GPUs via AMDSMI") - -# gpu_devices = GPU.from_utils(self._gpu_ids) -# self._hardware.append(gpu_devices) -# gpu_names = [n["name"] for n in gpu_devices.devices.get_gpu_static_info()] -# gpu_names_dict = Counter(gpu_names) -# self._conf["gpu_model"] = "".join( -# [f"{i} x {name}" for name, i in gpu_names_dict.items()] -# ) -# self._conf["gpu_count"] = len(gpu_devices.devices.get_gpu_static_info()) -# else: -# logger.info("No GPU found.") - -# logger.info("[setup] CPU Tracking...") -# if cpu.is_powergadget_available(): -# logger.info("Tracking Intel CPU via Power Gadget") -# hardware = CPU.from_utils(self._output_dir, "intel_power_gadget") -# self._hardware.append(hardware) -# self._conf["cpu_model"] = hardware.get_model() -# elif cpu.is_rapl_available(): -# logger.info("Tracking Intel CPU via RAPL interface") -# hardware = CPU.from_utils(self._output_dir, "intel_rapl") -# self._hardware.append(hardware) -# self._conf["cpu_model"] = hardware.get_model() -# else: -# logger.warning( -# "No CPU tracking mode found. Falling back on CPU constant mode." 
-# ) -# tdp = cpu.TDP() -# power = tdp.tdp -# model = tdp.model -# if (power is None) and self._default_cpu_power: -# # We haven't been able to calculate CPU power but user has input a default one. We use it -# user_input_power = self._default_cpu_power -# logger.debug(f"Using user input TDP: {user_input_power} W") -# power = user_input_power -# logger.info(f"CPU Model on constant consumption mode: {model}") -# self._conf["cpu_model"] = model -# if tdp: -# hardware = CPU.from_utils(self._output_dir, "constant", model, power) -# self._hardware.append(hardware) -# else: -# logger.warning( -# "Failed to match CPU TDP constant. " -# + "Falling back on a global constant." -# ) -# hardware = CPU.from_utils(self._output_dir, "constant") -# self._hardware.append(hardware) # Tracking mode detection self._hardware = [] resource_tracker = ResourceTracker(self) From 097e674ac274608c650650d86d92c968c251d96b Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 16:30:29 +0100 Subject: [PATCH 07/10] Upgrade AMDSMI entries --- codecarbon/core/gpu.py | 94 +++++++++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index d764142c3..28ddfa0a7 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -50,6 +50,15 @@ def is_nvidia_system(): "Please install amdsmi to get GPU metrics." ) AMDSMI_AVAILABLE = False +except AttributeError as e: + # In some environments, amdsmi may be present but not properly configured, leading to AttributeError when importing + logger.warning( + "AMD GPU detected but amdsmi is not properly configured. " + "Please ensure amdsmi is correctly installed to get GPU metrics." + "Tips : check consistency between Python amdsmi package and ROCm versions, and ensure AMD drivers are up to date." 
+ f" Error: {e}" + ) + AMDSMI_AVAILABLE = False @dataclass @@ -248,52 +257,72 @@ def _get_graphics_processes(self) -> List: class AMDGPUDevice(GPUDevice): def _get_total_energy_consumption(self): - """Returns energy in "Energy Status Units" which is equivalent to around 15.3 microjoules""" - energy_count = amdsmi.amdsmi_dev_get_energy_count(self.handle) + """Returns energy in millijoules. Energy Status Units is equivalent to around 15.3 microjoules.""" + energy_count = amdsmi.amdsmi_get_energy_count(self.handle) + # energy_count contains 'power' and 'counter_resolution' + # Result is in uJ (microjoules), convert to mJ energy = energy_count["power"] * energy_count["counter_resolution"] / 1000 return energy def _get_gpu_name(self): """Returns the name of the GPU device""" - name = amdsmi.amdsmi_get_board_info(self.handle)["manufacturer_name"] + try: + asic_info = amdsmi.amdsmi_get_gpu_asic_info(self.handle) + name = asic_info.get("market_name", "Unknown GPU") + except Exception: + name = "Unknown GPU" return self._to_utf8(name) def _get_uuid(self): """Returns the globally unique GPU device UUID""" - uuid = amdsmi.amdsmi_get_device_uuid(self.handle) + uuid = amdsmi.amdsmi_get_gpu_device_uuid(self.handle) return self._to_utf8(uuid) def _get_memory_info(self): """Returns memory info in bytes""" - memory_info = amdsmi.amdsmi_get_vram_usage(self.handle) + memory_info = amdsmi.amdsmi_get_gpu_vram_usage(self.handle) AMDMemory = namedtuple("AMDMemory", ["total", "used", "free"]) + # vram_total and vram_used are already in MB + total_mb = memory_info["vram_total"] + used_mb = memory_info["vram_used"] return AMDMemory( - total=memory_info["vram_total"] * 1024 * 1024, - used=memory_info["vram_used"] * 1024 * 1024, - free=(memory_info["vram_total"] - memory_info["vram_used"]) * 1024 * 1024, + total=total_mb * 1024 * 1024, + used=used_mb * 1024 * 1024, + free=(total_mb - used_mb) * 1024 * 1024, ) def _get_temperature(self): - """Returns degrees in the Celsius scale""" - return 
amdsmi.amdsmi_dev_get_temp_metric( + """Returns degrees in the Celsius scale. Returns temperature in millidegrees Celsius.""" + # amdsmi_get_temp_metric returns temperature in millidegrees Celsius + temp_milli_celsius = amdsmi.amdsmi_get_temp_metric( self.handle, sensor_type=amdsmi.AmdSmiTemperatureType.EDGE, metric=amdsmi.AmdSmiTemperatureMetric.CURRENT, ) + # Convert from millidegrees to degrees + return temp_milli_celsius // 1000 def _get_power_usage(self): """Returns power usage in milliwatts""" - return ( - amdsmi.amdsmi_get_power_measure(self.handle)["average_socket_power"] * 1000 - ) + # amdsmi_get_power_info returns power in watts, convert to milliwatts + power_info = amdsmi.amdsmi_get_power_info(self.handle) + return int(power_info["average_socket_power"] * 1000) def _get_power_limit(self): """Returns max power usage in milliwatts""" - return amdsmi.amdsmi_get_power_measure(self.handle)["power_limit"] * 1000 + # Get power cap info which contains power_cap in uW (microwatts) + try: + power_cap_info = amdsmi.amdsmi_get_power_cap_info(self.handle) + # power_cap is in uW, convert to mW + return int(power_cap_info["power_cap"] / 1000) + except Exception: + logger.warning("Failed to retrieve gpu power cap", exc_info=True) + return None def _get_gpu_utilization(self): """Returns the % of utilization of the kernels during the last sample""" - return amdsmi.amdsmi_get_gpu_activity(self.handle)["gfx_activity"] + activity = amdsmi.amdsmi_get_gpu_activity(self.handle) + return activity["gfx_activity"] def _get_compute_mode(self): """Returns the compute mode of the GPU""" @@ -301,26 +330,25 @@ def _get_compute_mode(self): def _get_compute_processes(self): """Returns the list of processes ids having a compute context on the device with the memory used""" - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_infos = [ - amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": 
p["memory_usage"]["vram_mem"]} - for p in processes_infos - ] + try: + processes = amdsmi.amdsmi_get_gpu_process_list(self.handle) + return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes] + except Exception: + logger.warning("Failed to retrieve gpu compute processes", exc_info=True) + return [] def _get_graphics_processes(self): """Returns the list of processes ids having a graphics context on the device with the memory used""" - processes_handles = amdsmi.amdsmi_get_process_list(self.handle) - processes_infos = [ - amdsmi.amdsmi_get_process_info(self.handle, p) for p in processes_handles - ] - return [ - {"pid": p["pid"], "used_memory": p["memory_usage"]["vram_usage"]} - for p in processes_infos - if p["engine_usage"]["gfx"] > 0 - ] + try: + processes = amdsmi.amdsmi_get_gpu_process_list(self.handle) + return [ + {"pid": p["pid"], "used_memory": p["mem"]} + for p in processes + if p["engine_usage"].get("gfx", 0) > 0 + ] + except Exception: + logger.warning("Failed to retrieve gpu graphics processes", exc_info=True) + return [] class AllGPUDevices: @@ -349,7 +377,7 @@ def __init__(self) -> None: if is_rocm_system() and AMDSMI_AVAILABLE: logger.debug("AMDSMI available. 
Starting setup") amdsmi.amdsmi_init() - amd_devices_handles = amdsmi.amdsmi_get_device_handles() + amd_devices_handles = amdsmi.amdsmi_get_processor_handles() for i, handle in enumerate(amd_devices_handles): amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) self.devices.append(amd_gpu_device) From a24d2462b90538b13c4e69ef951ac34aab990bba Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 16:44:43 +0100 Subject: [PATCH 08/10] Remove warning for amdsmi.amdsmi_get_gpu_process_list --- codecarbon/core/gpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 28ddfa0a7..854d978c9 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -334,7 +334,7 @@ def _get_compute_processes(self): processes = amdsmi.amdsmi_get_gpu_process_list(self.handle) return [{"pid": p["pid"], "used_memory": p["mem"]} for p in processes] except Exception: - logger.warning("Failed to retrieve gpu compute processes", exc_info=True) + # logger.warning("Failed to retrieve gpu compute processes", exc_info=True) return [] def _get_graphics_processes(self): @@ -347,7 +347,7 @@ def _get_graphics_processes(self): if p["engine_usage"].get("gfx", 0) > 0 ] except Exception: - logger.warning("Failed to retrieve gpu graphics processes", exc_info=True) + # logger.warning("Failed to retrieve gpu graphics processes", exc_info=True) return [] From dd178e809e774b2d2cf9b3f85eb14ed6f2ddb50e Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 17:00:48 +0100 Subject: [PATCH 09/10] Debug detection --- codecarbon/core/gpu.py | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 854d978c9..42a1d4572 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -30,6 +30,7 @@ def is_nvidia_system(): try: import pynvml + pynvml.nvmlInit() PYNVML_AVAILABLE = True except ImportError: if 
is_nvidia_system(): @@ -38,6 +39,13 @@ def is_nvidia_system(): "Please install pynvml to get GPU metrics." ) PYNVML_AVAILABLE = False +except Exception: + if is_nvidia_system(): + logger.warning( + "Nvidia GPU detected but pynvml initialization failed. " + "Please ensure NVIDIA drivers are properly installed." + ) + PYNVML_AVAILABLE = False try: import amdsmi @@ -365,7 +373,7 @@ def __init__(self) -> None: self.device_count = 0 self.devices = [] - if gpu_details_available and PYNVML_AVAILABLE: + if PYNVML_AVAILABLE: logger.debug("PyNVML available. Starting setup") pynvml.nvmlInit() nvidia_devices_count = pynvml.nvmlDeviceGetCount() @@ -374,14 +382,24 @@ def __init__(self) -> None: nvidia_gpu_device = NvidiaGPUDevice(handle=handle, gpu_index=i) self.devices.append(nvidia_gpu_device) - if is_rocm_system() and AMDSMI_AVAILABLE: + if AMDSMI_AVAILABLE: logger.debug("AMDSMI available. Starting setup") - amdsmi.amdsmi_init() - amd_devices_handles = amdsmi.amdsmi_get_device_handles() - for i, handle in enumerate(amd_devices_handles): - amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) - self.devices.append(amd_gpu_device) - + try: + amdsmi.amdsmi_init() + amd_devices_handles = amdsmi.amdsmi_get_processor_handles() + if len(amd_devices_handles) == 0: + print( + "No AMD GPUs found on machine with amdsmi_get_processor_handles() !" 
+ ) + else: + for i, handle in enumerate(amd_devices_handles): + logger.debug( + f"Found AMD GPU device with handle {handle} and index {i} : {amdsmi.amdsmi_get_gpu_device_uuid(handle)}" + ) + amd_gpu_device = AMDGPUDevice(handle=handle, gpu_index=i) + self.devices.append(amd_gpu_device) + except amdsmi.AmdSmiException as e: + logger.warning(f"Failed to initialize AMDSMI: {e}", exc_info=True) self.device_count = len(self.devices) def get_gpu_static_info(self) -> List: @@ -466,9 +484,4 @@ def get_delta(self, last_duration: Time) -> List: def is_gpu_details_available() -> bool: """Returns True if the GPU details are available.""" - try: - pynvml.nvmlInit() - return True - - except pynvml.NVMLError: - return False + return PYNVML_AVAILABLE or AMDSMI_AVAILABLE From 333afc48f3a93992a2542ffd5a642e9e28999be0 Mon Sep 17 00:00:00 2001 From: benoit-cty Date: Wed, 18 Feb 2026 17:03:40 +0100 Subject: [PATCH 10/10] Fix Uninitialized --- codecarbon/core/gpu.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/codecarbon/core/gpu.py b/codecarbon/core/gpu.py index 42a1d4572..f2bf16d9f 100644 --- a/codecarbon/core/gpu.py +++ b/codecarbon/core/gpu.py @@ -367,10 +367,8 @@ def __init__(self) -> None: gpu_details_available = is_gpu_details_available() if gpu_details_available: logger.debug("GPU available. Starting setup") - self.device_count = pynvml.nvmlDeviceGetCount() else: logger.error("There is no GPU available") - self.device_count = 0 self.devices = [] if PYNVML_AVAILABLE: