diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 8248d5acfb..d9bddcc4bc 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -1187,6 +1187,395 @@ class RUSD(_IntEnum): POLL_PCI = 0x20 # Enable RUSD polling on pci group POLL_FAN = 0x40 # Enable RUSD polling on fan group POLL_PROC_UTIL = 0x80 # Enable RUSD polling on process utilization group + POLL_ALL = 0xFFFFFFFFFFFFFFFF # Enable RUSD polling on all groups + + +class PowerMizerMode(_IntEnum): + POWER_MIZER_MODE_ADAPTIVE = 0 # Adjust GPU clocks based on GPU utilization + POWER_MIZER_MODE_PREFER_MAXIMUM_PERFORMANCE = 1 # Raise GPU clocks to favor maximum performance, to the extent that thermal and other constraints allow + POWER_MIZER_MODE_AUTO = 2 # PowerMizer mode is driver controlled + POWER_MIZER_MODE_PREFER_CONSISTENT_PERFORMANCE = 3 # lock to GPU base clocks + + +class DeviceArch(_IntEnum): + DEVICE_ARCH_KEPLER = 2 + DEVICE_ARCH_MAXWELL = 3 + DEVICE_ARCH_PASCAL = 4 + DEVICE_ARCH_VOLTA = 5 + DEVICE_ARCH_TURING = 6 + DEVICE_ARCH_AMPERE = 7 + DEVICE_ARCH_ADA = 8 + DEVICE_ARCH_HOPPER = 9 + DEVICE_ARCH_BLACKWELL = 10 + DEVICE_ARCH_UNKNOWN = 0xFFFFFFFF + + +class BusType(_IntEnum): + BUS_TYPE_UNKNOWN = 0 + BUS_TYPE_PCI = 1 + BUS_TYPE_PCIE = 2 + BUS_TYPE_FPCI = 3 + BUS_TYPE_AGP = 4 + + +class FanControlPolicy(_IntEnum): + FAN_CONTROL_POLICY_TEMPERATURE_CONTINUOUS_SW = 0 # Temperature-controlled fan policy + FAN_CONTROL_POLICY_MANUAL = 1 # Manual fan control policy + + +class PowerSource(_IntEnum): + POWER_SOURCE_AC = 0x00000000 + POWER_SOURCE_BATTERY = 0x00000001 + POWER_SOURCE_UNDERSIZED = 0x00000002 + + +class PcieLinkMaxSpeed(_IntEnum): + PCIE_LINK_MAX_SPEED_INVALID = 0x00000000 + PCIE_LINK_MAX_SPEED_2500MBPS = 0x00000001 + PCIE_LINK_MAX_SPEED_5000MBPS = 0x00000002 + PCIE_LINK_MAX_SPEED_8000MBPS = 0x00000003 + PCIE_LINK_MAX_SPEED_16000MBPS = 0x00000004 + PCIE_LINK_MAX_SPEED_32000MBPS = 0x00000005 + 
PCIE_LINK_MAX_SPEED_64000MBPS = 0x00000006 + + +class AdaptiveClockingInfoStatus(_IntEnum): + ADAPTIVE_CLOCKING_INFO_STATUS_DISABLED = 0x00000000 + ADAPTIVE_CLOCKING_INFO_STATUS_ENABLED = 0x00000001 + + +MAX_GPU_UTILIZATIONS = 8 + + +class PcieAtomicsCap(_IntEnum): + PCIE_ATOMICS_CAP_FETCHADD32 = 0x01 + PCIE_ATOMICS_CAP_FETCHADD64 = 0x02 + PCIE_ATOMICS_CAP_SWAP32 = 0x04 + PCIE_ATOMICS_CAP_SWAP64 = 0x08 + PCIE_ATOMICS_CAP_CAS32 = 0x10 + PCIE_ATOMICS_CAP_CAS64 = 0x20 + PCIE_ATOMICS_CAP_CAS128 = 0x40 + PCIE_ATOMICS_OPS_MAX = 7 + + +class PowerScope(_IntEnum): + POWER_SCOPE_GPU = 0 + POWER_SCOPE_MODULE = 1 + POWER_SCOPE_MEMORY = 2 + + +# Need "Enum" suffix to disambiguate from nvmlGridLicenseExpiry_t +class GridLicenseExpiryEnum(_IntEnum): + GRID_LICENSE_EXPIRY_NOT_AVAILABLE = 0 + GRID_LICENSE_EXPIRY_INVALID = 1 + GRID_LICENSE_EXPIRY_VALID = 2 + GRID_LICENSE_EXPIRY_NOT_APPLICABLE = 3 + GRID_LICENSE_EXPIRY_PERMANENT = 4 + + +GRID_LICENSE_FEATURE_MAX_COUNT = 3 + + +class VgpuVirtualizationCapMigration(_IntEnum): + VGPU_VIRTUALIZATION_CAP_MIGRATION_NO = 0x0 + VGPU_VIRTUALIZATION_CAP_MIGRATION_YES = 0x1 + + +class VgpuPgpuVirtualizationCapMigration(_IntEnum): + VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_NO = 0x0 + VGPU_PGPU_VIRTUALIZATION_CAP_MIGRATION_YES = 0x1 + + +class VgpuSchedulerPolicy(_IntEnum): + VGPU_SCHEDULER_POLICY_UNKNOWN = 0 + VGPU_SCHEDULER_POLICY_BEST_EFFORT = 1 + VGPU_SCHEDULER_POLICY_EQUAL_SHARE = 2 + VGPU_SCHEDULER_POLICY_FIXED_SHARE = 3 + SUPPORTED_VGPU_SCHEDULER_POLICY_COUNT = 3 + + +class VgpuSchedulerArr(_IntEnum): + VGPU_SCHEDULER_ARR_DEFAULT = 0 + VGPU_SCHEDULER_ARR_DISABLE = 1 + VGPU_SCHEDULER_ARR_ENABLE = 2 + + +class VgpuSchedulerEngineType(_IntEnum): + VGPU_SCHEDULER_ENGINE_TYPE_GRAPHICS = 1 + VGPU_SCHEDULER_ENGINE_TYPE_NVENC1 = 2 + + +class GridLicenseState(_IntEnum): + GRID_LICENSE_STATE_UNKNOWN = 0 + GRID_LICENSE_STATE_UNINITIALIZED = 1 + GRID_LICENSE_STATE_UNLICENSED_UNRESTRICTED = 2 + GRID_LICENSE_STATE_UNLICENSED_RESTRICTED = 3 + 
GRID_LICENSE_STATE_UNLICENSED = 4 + GRID_LICENSE_STATE_LICENSED = 5 + + +class NvlinkLowPowerThresholdUnit(_IntEnum): + NVLINK_LOW_POWER_THRESHOLD_UNIT_100US = 0x0 + NVLINK_LOW_POWER_THRESHOLD_UNIT_50US = 0x1 + + +class NvlinkPowerState(_IntEnum): + NVLINK_POWER_STATE_HIGH_SPEED = 0x0 + NVLINK_POWER_STATE_LOW_SPEED = 0x1 + + +NVLINK_LOW_POWER_THRESHOLD_MIN = 0x1 + + +class NvlinkLowPowerThreshold(_IntEnum): + NVLINK_LOW_POWER_THRESHOLD_MAX = 0x1FFF + NVLINK_LOW_POWER_THRESHOLD_RESET = 0xFFFFFFFF + NVLINK_LOW_POWER_THRESHOLD_DEFAULT = 0xFFFFFFFF + + +class C2CPowerState(_IntEnum): + C2C_POWER_STATE_FULL_POWER = 0 + C2C_POWER_STATE_LOW_POWER = 1 + + +class EventType(_IntEnum): + EVENT_TYPE_NONE = 0x0000000000000000 + EVENT_TYPE_SINGLE_BIT_ECC_ERROR = 0x0000000000000001 + EVENT_TYPE_DOUBLE_BIT_ECC_ERROR = 0x0000000000000002 + EVENT_TYPE_PSTATE = 0x0000000000000004 + EVENT_TYPE_XID_CRITICAL_ERROR = 0x0000000000000008 + EVENT_TYPE_CLOCK = 0x0000000000000010 + EVENT_TYPE_POWER_SOURCE_CHANGE = 0x0000000000000080 + EVENT_MIG_CONFIG_CHANGE = 0x0000000000000100 + EVENT_TYPE_SINGLE_BIT_ECC_ERROR_STORM = 0x0000000000000200 + EVENT_TYPE_DRAM_RETIREMENT_EVENT = 0x0000000000000400 + EVENT_TYPE_DRAM_RETIREMENT_FAILURE = 0x0000000000000800 + EVENT_TYPE_NON_FATAL_POISON_ERROR = 0x0000000000001000 + EVENT_TYPE_FATAL_POISON_ERROR = 0x0000000000002000 + EVENT_TYPE_GPU_UNAVAILABLE_ERROR = 0x0000000000004000 + EVENT_TYPE_GPU_RECOVERY_ACTION = 0x0000000000008000 + + +class SystemEventType(_IntEnum): + SYSTEM_EVENT_TYPE_GPU_DRIVER_UNBIND = 0x0000000000000001 + SYSTEM_EVENT_TYPE_GPU_DRIVER_BIND = 0x0000000000000002 + + +class ClocksEvent(_IntEnum): + CLOCKS_EVENT_REASON_GPU_IDLE = 0x0000000000000001 + CLOCKS_EVENT_REASON_APPLICATIONS_CLOCKS_SETTING = 0x0000000000000002 + CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS = 0x0000000000000002 + CLOCKS_EVENT_REASON_SW_POWER_CAP = 0x0000000000000004 + CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x0000000000000008 + CLOCKS_EVENT_REASON_SYNC_BOOST = 
0x0000000000000010 + CLOCKS_EVENT_REASON_SW_THERMAL_SLOWDOWN = 0x0000000000000020 + CLOCKS_THROTTLE_REASON_HW_THERMAL_SLOWDOWN = 0x0000000000000040 + CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE_SLOWDOWN = 0x0000000000000080 + CLOCKS_EVENT_REASON_DISPLAY_CLOCK_SETTING = 0x0000000000000100 + CLOCKS_EVENT_REASON_NONE = 0x0000000000000000 + CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x0000000000000001 + CLOCKS_THROTTLE_REASON_APPLICATIONS_CLOCKS_SETTING = 0x0000000000002 + CLOCKS_THROTTLE_REASON_SYNC_BOOST = 0x00000000000010 + CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x00000000000004 + CLOCKS_THROTTLE_REASON_SW_THERMAL_SLOWDOWN = 0x00000000000020 + CLOCKS_THROTTLE_REASON_DISPLAY_CLOCK_SETTING = 0x00000000000100 + CLOCKS_THROTTLE_REASON_NONE = 0x0000000000000000 + + +class EncoderQuery(_IntEnum): + ENCODER_QUERY_H264 = 0x00 + ENCODER_QUERY_HEVC = 0x01 + ENCODER_QUERY_AV1 = 0x02 + ENCODER_QUERY_UNKNOWN = 0xFF + + +class NvFBCSessionFlag(_IntEnum): + NVFBC_SESSION_FLAG_DIFFMAP_ENABLED = 0x00000001 + NVFBC_SESSION_FLAG_CLASSIFICATIONMAP_ENABLED = 0x00000002 + NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_NO_WAIT = 0x00000004 + NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_INFINITE = 0x00000008 + NVFBC_SESSION_FLAG_CAPTURE_WITH_WAIT_TIMEOUT = 0x00000010 + + +class CCSystemCpuCaps(_IntEnum): + CC_SYSTEM_CPU_CAPS_NONE = 0 + CC_SYSTEM_CPU_CAPS_AMD_SEV = 1 + CC_SYSTEM_CPU_CAPS_INTEL_TDX = 2 + CC_SYSTEM_CPU_CAPS_AMD_SEV_SNP = 3 + CC_SYSTEM_CPU_CAPS_AMD_SNP_VTOM = 4 + + +class CCSystemGpus(_IntEnum): + CC_SYSTEM_GPUS_CC_NOT_CAPABLE = 0 + CC_SYSTEM_GPUS_CC_CAPABLE = 1 + + +class CCSystemDevtoolsMode(_IntEnum): + CC_SYSTEM_DEVTOOLS_MODE_OFF = 0 + CC_SYSTEM_DEVTOOLS_MODE_ON = 1 + + +class CCSystemEnvironment(_IntEnum): + CC_SYSTEM_ENVIRONMENT_UNAVAILABLE = 0 + CC_SYSTEM_ENVIRONMENT_SIM = 1 + CC_SYSTEM_ENVIRONMENT_PROD = 2 + + +class CCSystemFeature(_IntEnum): + CC_SYSTEM_FEATURE_DISABLED = 0 + CC_SYSTEM_FEATURE_ENABLED = 1 + + +class CCSystemMultiGpu(_IntEnum): + CC_SYSTEM_MULTIGPU_NONE = 0 + 
CC_SYSTEM_MULTIGPU_PROTECTED_PCIE = 1 + CC_SYSTEM_MULTIGPU_NVLE = 2 + + +class CCAcceptingClientRequests(_IntEnum): + CC_ACCEPTING_CLIENT_REQUESTS_FALSE = 0 + CC_ACCEPTING_CLIENT_REQUESTS_TRUE = 1 + + +class GpuFabricState(_IntEnum): + GPU_FABRIC_STATE_NOT_SUPPORTED = 0 + GPU_FABRIC_STATE_NOT_STARTED = 1 + GPU_FABRIC_STATE_IN_PROGRESS = 2 + GPU_FABRIC_STATE_COMPLETED = 3 + + +class GpuFabricHealthMaskDegradedBw(_IntEnum): + GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_NOT_SUPPORTED = 0 + GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_TRUE = 1 + GPU_FABRIC_HEALTH_MASK_DEGRADED_BW_FALSE = 2 + GPU_FABRIC_HEALTH_MASK_SHIFT_DEGRADED_BW = 0 + GPU_FABRIC_HEALTH_MASK_WIDTH_DEGRADED_BW = 0x3 + + +class GpuFabricHealthMaskRouteRecovery(_IntEnum): + GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_NOT_SUPPORTED = 0 + GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_TRUE = 1 + GPU_FABRIC_HEALTH_MASK_ROUTE_RECOVERY_FALSE = 2 + GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_RECOVERY = 2 + GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_RECOVERY = 0x3 + + +class GpuFabricHealthMaskRouteUnhealthy(_IntEnum): + GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_NOT_SUPPORTED = 0 + GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_TRUE = 1 + GPU_FABRIC_HEALTH_MASK_ROUTE_UNHEALTHY_FALSE = 2 + GPU_FABRIC_HEALTH_MASK_SHIFT_ROUTE_UNHEALTHY = 4 + GPU_FABRIC_HEALTH_MASK_WIDTH_ROUTE_UNHEALTHY = 0x3 + + +class GpuFabricHealthMaskAccessTimeout(_IntEnum): + GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_NOT_SUPPORTED = 0 + GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_TRUE = 1 + GPU_FABRIC_HEALTH_MASK_ACCESS_TIMEOUT_FALSE = 2 + GPU_FABRIC_HEALTH_MASK_SHIFT_ACCESS_TIMEOUT = 6 + GPU_FABRIC_HEALTH_MASK_WIDTH_ACCESS_TIMEOUT = 0x3 + + +class GpuFabricHealthMaskIncorrectConfiguration(_IntEnum): + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NOT_SUPPORTED = 0 + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_NONE = 1 + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_SYSGUID = 2 + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCORRECT_CHASSIS_SN = 3 + 
GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGRUATION_NO_PARTITION = 4 + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INSUFFICIENT_NVLINKS = 5 + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INCOMPATIBLE_GPU_FW = 6 + GPU_FABRIC_HEALTH_MASK_INCORRECT_CONFIGURATION_INVALID_LOCATION = 7 + GPU_FABRIC_HEALTH_MASK_SHIFT_INCORRECT_CONFIGURATION = 8 + GPU_FABRIC_HEALTH_MASK_WIDTH_INCORRECT_CONFIGURATION = 0xf + + +class GpuFabricHealthSummary(_IntEnum): + GPU_FABRIC_HEALTH_SUMMARY_NOT_SUPPORTED = 0 + GPU_FABRIC_HEALTH_SUMMARY_HEALTHY = 1 + GPU_FABRIC_HEALTH_SUMMARY_UNHEALTHY = 2 + GPU_FABRIC_HEALTH_SUMMARY_LIMITED_CAPACITY = 3 + + +class InitFlag(_IntEnum): + INIT_FLAG_NO_GPUS = 1 + INIT_FLAG_NO_ATTACH = 2 + + +class NvlinkState(_IntEnum): + NVLINK_STATE_INACTIVE = 0x0 + NVLINK_STATE_ACTIVE = 0x1 + NVLINK_STATE_SLEEP = 0x2 + + +class NvlinkFirmwareUcodeType(_IntEnum): + NVLINK_FIRMWARE_UCODE_TYPE_MSE = 0x1 + NVLINK_FIRMWARE_UCODE_TYPE_NETIR = 0x2 + NVLINK_FIRMWARE_UCODE_TYPE_NETIR_UPHY = 0x3 + NVLINK_FIRMWARE_UCODE_TYPE_NETIR_CLN = 0x4 + NVLINK_FIRMWARE_UCODE_TYPE_NETIR_DLN = 0x5 + + +class DeviceMig(_IntEnum): + DEVICE_MIG_DISABLE = 0 + DEVICE_MIG_ENABLE = 1 + + +class GpuInstanceProfile(_IntEnum): + GPU_INSTANCE_PROFILE_1_SLICE = 0x0 + GPU_INSTANCE_PROFILE_2_SLICE = 0x1 + GPU_INSTANCE_PROFILE_3_SLICE = 0x2 + GPU_INSTANCE_PROFILE_4_SLICE = 0x3 + GPU_INSTANCE_PROFILE_7_SLICE = 0x4 + GPU_INSTANCE_PROFILE_8_SLICE = 0x5 + GPU_INSTANCE_PROFILE_6_SLICE = 0x6 + GPU_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 + GPU_INSTANCE_PROFILE_2_SLICE_REV1 = 0x8 + GPU_INSTANCE_PROFILE_1_SLICE_REV2 = 0x9 + GPU_INSTANCE_PROFILE_1_SLICE_GFX = 0x0A + GPU_INSTANCE_PROFILE_2_SLICE_GFX = 0x0B + GPU_INSTANCE_PROFILE_4_SLICE_GFX = 0x0C + GPU_INSTANCE_PROFILE_1_SLICE_NO_ME = 0x0D + GPU_INSTANCE_PROFILE_2_SLICE_NO_ME = 0x0E + GPU_INSTANCE_PROFILE_1_SLICE_ALL_ME = 0x0F + GPU_INSTANCE_PROFILE_2_SLICE_ALL_ME = 0x10 + GPU_INSTANCE_PROFILE_COUNT = 0x11 + + +class GpuInstanceProfileCaps(_IntEnum): + 
GPU_INSTANCE_PROFILE_CAPS_P2P = 0x1 + GPU_INSTANCE_PROFILE_CAPS_GFX = 0x2 + + +class ComputeInstanceProfileCaps(_IntEnum): + COMPUTE_INSTANCE_PROFILE_CAPS_GFX = 0x1 + + +class ComputeInstanceProfile(_IntEnum): + COMPUTE_INSTANCE_PROFILE_1_SLICE = 0x0 + COMPUTE_INSTANCE_PROFILE_2_SLICE = 0x1 + COMPUTE_INSTANCE_PROFILE_3_SLICE = 0x2 + COMPUTE_INSTANCE_PROFILE_4_SLICE = 0x3 + COMPUTE_INSTANCE_PROFILE_7_SLICE = 0x4 + COMPUTE_INSTANCE_PROFILE_8_SLICE = 0x5 + COMPUTE_INSTANCE_PROFILE_6_SLICE = 0x6 + COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = 0x7 + COMPUTE_INSTANCE_PROFILE_COUNT = 0x8 + + +class ComputeInstanceEngineProfile(_IntEnum): + COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = 0x0 + COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = 0x1 + + +class PowerSmoothingProfileParam(_IntEnum): + POWER_SMOOTHING_PROFILE_PARAM_PERCENT_TMP_FLOOR = 0 + POWER_SMOOTHING_PROFILE_PARAM_RAMP_UP_RATE = 1 + POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_RATE = 2 + POWER_SMOOTHING_PROFILE_PARAM_RAMP_DOWN_HYSTERESIS = 3 + POWER_SMOOTHING_PROFILE_PARAM_SECONDARY_POWER_FLOOR = 4 + POWER_SMOOTHING_PROFILE_PARAM_PRIMARY_FLOOR_ACT_WIN_MULT = 5 + POWER_SMOOTHING_PROFILE_PARAM_PRIMARY_FLOOR_TAR_WIN_MULT = 6 + POWER_SMOOTHING_PROFILE_PARAM_PRIMARY_FLOOR_ACT_OFFSET = 7 ############################################################################### @@ -2180,6 +2569,17 @@ cdef class Memory_v2: raise ValueError("This Memory_v2 instance is read-only") self._ptr[0].total = val + @property + def reserved(self): + """int: """ + return self._ptr[0].reserved + + @reserved.setter + def reserved(self, val): + if self._readonly: + raise ValueError("This Memory_v2 instance is read-only") + self._ptr[0].reserved = val + @property def free(self): """int: """ @@ -2241,7 +2641,7 @@ cdef class Memory_v2: cdef _get_ba_r1memory_dtype_offsets(): cdef nvmlBAR1Memory_t pod = nvmlBAR1Memory_t() return _numpy.dtype({ - 'names': ['bar1total', 'bar1free', 'bar1_used'], + 'names': ['bar1_total', 'bar1_free', 'bar1_used'], 'formats': [_numpy.uint64, 
_numpy.uint64, _numpy.uint64], 'offsets': [ (&(pod.bar1Total)) - (&pod), @@ -2314,23 +2714,23 @@ cdef class BAR1Memory: setattr(self, key, val) @property - def bar1total(self): + def bar1_total(self): """int: """ return self._ptr[0].bar1Total - @bar1total.setter - def bar1total(self, val): + @bar1_total.setter + def bar1_total(self, val): if self._readonly: raise ValueError("This BAR1Memory instance is read-only") self._ptr[0].bar1Total = val @property - def bar1free(self): + def bar1_free(self): """int: """ return self._ptr[0].bar1Free - @bar1free.setter - def bar1free(self, val): + @bar1_free.setter + def bar1_free(self, val): if self._readonly: raise ValueError("This BAR1Memory instance is read-only") self._ptr[0].bar1Free = val diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index a10812606e..67a815d1de 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import utils # noqa: E402 +from cuda.core import system, utils # noqa: E402 from cuda.core._device import Device # noqa: E402 from cuda.core._event import Event, EventOptions # noqa: E402 from cuda.core._graph import ( # noqa: E402 @@ -62,8 +62,3 @@ from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 -from cuda.core._system import System # noqa: E402 - -system = System() -__import__("sys").modules[__spec__.name + ".system"] = system -del System diff --git a/cuda_core/cuda/core/_system.py b/cuda_core/cuda/core/_system.py deleted file mode 100644 index 6f06587b46..0000000000 --- a/cuda_core/cuda/core/_system.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# -# SPDX-License-Identifier: Apache-2.0 - -import warnings - -from cuda.core._device import Device -from cuda.core._utils.cuda_utils import driver, handle_return, runtime - - -class System: - """Provide information about the cuda system. - This class is a singleton and should not be instantiated directly. - """ - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if hasattr(self, "_initialized") and self._initialized: - return - self._initialized = True - - def get_driver_version(self) -> tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. - """ - version = handle_return(driver.cuDriverGetVersion()) - major = version // 1000 - minor = (version % 1000) // 10 - return (major, minor) - - @property - def driver_version(self) -> tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. - - .. deprecated:: 0.5.0 - `cuda.core.system.driver_version` will be removed in 0.6.0. - Use `cuda.core.system.get_driver_version()` instead. - """ - warnings.warn( - "cuda.core.system.driver_version is deprecated. Use cuda.core.system.get_driver_version() instead.", - DeprecationWarning, - stacklevel=1, - ) - return self.get_driver_version() - - def get_num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - """ - return handle_return(runtime.cudaGetDeviceCount()) - - @property - def num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - - .. deprecated:: 0.5.0 - `cuda.core.system.num_devices` will be removed in 0.6.0. - Use `cuda.core.system.get_num_devices()` instead. - """ - warnings.warn( - "cuda.core.system.num_devices is deprecated. 
Use cuda.core.system.get_num_devices() instead.", - DeprecationWarning, - stacklevel=1, - ) - return self.get_num_devices() - - @property - def devices(self) -> tuple: - """ - Query the available device instances. - - Returns - ------- - tuple of Device - A tuple containing instances of available devices. - - .. deprecated:: 0.5.0 - `cuda.core.system.devices` will be removed in 0.6.0. - Use `cuda.core.Device.get_all_devices()` instead. - """ - warnings.warn( - "cuda.core.system.devices is deprecated. Use cuda.core.Device.get_all_devices() instead.", - DeprecationWarning, - stacklevel=1, - ) - return Device.get_all_devices() diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 3dbf3b7440..7f5c5caf21 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,7 +38,7 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import utils # noqa: E402 +from cuda.core import system, utils # noqa: E402 # Make utils accessible as a submodule for backward compatibility __import__("sys").modules[__spec__.name + ".utils"] = utils @@ -73,8 +73,3 @@ def _warn_deprecated(): from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 -from cuda.core._system import System # noqa: E402 - -system = System() -__import__("sys").modules[__spec__.name + ".system"] = system -del System diff --git a/cuda_core/cuda/core/system/__init__.py b/cuda_core/cuda/core/system/__init__.py new file mode 100644 index 0000000000..8162f5b257 --- /dev/null +++ b/cuda_core/cuda/core/system/__init__.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# ruff: noqa: F403, F405 + + +__all__ = [ + "get_driver_version", + "get_driver_version_full", + "get_gpu_driver_version", + "get_num_devices", + "get_process_name", + "HAS_WORKING_NVML", +] + + +from .system import * + +if HAS_WORKING_NVML: + from ._nvml_context import initialize + from .device import Device, DeviceArchitecture + from .exceptions import * + + initialize() + + __all__.extend( + [ + "Device", + "DeviceArchitecture", + "UninitializedError", + "InvalidArgumentError", + "NotSupportedError", + "NoPermissionError", + "AlreadyInitializedError", + "NotFoundError", + "InsufficientSizeError", + "InsufficientPowerError", + "DriverNotLoadedError", + "TimeoutError", + "IrqIssueError", + "LibraryNotFoundError", + "FunctionNotFoundError", + "CorruptedInforomError", + "GpuIsLostError", + "ResetRequiredError", + "OperatingSystemError", + "LibRmVersionMismatchError", + "InUseError", + "MemoryError", + "NoDataError", + "VgpuEccNotSupportedError", + "InsufficientResourcesError", + "FreqNotSupportedError", + "ArgumentVersionMismatchError", + "DeprecatedError", + "NotReadyError", + "GpuNotFoundError", + "InvalidStateError", + "ResetTypeNotSupportedError", + "UnknownError", + ] + ) diff --git a/cuda_core/cuda/core/system/_nvml_context.pyx b/cuda_core/cuda/core/system/_nvml_context.pyx new file mode 100644 index 0000000000..eccce36a90 --- /dev/null +++ b/cuda_core/cuda/core/system/_nvml_context.pyx @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import threading + +from cuda.bindings import _nvml as nvml + +from . 
import exceptions + + +ctypedef enum _NVMLState: + UNINITIALIZED = 0 + INITIALIZED = 1 + DISABLED_LIBRARY_NOT_FOUND = 2 + + +# Initialisation must occur per-process, so an initialised state is a +# (state, pid) pair +_NVML_STATE = _NVMLState.UNINITIALIZED +# """Current initialization state""" + +_NVML_OWNER_PID = 0 +# """PID of process that successfully called pynvml.nvmlInit""" + + +_lock = threading.Lock() + + +def initialize() -> None: + """Idempotent (per-process) initialization of NVUtil's NVML + + Notes + ----- + + Modifies global variables _NVML_STATE and _NVML_OWNER_PID""" + global _NVML_STATE, _NVML_OWNER_PID + + with _lock: + if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND or ( + _NVML_STATE == _NVMLState.INITIALIZED and os.getpid() == _NVML_OWNER_PID + ): + return + elif ( + _NVML_STATE == _NVMLState.INITIALIZED and os.getpid() != _NVML_OWNER_PID + ) or _NVML_STATE == _NVMLState.UNINITIALIZED: + try: + nvml.init_v2() + except ( + exceptions.LibraryNotFoundError, + exceptions.DriverNotLoadedError, + exceptions.UnknownError, + ): + _NVML_STATE = _NVMLState.DISABLED_LIBRARY_NOT_FOUND + return + + # initialization was successful + _NVML_STATE = _NVMLState.INITIALIZED + _NVML_OWNER_PID = os.getpid() + else: + raise RuntimeError(f"Unhandled initialisation state ({_NVML_STATE=}, {_NVML_OWNER_PID=})") + + +def is_initialized() -> bool: + """ + Check whether the NVML context is initialized on this process. + + Returns + ------- + result: bool + Whether the NVML context is initialized on this process. + """ + return _NVML_STATE == _NVMLState.INITIALIZED and os.getpid() == _NVML_OWNER_PID + + +def validate() -> None: + """ + Validate NVML state. + + Validate that NVML is functional and that the system has at least one GPU available. + + Raises + ------ + nvml.LibraryNotFoundError + If the NVML library could not be found. + nvml.GpuNotFoundError + If no GPUs are available. 
+ """ + if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND: + raise exceptions.LibraryNotFoundError("The underlying NVML library was not found") + elif nvml.device_get_count_v2() == 0: + raise exceptions.GpuNotFoundError("No GPUs available") diff --git a/cuda_core/cuda/core/system/device.pyx b/cuda_core/cuda/core/system/device.pyx new file mode 100644 index 0000000000..3d40dd8305 --- /dev/null +++ b/cuda_core/cuda/core/system/device.pyx @@ -0,0 +1,306 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from libc.math cimport ceil + +from multiprocessing import cpu_count +from typing import Iterable + +from cuda.bindings import _nvml as nvml + +from .utils import unpack_bitmask + + +class DeviceArchitecture: + """ + Device architecture enumeration. + """ + + def __init__(self, architecture: int): + try: + self._architecture = nvml.DeviceArch(architecture) + except ValueError: + self._architecture = None + + @property + def id(self) -> int: + """ + The numeric id of the device architecture. + + Returns -1 if the device is unknown. + """ + if self._architecture is None: + return -1 + return int(self._architecture) + + @property + def name(self) -> str: + """ + The name of the device architecture. + + Returns "Unlisted" if the device is unknown. + """ + if self._architecture is None: + return "Unlisted" + name = self._architecture.name + return name[name.rfind("_") + 1 :].title() + + +cdef class MemoryInfo: + """ + Memory allocation information for a device. 
+ """ + cdef object _memory_info + + def __init__(self, memory_info: nvml.Memory_v2): + self._memory_info = memory_info + + @property + def free(self) -> int: + """ + Unallocated device memory (in bytes) + """ + return self._memory_info.free + + @property + def total(self) -> int: + """ + Total physical device memory (in bytes) + """ + return self._memory_info.total + + @property + def used(self) -> int: + """ + Allocated device memory (in bytes) + """ + return self._memory_info.used + + @property + def reserved(self) -> int: + """ + Device memory (in bytes) reserved for system use (driver or firmware) + """ + return self._memory_info.reserved + + +cdef class BAR1MemoryInfo(MemoryInfo): + """ + BAR1 Memory allocation information for a device. + """ + cdef object _memory_info + + def __init__(self, memory_info: nvml.BAR1Memory): + self._memory_info = memory_info + + @property + def free(self) -> int: + """ + Unallocated BAR1 memory (in bytes) + """ + return self._memory_info.bar1_free + + @property + def total(self) -> int: + """ + Total BAR1 memory (in bytes) + """ + return self._memory_info.bar1_total + + @property + def used(self) -> int: + """ + Allocated used memory (in bytes) + """ + return self._memory_info.bar1_used + + +cdef class PciInfo: + """ + PCI information about a GPU device. 
+ """ + cdef object _pci_info + + def __init__(self, pci_info: nvml.PciInfo): + self._pci_info = pci_info + + @property + def bus(self) -> int: + """ + The bus on which the device resides, 0 to 255 + """ + return self._pci_info.bus + + @property + def bus_id(self) -> str: + """ + The tuple domain:bus:device.function PCI identifier string + """ + return self._pci_info.bus_id + + @property + def device(self) -> int: + """ + The device's id on the bus, 0 to 31 + """ + return self._pci_info.device_ + + @property + def domain(self) -> int: + """ + The PCI domain on which the device's bus resides, 0 to 0xffffffff + """ + return self._pci_info.domain + + @property + def vendor_id(self) -> int: + """ + The PCI vendor id of the device + """ + return self._pci_info.pci_device_id & 0xFFFF + + @property + def device_id(self) -> int: + """ + The PCI device id of the device + """ + return self._pci_info.pci_device_id >> 16 + + +cdef class Device: + """ + Representation of a CUDA device. + + Parameters + ---------- + index: int, optional + Integer representing the CUDA device index to get a handle to. + uuid: bytes or str, optional + UUID of a CUDA device to get a handle to. + + Raises + ------ + ValueError + If neither `index` nor `uuid` are specified or if both are specified. 
+ """ + + cdef intptr_t _handle + + def __init__(self, index: int | None = None, uuid: bytes | str | None = None): + if index is not None and uuid is not None: + raise ValueError("Handle requires only one of either device `index` or `uuid`.") + if index is None and uuid is None: + raise ValueError("Handle requires either a device `index` or `uuid`.") + + if index is not None: + self._handle = nvml.device_get_handle_by_index_v2(index) + else: + if isinstance(uuid, bytes): + uuid = uuid.decode("ascii") + self._handle = nvml.device_get_handle_by_uuid(uuid) + + @property + def handle(self) -> int: + return self._handle + + @classmethod + def get_all_devices(cls) -> Iterable[Device]: + """ + Query the available device instances. + + Returns + ------- + Iterator of Device + An iterator over available devices. + """ + total = nvml.device_get_count_v2() + for device_id in range(total): + yield cls(device_id) + + @property + def architecture(self) -> DeviceArchitecture: + """ + Device architecture. For example, a Tesla V100 will report + `DeviceArchitecture.name == "Volta"`, and RTX A6000 will report + `DeviceArchitecture.name == "Ampere"`. If the device returns an + architecture that is unknown to NVML then `DeviceArchitecture.name == + "Unknown"` is reported, whereas an architecture that is unknown to + cuda.core.system is reported as `DeviceArchitecture.name == "Unlisted"`. + """ + return DeviceArchitecture(nvml.device_get_architecture(self._handle)) + + @property + def bar1_memory_info(self) -> BAR1MemoryInfo: + """ + Get information about BAR1 memory. + + BAR1 is used to map the FB (device memory) so that it can be directly + accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE + bus). + """ + return BAR1MemoryInfo(nvml.device_get_bar1_memory_info(self._handle)) + + @property + def cpu_affinity(self) -> list[int]: + """ + Get a list containing the CPU indices to which the GPU is directly connected. 
+ + Examples + -------- + >>> Device(index=0).cpu_affinity + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59] + """ + return unpack_bitmask(nvml.device_get_cpu_affinity( + self._handle, + ceil(cpu_count() / 64), + )) + + @property + def cuda_compute_capability(self) -> tuple[int, int]: + """ + CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100. + + Returns a tuple `(major, minor)`. + """ + return nvml.device_get_cuda_compute_capability(self._handle) + + @property + def memory_info(self) -> MemoryInfo: + """ + Object with memory information. + """ + return MemoryInfo(nvml.device_get_memory_info_v2(self._handle)) + + @property + def name(self) -> str: + """ + Name of the device, e.g.: `"Tesla V100-SXM2-32GB"` + """ + return nvml.device_get_name(self._handle) + + @property + def pci_info(self) -> PciInfo: + """ + The PCI attributes of this device. + """ + return PciInfo(nvml.device_get_pci_info_v3(self._handle)) + + @property + def serial(self) -> str: + """ + Retrieves the globally unique board serial number associated with this + device's board. + """ + return nvml.device_get_serial(self._handle) + + @property + def uuid(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. + """ + return nvml.device_get_uuid(self._handle) diff --git a/cuda_core/cuda/core/system/exceptions.py b/cuda_core/cuda/core/system/exceptions.py new file mode 100644 index 0000000000..5c6cfef889 --- /dev/null +++ b/cuda_core/cuda/core/system/exceptions.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

# =====================================================================
# cuda_core/cuda/core/system/exceptions.py
# =====================================================================
# Re-export the NVML exception types from the private binding module so
# users of ``cuda.core.system`` can catch these errors without importing
# ``cuda.bindings._nvml`` directly.
#
# NOTE: ``TimeoutError`` and ``MemoryError`` deliberately shadow the
# builtins of the same name inside this module's namespace; here they are
# the NVML-specific error classes.

from cuda.bindings import _nvml as nvml

UninitializedError = nvml.UninitializedError
InvalidArgumentError = nvml.InvalidArgumentError
NotSupportedError = nvml.NotSupportedError
NoPermissionError = nvml.NoPermissionError
AlreadyInitializedError = nvml.AlreadyInitializedError
NotFoundError = nvml.NotFoundError
InsufficientSizeError = nvml.InsufficientSizeError
InsufficientPowerError = nvml.InsufficientPowerError
DriverNotLoadedError = nvml.DriverNotLoadedError
TimeoutError = nvml.TimeoutError
IrqIssueError = nvml.IrqIssueError
LibraryNotFoundError = nvml.LibraryNotFoundError
FunctionNotFoundError = nvml.FunctionNotFoundError
CorruptedInforomError = nvml.CorruptedInforomError
GpuIsLostError = nvml.GpuIsLostError
ResetRequiredError = nvml.ResetRequiredError
OperatingSystemError = nvml.OperatingSystemError
LibRmVersionMismatchError = nvml.LibRmVersionMismatchError
InUseError = nvml.InUseError
MemoryError = nvml.MemoryError
NoDataError = nvml.NoDataError
VgpuEccNotSupportedError = nvml.VgpuEccNotSupportedError
InsufficientResourcesError = nvml.InsufficientResourcesError
FreqNotSupportedError = nvml.FreqNotSupportedError
ArgumentVersionMismatchError = nvml.ArgumentVersionMismatchError
DeprecatedError = nvml.DeprecatedError
NotReadyError = nvml.NotReadyError
GpuNotFoundError = nvml.GpuNotFoundError
InvalidStateError = nvml.InvalidStateError
ResetTypeNotSupportedError = nvml.ResetTypeNotSupportedError
UnknownError = nvml.UnknownError


# Keep this list in the same order as the assignments above.
__all__ = [
    "UninitializedError",
    "InvalidArgumentError",
    "NotSupportedError",
    "NoPermissionError",
    "AlreadyInitializedError",
    "NotFoundError",
    "InsufficientSizeError",
    "InsufficientPowerError",
    "DriverNotLoadedError",
    "TimeoutError",
    "IrqIssueError",
    "LibraryNotFoundError",
    "FunctionNotFoundError",
    "CorruptedInforomError",
    "GpuIsLostError",
    "ResetRequiredError",
    "OperatingSystemError",
    "LibRmVersionMismatchError",
    "InUseError",
    "MemoryError",
    "NoDataError",
    "VgpuEccNotSupportedError",
    "InsufficientResourcesError",
    "FreqNotSupportedError",
    "ArgumentVersionMismatchError",
    "DeprecatedError",
    "NotReadyError",
    "GpuNotFoundError",
    "InvalidStateError",
    "ResetTypeNotSupportedError",
    "UnknownError",
]

# =====================================================================
# cuda_core/cuda/core/system/system.pyx
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0


# This file needs to either use NVML exclusively, or when `cuda.bindings._nvml`
# isn't available, fall back to non-NVML-based methods for backward
# compatibility.


import cuda.bindings

# TODO: Update after #1411 is merged
_BINDINGS_VERSION = tuple(int(x) for x in cuda.bindings.__version__.split("."))

# Working NVML bindings ship in cuda.bindings 12.9.6+ (12.x line) and 13.1.2+.
# BUGFIX: the original expression ended in ``or True``, which forced this
# flag on unconditionally (debug leftover); the version check now applies.
HAS_WORKING_NVML = _BINDINGS_VERSION >= (13, 1, 2) or (
    _BINDINGS_VERSION[0] == 12 and _BINDINGS_VERSION[1:3] >= (9, 6)
)


if HAS_WORKING_NVML:
    from cuda.bindings import _nvml as nvml
else:
    from cuda.core._utils.cuda_utils import driver, handle_return, runtime


def get_driver_version() -> tuple[int, int]:
    """
    The CUDA driver version.

    Tuple in the format `(CUDA_MAJOR, CUDA_MINOR)`.
    """
    return get_driver_version_full()[:2]


def get_driver_version_full() -> tuple[int, int, int]:
    """
    The CUDA driver version.

    Tuple in the format `(CUDA_MAJOR, CUDA_MINOR, CUDA_PATCH)`.
    """
    cdef int v
    if HAS_WORKING_NVML:
        v = nvml.system_get_cuda_driver_version()
    else:
        v = handle_return(driver.cuDriverGetVersion())
    # Both APIs encode the version as 1000*major + 10*minor (+ patch).
    return (v // 1000, (v // 10) % 100, v % 10)


def get_gpu_driver_version() -> tuple[int, ...]:
    """
    The driver version.

    Raises
    ------
    RuntimeError
        If the NVML library is not available.
    """
    if not HAS_WORKING_NVML:
        raise RuntimeError("NVML library is not available")
    return tuple(int(v) for v in nvml.system_get_driver_version().split("."))


def get_nvml_version() -> tuple[int, ...]:
    """
    The version of the NVML library.

    Raises
    ------
    RuntimeError
        If the NVML library is not available.
    """
    if not HAS_WORKING_NVML:
        raise RuntimeError("NVML library is not available")
    return tuple(int(v) for v in nvml.system_get_nvml_version().split("."))


def get_num_devices() -> int:
    """
    Return the number of devices in the system.
    """
    if HAS_WORKING_NVML:
        return nvml.device_get_count_v2()
    else:
        return handle_return(runtime.cudaGetDeviceCount())


def get_process_name(pid: int) -> str:
    """
    The name of process with given PID.

    Parameters
    ----------
    pid: int
        The PID of the process for which to get the name.

    Returns
    -------
    name: str
        The process name.

    Raises
    ------
    RuntimeError
        If the NVML library is not available.
    """
    # BUGFIX: unlike its NVML-only siblings, this function used ``nvml``
    # without checking HAS_WORKING_NVML first; guard it the same way.
    if not HAS_WORKING_NVML:
        raise RuntimeError("NVML library is not available")
    return nvml.system_get_process_name(pid)


__all__ = [
    "get_driver_version",
    "get_driver_version_full",
    "get_gpu_driver_version",
    "get_nvml_version",
    "get_num_devices",
    "get_process_name",
    "HAS_WORKING_NVML",
]

# =====================================================================
# cuda_core/cuda/core/system/utils.pyx
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from cpython cimport array
from libc.stdint cimport uint64_t


cpdef str format_bytes(uint64_t x):
    """Return formatted string in B, KiB, MiB, GiB or TiB"""
    if x < 1024:
        return f"{x} B"
    elif x < 1024**2:
        return f"{x / 1024:.2f} KiB"
    elif x < 1024**3:
        return f"{x / 1024**2:.2f} MiB"
    elif x < 1024**4:
        return f"{x / 1024**3:.2f} GiB"
    else:
        return f"{x / 1024**4:.2f} TiB"


cpdef list[int] unpack_bitmask(x: list[int] | array.array):
    """
    Unpack a list of integers containing bitmasks.

    Each element contributes 64 bits; the returned list holds the global
    indices of all set bits.

    Parameters
    ----------
    x: list of int
        A list of integers

    Examples
    --------
    >>> from cuda.core.system.utils import unpack_bitmask
    >>> unpack_bitmask([1 + 2 + 8])
    [0, 1, 3]
    >>> unpack_bitmask([1 + 2 + 16])
    [0, 1, 4]
    >>> unpack_bitmask([1 + 2 + 16, 2 + 4])
    [0, 1, 4, 65, 66]
    """
    cdef uint64_t[:] arr
    cdef uint64_t i, j, idx
    cdef int mask_bits = 64

    if isinstance(x, list):
        arr = array.array("Q", x)
    else:
        # Assumes an array of unsigned 64-bit values ("Q" typecode);
        # anything else raises TypeError on the memoryview assignment.
        arr = x

    res = []

    for i in range(len(x)):
        cpu_offset = i * mask_bits
        idx = 1
        for j in range(mask_bits):
            if arr[i] & idx:
                res.append(cpu_offset + j)
            idx <<= 1
    return res

# =====================================================================
# cuda_core/tests/system/__init__.py
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# =====================================================================
# cuda_core/tests/system/conftest.py
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0


import pytest
from cuda.core import system

skip_if_nvml_unsupported = pytest.mark.skipif(
    not system.HAS_WORKING_NVML, reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+"
)

# =====================================================================
# cuda_core/tests/system/test_nvml_context.py
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

from .conftest import skip_if_nvml_unsupported

pytestmark = skip_if_nvml_unsupported

import multiprocessing as mp
from platform import uname

import pytest

# State constants under test.
# NOTE(review): assumed to mirror the internal state values of
# cuda.core.system._nvml_context — confirm against that module.
UNINITIALIZED = 0
INITIALIZED = 1
DISABLED_LIBRARY_NOT_FOUND = 2


def _run_process(target):
    """Run *target* in a freshly spawned process and assert a clean exit.

    NVML initialization is per-process state, so each scenario that mutates
    it gets its own interpreter.
    """
    proc = mp.get_context("spawn").Process(target=target)
    proc.start()
    proc.join()
    assert not proc.exitcode


def _test_initialized():
    # Importing the context module should leave NVML initialized.
    from cuda.core.system import _nvml_context

    assert _nvml_context._NVML_STATE == INITIALIZED


def test_initialized():
    _run_process(_test_initialized)


def _test_is_initialized():
    # is_initialized() must agree with the raw state flag.
    from cuda.core.system import _nvml_context

    assert _nvml_context._NVML_STATE == INITIALIZED
    assert _nvml_context.is_initialized() is True


def test_is_initialized():
    _run_process(_test_is_initialized)


def _test_uninitialized():
    # Forcing the state back to UNINITIALIZED flips is_initialized().
    from cuda.core.system import _nvml_context

    _nvml_context._NVML_STATE = UNINITIALIZED
    assert _nvml_context.is_initialized() is False


def test_uninitialized():
    _run_process(_test_uninitialized)


def _test_wrong_owner():
    # A PID mismatch (e.g. after fork) must report "not initialized".
    from cuda.core.system import _nvml_context

    _nvml_context._NVML_OWNER_PID = 0
    assert _nvml_context.is_initialized() is False


def test_wrong_owner():
    _run_process(_test_wrong_owner)


@pytest.mark.skipif("microsoft-standard" in uname().release, reason="Probably a WSL system")
def test_no_wsl():
    assert "microsoft-standard" not in uname().release


@pytest.mark.skipif("microsoft-standard" not in uname().release, reason="Probably a non-WSL system")
def test_wsl():
    assert "microsoft-standard" in uname().release


def _test_validate():
    # validate() returns None when the context is healthy.
    from cuda.core.system import _nvml_context

    assert _nvml_context.validate() is None


def test_validate():
    _run_process(_test_validate)

# =====================================================================
# cuda_core/tests/system/test_system_device.py
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

from .conftest import skip_if_nvml_unsupported

pytestmark = skip_if_nvml_unsupported

import os
import re
import sys

import pytest
from cuda.core import system
from cuda.core.system import device as system_device

if system.HAS_WORKING_NVML:
    from cuda.bindings import _nvml as nvml

    if system.get_num_devices() == 0:
        pytest.skip("No GPUs available to run device tests", allow_module_level=True)


def test_device_index_handle():
    for device in system.Device.get_all_devices():
        assert isinstance(device.handle, int)


def test_device_architecture():
    for device in system.Device.get_all_devices():
        device_arch = device.architecture

        assert isinstance(device_arch, system_device.DeviceArchitecture)
        # Enum value containment (`in IntEnum`) only works from Python 3.12.
        if sys.version_info < (3, 12):
            assert device_arch.id in nvml.DeviceArch.__members__.values()
        else:
            assert device_arch.id in nvml.DeviceArch


def test_device_bar1_memory():
    for device in system.Device.get_all_devices():
        bar1_memory_info = device.bar1_memory_info
        free, total, used = (
            bar1_memory_info.free,
            bar1_memory_info.total,
            bar1_memory_info.used,
        )

        assert isinstance(bar1_memory_info, system_device.BAR1MemoryInfo)
        assert isinstance(free, int)
        assert isinstance(total, int)
        assert isinstance(used, int)

        assert free >= 0
        assert total >= 0
        assert used >= 0
        assert free + used == total


def test_device_cpu_affinity():
    skip_reasons = set()
    for device in system.Device.get_all_devices():
        try:
            affinity = device.cpu_affinity
        except system.NotSupportedError:
            skip_reasons.add(f"CPU affinity not supported on {device}")
        else:
            assert isinstance(affinity, list)
            # FIX: save and restore the current CPU mask so this test does
            # not leak a modified affinity into the rest of the session.
            original_affinity = os.sched_getaffinity(0)
            try:
                os.sched_setaffinity(0, affinity)
                assert os.sched_getaffinity(0) == set(affinity)
            finally:
                os.sched_setaffinity(0, original_affinity)
    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))


def test_device_cuda_compute_capability():
    for device in system.Device.get_all_devices():
        cuda_compute_capability = device.cuda_compute_capability
        assert isinstance(cuda_compute_capability, tuple)
        assert len(cuda_compute_capability) == 2
        assert all([isinstance(i, int) for i in cuda_compute_capability])
        assert 3 <= cuda_compute_capability[0] <= 99
        assert 0 <= cuda_compute_capability[1] <= 9


def test_device_memory():
    for device in system.Device.get_all_devices():
        memory_info = device.memory_info
        free, total, used, reserved = memory_info.free, memory_info.total, memory_info.used, memory_info.reserved

        assert isinstance(memory_info, system_device.MemoryInfo)
        assert isinstance(free, int)
        assert isinstance(total, int)
        assert isinstance(used, int)
        assert isinstance(reserved, int)

        assert free >= 0
        assert total >= 0
        assert used >= 0
        assert reserved >= 0
        assert free + used + reserved == total


def test_device_name():
    for device in system.Device.get_all_devices():
        name = device.name
        assert isinstance(name, str)
        assert len(name) > 0


def test_device_pci_info():
    for device in system.Device.get_all_devices():
        pci_info = device.pci_info
        assert isinstance(pci_info, system_device.PciInfo)

        # The bus id string ("dddddddd:bb:dd.f") must agree with the
        # individual numeric fields below.
        assert isinstance(pci_info.bus_id, str)
        assert re.match("[a-f0-9]{8}:[a-f0-9]{2}:[a-f0-9]{2}.[a-f0-9]", pci_info.bus_id.lower())
        bus_id_domain = int(pci_info.bus_id.split(":")[0], 16)
        bus_id_bus = int(pci_info.bus_id.split(":")[1], 16)
        bus_id_device = int(pci_info.bus_id.split(":")[2][:2], 16)

        assert isinstance(pci_info.domain, int)
        assert 0x00 <= pci_info.domain <= 0xFFFFFFFF
        assert pci_info.domain == bus_id_domain

        assert isinstance(pci_info.bus, int)
        assert 0x00 <= pci_info.bus <= 0xFF
        assert pci_info.bus == bus_id_bus

        assert isinstance(pci_info.device, int)
        assert 0x00 <= pci_info.device <= 0xFF
        assert pci_info.device == bus_id_device

        assert isinstance(pci_info.vendor_id, int)
        assert 0x0000 <= pci_info.vendor_id <= 0xFFFF

        assert isinstance(pci_info.device_id, int)
        assert 0x0000 <= pci_info.device_id <= 0xFFFF


def test_device_serial():
    skip_reasons = set()
    for device in system.Device.get_all_devices():
        try:
            serial = device.serial
        except system.NotSupportedError:
            skip_reasons.add(f"Device serial not supported by device {device}")
        else:
            assert isinstance(serial, str)
            assert len(serial) > 0

    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))


def test_device_uuid():
    for device in system.Device.get_all_devices():
        uuid = device.uuid
        assert isinstance(uuid, str)

        # Expands to GPU-8hex-4hex-4hex-4hex-12hex, where 8hex means 8 consecutive
        # hex characters, e.g.: "GPU-abcdef12-abcd-0123-4567-1234567890ab"
        # FIX: the original test described this format but never checked it.
        assert re.fullmatch(
            r"gpu-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
            uuid.lower(),
        )

# =====================================================================
# cuda_core/tests/system/test_system_system.py
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

import os

import pytest
from cuda.core import system

from .conftest import skip_if_nvml_unsupported


def test_cuda_driver_version():
    """The CUDA driver version is a 3-tuple within plausible bounds."""
    version = system.get_driver_version_full()
    assert isinstance(version, tuple)
    assert len(version) == 3

    major, minor, patch = version
    assert major >= 10
    assert 0 <= minor <= 99
    assert 0 <= patch <= 9


@skip_if_nvml_unsupported
def test_gpu_driver_version():
    """The GPU driver version has 2 or 3 components in plausible ranges."""
    version = system.get_gpu_driver_version()
    assert isinstance(version, tuple)
    assert len(version) in (2, 3)

    major, minor = version[0], version[1]
    rest = version[2:]
    assert 400 <= major < 1000
    assert minor >= 0
    if rest:
        assert 0 <= rest[0] <= 99


@skip_if_nvml_unsupported
def test_nvml_version():
    """The NVML version prepends the CUDA major to a driver-style version."""
    version = system.get_nvml_version()
    assert isinstance(version, tuple)
    assert len(version) in (3, 4)

    cuda_major, driver_major, driver_minor = version[0], version[1], version[2]
    rest = version[3:]
    assert cuda_major >= 10
    assert 400 <= driver_major < 1000
    assert driver_minor >= 0
    if rest:
        assert 0 <= rest[0] <= 99


@skip_if_nvml_unsupported
def test_get_process_name():
    """Looking up our own PID yields the running interpreter's name."""
    try:
        process_name = system.get_process_name(os.getpid())
    except system.NotFoundError:
        pytest.skip("Process not found")

    assert isinstance(process_name, str)
    assert "python" in process_name


def test_device_count():
    """The device count is a non-negative integer."""
    count = system.get_num_devices()
    assert isinstance(count, int)
    assert count >= 0

# =====================================================================
# cuda_core/tests/system/test_system_utils.py
# =====================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import pytest
from cuda.core.system.utils import format_bytes, unpack_bitmask


def test_format_bytes():
    """Units switch at each power-of-1024 boundary and cap at TiB."""
    expectations = {
        0: "0 B",
        1: "1 B",
        1023: "1023 B",
        1024: "1.00 KiB",
        1024**2: "1.00 MiB",
        1024**3: "1.00 GiB",
        1024**4: "1.00 TiB",
        1024**5: "1024.00 TiB",
        1024**6: "1048576.00 TiB",
    }
    for value, expected in expectations.items():
        assert format_bytes(value) == expected


@pytest.mark.parametrize(
    ("masks", "expected"),
    [
        ([1152920405096267775, 0], list(range(20)) + list(range(40, 60))),
        ([17293823668613283840, 65535], list(range(20, 40)) + list(range(60, 80))),
        ([18446744073709551615, 0], list(range(64))),
        ([0, 18446744073709551615], list(range(64, 128))),
    ],
)
def test_unpack_bitmask(masks, expected):
    """Each 64-bit word contributes its set-bit indices at a 64-bit offset."""
    assert unpack_bitmask(masks) == expected


def test_unpack_bitmask_single_value():
    """A bare integer (not a list/array) is rejected."""
    with pytest.raises(TypeError):
        unpack_bitmask(1)