Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
414 changes: 407 additions & 7 deletions cuda_bindings/cuda/bindings/_nvml.pyx

Large diffs are not rendered by default.

7 changes: 1 addition & 6 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
finally:
del bindings, importlib, subdir, cuda_major, cuda_minor

from cuda.core import utils # noqa: E402
from cuda.core import system, utils # noqa: E402
from cuda.core._device import Device # noqa: E402
from cuda.core._event import Event, EventOptions # noqa: E402
from cuda.core._graph import ( # noqa: E402
Expand Down Expand Up @@ -62,8 +62,3 @@
from cuda.core._module import Kernel, ObjectCode # noqa: E402
from cuda.core._program import Program, ProgramOptions # noqa: E402
from cuda.core._stream import Stream, StreamOptions # noqa: E402
from cuda.core._system import System # noqa: E402

system = System()
__import__("sys").modules[__spec__.name + ".system"] = system
del System
114 changes: 0 additions & 114 deletions cuda_core/cuda/core/_system.py

This file was deleted.

7 changes: 1 addition & 6 deletions cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _warn_deprecated():
_warn_deprecated()


from cuda.core import utils # noqa: E402
from cuda.core import system, utils # noqa: E402

# Make utils accessible as a submodule for backward compatibility
__import__("sys").modules[__spec__.name + ".utils"] = utils
Expand Down Expand Up @@ -73,8 +73,3 @@ def _warn_deprecated():
from cuda.core._module import Kernel, ObjectCode # noqa: E402
from cuda.core._program import Program, ProgramOptions # noqa: E402
from cuda.core._stream import Stream, StreamOptions # noqa: E402
from cuda.core._system import System # noqa: E402

system = System()
__import__("sys").modules[__spec__.name + ".system"] = system
del System
63 changes: 63 additions & 0 deletions cuda_core/cuda/core/system/__init__.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, cuda.core supports any cuda-bindings/cuda-python 12.x and 13.x, many of which do not have the NVML bindings available. So, we need a version guard here before importing anything that would expect the bindings to exist, and raise an exception in such cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, good reminder. I guess that precludes cimport'ing anything from cuda.bindings._nvml, since _nvml is a moving target. Will just take that out for now...

Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: F403, F405


# Names exported unconditionally.  The NVML-backed names are appended further
# down, and only when HAS_WORKING_NVML is true.
__all__ = [
    "get_driver_version",
    "get_driver_version_full",
    "get_gpu_driver_version",
    "get_num_devices",
    "get_process_name",
    "HAS_WORKING_NVML",
]


from .system import *

# HAS_WORKING_NVML is expected to be provided by the star import above.
# The NVML-dependent submodules are imported only when it is true.
if HAS_WORKING_NVML:
    from ._nvml_context import initialize
    from .device import Device, DeviceArchitecture
    from .exceptions import *

    # Initialize NVML as part of importing this package.
    initialize()

    # Publish the device types and the exception classes as part of the
    # package's public API.
    __all__ += [
        "Device",
        "DeviceArchitecture",
        "UninitializedError",
        "InvalidArgumentError",
        "NotSupportedError",
        "NoPermissionError",
        "AlreadyInitializedError",
        "NotFoundError",
        "InsufficientSizeError",
        "InsufficientPowerError",
        "DriverNotLoadedError",
        "TimeoutError",
        "IrqIssueError",
        "LibraryNotFoundError",
        "FunctionNotFoundError",
        "CorruptedInforomError",
        "GpuIsLostError",
        "ResetRequiredError",
        "OperatingSystemError",
        "LibRmVersionMismatchError",
        "InUseError",
        "MemoryError",
        "NoDataError",
        "VgpuEccNotSupportedError",
        "InsufficientResourcesError",
        "FreqNotSupportedError",
        "ArgumentVersionMismatchError",
        "DeprecatedError",
        "NotReadyError",
        "GpuNotFoundError",
        "InvalidStateError",
        "ResetTypeNotSupportedError",
        "UnknownError",
    ]
93 changes: 93 additions & 0 deletions cuda_core/cuda/core/system/_nvml_context.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import os
import threading

from cuda.bindings import _nvml as nvml

from . import exceptions


# States of this module's NVML initialization, as used by initialize():
#   UNINITIALIZED              - initial value; nvml.init_v2() has not succeeded yet.
#   INITIALIZED                - nvml.init_v2() returned without raising.
#   DISABLED_LIBRARY_NOT_FOUND - nvml.init_v2() raised LibraryNotFoundError,
#                                DriverNotLoadedError or UnknownError (so the
#                                name is narrower than the cases it covers).
ctypedef enum _NVMLState:
UNINITIALIZED = 0
INITIALIZED = 1
DISABLED_LIBRARY_NOT_FOUND = 2


# Initialisation must occur per-process, so an initialised state is a
# (state, pid) pair: _NVML_STATE together with _NVML_OWNER_PID.

# Current initialization state.
_NVML_STATE = _NVMLState.UNINITIALIZED

# PID of the process in which nvml.init_v2() last succeeded (0 until then).
_NVML_OWNER_PID = 0


# Held by initialize() while it inspects and updates the two globals above.
_lock = threading.Lock()


def initialize() -> None:
    """Initialize NVML for the current process, at most once per process.

    Returns immediately when NVML was previously marked unavailable, or when
    it was already initialized by the calling process.  Otherwise calls
    ``nvml.init_v2()``; this includes the case where the recorded owner PID
    differs from the current PID.

    Notes
    -----
    Updates the module globals ``_NVML_STATE`` and ``_NVML_OWNER_PID`` while
    holding ``_lock``.  A ``LibraryNotFoundError``, ``DriverNotLoadedError``
    or ``UnknownError`` from ``nvml.init_v2()`` is not propagated; the state
    becomes ``DISABLED_LIBRARY_NOT_FOUND`` instead.

    Raises
    ------
    RuntimeError
        If ``_NVML_STATE`` holds a value that is none of the known states.
    """
    global _NVML_STATE, _NVML_OWNER_PID

    with _lock:
        # A previous attempt failed; do not retry.
        if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND:
            return

        if _NVML_STATE == _NVMLState.INITIALIZED:
            # Already initialized by this very process: nothing to do.
            if os.getpid() == _NVML_OWNER_PID:
                return
            # Otherwise the recorded owner is a different process, so fall
            # through and initialize again for the current one.
        elif _NVML_STATE != _NVMLState.UNINITIALIZED:
            raise RuntimeError(f"Unhandled initialisation state ({_NVML_STATE=}, {_NVML_OWNER_PID=})")

        try:
            nvml.init_v2()
        except (
            exceptions.LibraryNotFoundError,
            exceptions.DriverNotLoadedError,
            exceptions.UnknownError,
        ):
            _NVML_STATE = _NVMLState.DISABLED_LIBRARY_NOT_FOUND
            return

        # Record the successful initialization together with its owner.
        _NVML_STATE = _NVMLState.INITIALIZED
        _NVML_OWNER_PID = os.getpid()


def is_initialized() -> bool:
    """
    Report whether NVML is initialized for the calling process.

    Returns
    -------
    result: bool
        ``True`` only when the module state is ``INITIALIZED`` and the
        recorded owner PID equals the current process ID.
    """
    if _NVML_STATE != _NVMLState.INITIALIZED:
        return False
    # An INITIALIZED state recorded for another PID does not count here.
    return os.getpid() == _NVML_OWNER_PID


def validate() -> None:
    """
    Validate NVML state.

    Checks that NVML was not marked as unavailable and that it reports at
    least one device.  Exceptions raised by ``nvml.device_get_count_v2()``
    itself are not caught here.

    Raises
    ------
    exceptions.LibraryNotFoundError
        If the module state is ``DISABLED_LIBRARY_NOT_FOUND``.
    exceptions.GpuNotFoundError
        If ``nvml.device_get_count_v2()`` returns 0.
    """
    if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND:
        raise exceptions.LibraryNotFoundError("The underlying NVML library was not found")

    device_count = nvml.device_get_count_v2()
    if device_count == 0:
        raise exceptions.GpuNotFoundError("No GPUs available")
Loading