diff --git a/.gitattributes b/.gitattributes index 6a3ee0fe72..81f2361d4c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,9 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# Exception: headers we own (cuda_core C++ implementation) +cuda_core/cuda/core/_cpp/*.h -binary text diff +cuda_core/cuda/core/_cpp/*.hpp -binary text diff # git should not convert line endings in PNG files *.png binary *.svg binary diff --git a/.gitignore b/.gitignore index d3d7c31208..fb40fae6d0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__/ .pytest_cache/ .benchmarks/ *.cpp +!*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp cache_driver diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 87e2df13a7..e5320e9142 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -150,15 +150,21 @@ def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool "__init__.py", "_version.py", "_include", + "_cpp", # Headers for Cython development "cu12", "cu13", ) + # _resource_handles is shared (not CUDA-version-specific) and must stay + # at top level. It's imported early in __init__.py before versioned code. 
+ items_to_keep_prefix = ("_resource_handles",) all_items = os.scandir(base_wheel / base_dir) removed_count = 0 for f in all_items: f_abspath = f.path if f.name in items_to_keep: continue + if any(f.name.startswith(prefix) for prefix in items_to_keep_prefix): + continue if f.is_dir(): print(f" Removing directory: {f.name}", file=sys.stderr) shutil.rmtree(f_abspath) diff --git a/cuda_core/MANIFEST.in b/cuda_core/MANIFEST.in index 43d3815901..0bf6530caf 100644 --- a/cuda_core/MANIFEST.in +++ b/cuda_core/MANIFEST.in @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: Apache-2.0 -recursive-include cuda/core *.pyx *.pxd +recursive-include cuda/core *.pyx *.pxd *.cpp *.hpp diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 4337783563..1f51b99112 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -86,7 +86,21 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH - all_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) + def get_sources(mod_name): + """Get source files for a module, including any .cpp files.""" + sources = [f"cuda/core/{mod_name}.pyx"] + + # Add module-specific .cpp file from _cpp/ directory if it exists + cpp_file = f"cuda/core/_cpp/{mod_name.lstrip('_')}.cpp" + if os.path.exists(cpp_file): + sources.append(cpp_file) + + return sources + + def get_extension_kwargs(mod_name): + """Return Extension kwargs (libraries, etc.) per module.""" + return {"extra_compile_args": extra_compile_args} + extra_compile_args = [] if COMPILE_FOR_COVERAGE: # CYTHON_TRACE_NOGIL indicates to trace nogil functions. 
It is not @@ -96,10 +110,14 @@ def get_cuda_paths(): ext_modules = tuple( Extension( f"cuda.core.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/{mod}.pyx"], - include_dirs=all_include_dirs, + sources=get_sources(mod), + include_dirs=[ + "cuda/core/_include", + "cuda/core/_cpp", + ] + + list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", - extra_compile_args=extra_compile_args, + **get_extension_kwargs(mod), ) for mod in module_names ) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index a10812606e..6bebb13b4e 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -15,6 +15,15 @@ import importlib +# The _resource_handles module exports a PyCapsule dispatch table that other +# extension modules access via PyCapsule_Import. We import it here to ensure +# it's loaded before other modules try to use it. +# +# We use importlib.import_module with the full path to avoid triggering +# circular import issues that can occur with relative imports during +# package initialization. +_resource_handles = importlib.import_module("cuda.core._resource_handles") + subdir = f"cu{cuda_major}" try: versioned_mod = importlib.import_module(f".{subdir}", __package__) diff --git a/cuda_core/cuda/core/_context.pxd b/cuda_core/cuda/core/_context.pxd new file mode 100644 index 0000000000..dc853fc75d --- /dev/null +++ b/cuda_core/cuda/core/_context.pxd @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core._resource_handles cimport ContextHandle + +cdef class Context: + """Cython declaration for Context class. + + This class provides access to CUDA contexts. Context objects cannot be + instantiated directly - use factory methods or Device/Stream APIs. 
+ """ + + cdef: + ContextHandle _h_context + int _device_id + + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index c1c28b3389..64663aadf3 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -4,35 +4,55 @@ from dataclasses import dataclass -from cuda.core._utils.cuda_utils import driver +from cuda.core._resource_handles cimport ( + ContextHandle, + intptr, + py, +) -@dataclass -class ContextOptions: - pass # TODO +__all__ = ['Context', 'ContextOptions'] cdef class Context: + """CUDA context wrapper. - cdef: - readonly object _handle - int _device_id + Context objects represent CUDA contexts and cannot be instantiated directly. + Use Device or Stream APIs to obtain context objects. + """ def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") - @classmethod - def _from_ctx(cls, handle: driver.CUcontext, int device_id): - cdef Context ctx = Context.__new__(Context) - ctx._handle = handle + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id): + """Create Context from existing ContextHandle (cdef-only factory).""" + cdef Context ctx = cls.__new__(cls) + ctx._h_context = h_context ctx._device_id = device_id return ctx + @property + def handle(self): + """Return the underlying CUcontext handle.""" + if self._h_context.get() == NULL: + return None + return py(self._h_context) + def __eq__(self, other): if not isinstance(other, Context): return NotImplemented cdef Context _other = other - return int(self._handle) == int(_other._handle) + return intptr(self._h_context) == intptr(_other._h_context) def __hash__(self) -> int: - return hash(int(self._handle)) + return hash((type(self), intptr(self._h_context))) + + +@dataclass +class ContextOptions: + """Options for context creation. 
+ + Currently unused, reserved for future use. + """ + pass # TODO diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md new file mode 100644 index 0000000000..003dcfd945 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -0,0 +1,286 @@ +# Resource Handles Design + +This document describes the resource handle abstraction in cuda.core, which provides +robust lifetime management for CUDA resources. + +## Overview + +The cuda-core Python library provides a high-level interface to CUDA resources such as +Context, Device, Stream, and Event. These objects correspond to resources managed by +the CUDA Driver API, each having explicit creation and destruction routines. Several +of these CUDA resources also participate in non-trivial ownership hierarchies (e.g., +a stream belongs to a context), and releasing them may require additional arguments +or other resources (e.g., a device pointer freed through a specific stream). + +### Goals + +The goal of the handle abstraction is to provide a robust, explicit, and Python-agnostic +layer for ownership and lifetime management of CUDA resources. The intent is to use +handles as the backbone of the cuda-core resource hierarchy, enabling cuda-core Python +objects to manipulate handles rather than work directly with raw CUDA resources. + +While Python-facing objects expose convenient APIs and additional behaviors, the handle +layer isolates all concerns related to resource lifetime. 
By cleanly separating these +responsibilities, we achieve: + +- **Clearer architecture** with minimal cross-layer coupling +- **Safe transfer of resource ownership** between Python and other domains, including C++ +- **Ability to preserve resource validity** independent of Python +- **Well-specified semantics** for immutability, ownership, and reachability +- **Simplified reasoning about resource lifetimes**, especially with nested or dependent resources + +### Handle Semantics + +Resource handles provide **referentially transparent** wrappers around CUDA resources: + +- **No rebinding**: A handle always refers to the same resource. +- **No invalidation**: If a handle exists, its resource is valid. +- **Structural dependencies**: If resource A depends on resource B, A's handle + embeds B's handle, automatically extending B's lifetime. + +This eliminates global lifetime analysis. Correctness is enforced structurally—if you +have a handle, you have a valid resource. + +## Handle Types + +All handles are `std::shared_ptr` aliases that expose only the raw CUDA resource: + +```cpp +using ContextHandle = std::shared_ptr; +using StreamHandle = std::shared_ptr; +using EventHandle = std::shared_ptr; +using MemoryPoolHandle = std::shared_ptr; +using DevicePtrHandle = std::shared_ptr; +``` + +Internally, handles use **shared pointer aliasing**: the actual managed object is a +"box" containing the resource, its dependencies, and any state needed for destruction. +The public handle points only to the raw resource field, keeping the API minimal. + +### Why shared_ptr? + +- **Automatic reference counting**: Resources are released when the last reference + disappears. +- **Cross-language stability**: Works across Python/C++ boundaries without relying + on Python's garbage collector. +- **Interpreter independence**: Resources remain valid even during Python shutdown. 
+- **Type-erased deleters**: Destruction logic is captured at creation time, supporting + diverse lifetime strategies. + +## Accessing Handle Values + +Handles can be accessed in three ways via overloaded helper functions: + +| Function | Returns | Use Case | Notes | +|----------|---------|----------|-------| +| `native(h)` | Raw CUDA type (e.g., `CUstream`) | Passing to CUDA APIs | An attribute of `cuda.bindings.cydriver` | +| `intptr(h)` | `intptr_t` | Python interop, foreign code | | +| `py(h)` | Python wrapper object | Returning to Python callers | An attribute of `cuda.bindings.driver` | + +These overloads exist because `std::shared_ptr` cannot have additional attributes. +Wrapping handles in Python objects would be superfluous overhead for internal use, +so we provide these helpers instead. + +Example usage from Cython: + +```cython +# Get raw handle for CUDA API calls +cdef CUstream raw_stream = native(h_stream) # cuda.bindings.cydriver.CUstream + +# Get as integer for other use cases +return hash(intptr(h_stream)) + +# Get Python wrapper for returning to user +return py(h_stream) # cuda.bindings.driver.CUstream +``` + +## Code Structure + +### Directory Layout + +``` +cuda/core/ +├── _resource_handles.pyx # Cython module (compiles resource_handles.cpp) +├── _resource_handles.pxd # Cython declarations and dispatch wrappers +└── _cpp/ + ├── resource_handles.hpp # C++ API declarations + ├── resource_handles.cpp # C++ implementation + └── resource_handles_cxx_api.hpp # Capsule struct definition +``` + +### Build Implications + +The `_cpp/` subdirectory contains C++ source files that are compiled into the +`_resource_handles` extension module. Other Cython modules in cuda.core do **not** +link against this code directly—they access it through a capsule mechanism +(explained below). 
+ +## Capsule Architecture + +The implementation uses **two separate capsule mechanisms** for different purposes: + +### Capsule 1: C++ API Table (`_CXX_API`) + +**Problem**: Cython extension modules compile independently. If multiple modules +(`_memory.pyx`, `_ipc.pyx`, etc.) each linked `resource_handles.cpp`, they would +each have their own copies of: + +- Static driver function pointers +- Thread-local error state +- Other static data, including global caches + +**Solution**: Only `_resource_handles.so` links the C++ code. It exports a capsule +containing function pointers: + +```cpp +struct ResourceHandlesCxxApiV1 { + uint32_t abi_version; + uint32_t struct_size; + + // Thread-local error handling + CUresult (*get_last_error)() noexcept; + CUresult (*peek_last_error)() noexcept; + void (*clear_last_error)() noexcept; + + // Handle creation functions + ContextHandle (*get_primary_context)(int device_id) noexcept; + StreamHandle (*create_stream_handle)(...) noexcept; + // ... etc +}; +``` + +Other Cython modules import this capsule at runtime and call through the function +pointers. The `.pxd` file provides inline wrappers that hide this indirection: + +```cython +cdef inline StreamHandle create_stream_handle(...) except * nogil: + return _handles_table.create_stream_handle(...) +``` + +Importing modules are expected to call `_init_handles_table()` prior to calling +any wrapper functions. + +### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`) + +**Problem**: cuda.core cannot directly call CUDA driver functions because: + +1. We don't want to link against `libcuda.so` at build time. +2. The driver symbols must be resolved dynamically through cuda-bindings. 
+ +**Solution**: `_resource_handles.pyx` creates a capsule containing CUDA driver +function pointers obtained from cuda-bindings: + +```cpp +struct CudaDriverApiV1 { + uint32_t abi_version; + uint32_t struct_size; + + uintptr_t cuDevicePrimaryCtxRetain; + uintptr_t cuDevicePrimaryCtxRelease; + uintptr_t cuStreamCreateWithPriority; + uintptr_t cuStreamDestroy; + // ... etc +}; +``` + +The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the +function pointers for subsequent use. + +### Why Two Capsules? + +| Capsule | Direction | Purpose | +|---------|-----------|---------| +| `_CXX_API` | C++ → Cython | Share handle functions across modules | +| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols | + +## Key Implementation Details + +### Structural Dependencies + +When a resource depends on another, its handle embeds the dependency: + +```cpp +struct StreamBox { + CUstream resource; + ContextHandle h_context; // Keeps context alive +}; +``` + +The shared pointer's custom deleter captures any additional state needed for +destruction. This ensures resources are always destroyed in the correct order. + +### GIL Management + +Handle destructors may run from any thread. The implementation includes RAII guards +(`GILReleaseGuard`, `GILAcquireGuard`) that: + +- Release the GIL before calling CUDA APIs (for parallelism) +- Handle Python finalization gracefully (avoid GIL operations during shutdown) +- Ensure Python object manipulation happens with GIL held + +The handle API functions are safe to call with or without the GIL held. They +will release the GIL (if necessary) before calling CUDA driver API functions. + +### Error Handling + +Handle API functions do not raise Python exceptions. Instead, they return an empty +handle (null `shared_ptr`) on failure and store the error code in thread-local state. 
+Callers should check for failure and retrieve the error using `get_last_error()`: + +```cython +cdef StreamHandle h = create_stream_handle(h_ctx, flags, priority) +if not h: + # Handle creation failed - get the CUDA error code + cdef CUresult err = get_last_error() + # ... handle error (e.g., raise Python exception) +``` + +This design allows handle functions to be called from `nogil` blocks without requiring +GIL acquisition for exception handling on the success path. The error state is +thread-local, so concurrent calls from different threads do not interfere. + +Related functions: +- `get_last_error()`: Returns and clears the most recent error +- `peek_last_error()`: Returns the error without clearing it +- `clear_last_error()`: Clears the error state + +## Usage from Cython + +```cython +from cuda.core._resource_handles cimport ( + StreamHandle, + create_stream_handle, + native, + intptr, + get_last_error, + _init_handles_table, +) + +_init_handles_table() # prerequisite before calling handle API functions + +# Create a stream +cdef StreamHandle h_stream = create_stream_handle(h_ctx, flags, priority) +if not h_stream: + HANDLE_RETURN(get_last_error()) + +# Use in CUDA API +cuStreamSynchronize(native(h_stream)) + +# Return to Python +return py(h_stream) +``` + +## Summary + +The resource handle design: + +1. **Separates resource management** into its own layer, independent of Python objects. +2. **Encodes lifetimes structurally** via embedded handle dependencies. +3. **Uses capsules** to solve two distinct problems: + - Sharing C++ code across Cython modules without duplicate statics. + - Resolving CUDA driver symbols dynamically through cuda-bindings. +4. **Provides overloaded accessors** (`native`, `intptr`, `py`) since handles cannot + have attributes without unnecessary Python object wrappers. 
+ +This architecture ensures CUDA resources are managed correctly regardless of Python +garbage collection timing, interpreter shutdown, or cross-language usage patterns. diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp new file mode 100644 index 0000000000..5ffc84145c --- /dev/null +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -0,0 +1,877 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "resource_handles.hpp" +#include "resource_handles_cxx_api.hpp" +#include +#include +#include +#include +#include +#include + +namespace cuda_core { + +// ============================================================================ +// CUDA driver lazy resolution via cuda-bindings (CPU-only import + MVC) +// ============================================================================ + +namespace { + +static std::once_flag driver_load_once; +static bool driver_loaded = false; + +#if PY_VERSION_HEX < 0x030D0000 +extern "C" int _Py_IsFinalizing(void); +#endif + +static inline bool py_is_finalizing() noexcept { +#if PY_VERSION_HEX >= 0x030D0000 + return Py_IsFinalizing(); +#else + // Python < 3.13 does not expose Py_IsFinalizing() publicly. Use the private + // API that exists in those versions. + return _Py_IsFinalizing() != 0; +#endif +} + +// ============================================================================ +// GIL management helpers +// ============================================================================ + +// Helper to release the GIL while calling into the CUDA driver. +// This guard is *conditional*: if the caller already dropped the GIL, +// we avoid calling PyEval_SaveThread (which requires holding the GIL). +// It also handles the case where Python is finalizing and GIL operations +// are no longer safe. 
+class GILReleaseGuard { +public: + GILReleaseGuard() : tstate_(nullptr), released_(false) { + // Don't try to manipulate GIL if Python is finalizing + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + // PyGILState_Check() returns 1 if the GIL is held by this thread. + if (PyGILState_Check()) { + tstate_ = PyEval_SaveThread(); + released_ = true; + } + } + + ~GILReleaseGuard() { + if (released_) { + PyEval_RestoreThread(tstate_); + } + } + + // Non-copyable, non-movable + GILReleaseGuard(const GILReleaseGuard&) = delete; + GILReleaseGuard& operator=(const GILReleaseGuard&) = delete; + +private: + PyThreadState* tstate_; + bool released_; +}; + +// Helper to acquire the GIL when we might not hold it. +// Use in C++ destructors that need to manipulate Python objects. +class GILAcquireGuard { +public: + GILAcquireGuard() : acquired_(false) { + // Don't try to acquire GIL if Python is finalizing + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + gstate_ = PyGILState_Ensure(); + acquired_ = true; + } + + ~GILAcquireGuard() { + if (acquired_) { + PyGILState_Release(gstate_); + } + } + + bool acquired() const { return acquired_; } + + // Non-copyable, non-movable + GILAcquireGuard(const GILAcquireGuard&) = delete; + GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; + +private: + PyGILState_STATE gstate_; + bool acquired_; +}; + + +#define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr + +DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); +DECLARE_DRIVER_FN(cuDevicePrimaryCtxRelease); +DECLARE_DRIVER_FN(cuCtxGetCurrent); + +DECLARE_DRIVER_FN(cuStreamCreateWithPriority); +DECLARE_DRIVER_FN(cuStreamDestroy); + +DECLARE_DRIVER_FN(cuEventCreate); +DECLARE_DRIVER_FN(cuEventDestroy); +DECLARE_DRIVER_FN(cuIpcOpenEventHandle); + +DECLARE_DRIVER_FN(cuDeviceGetCount); + +DECLARE_DRIVER_FN(cuMemPoolSetAccess); +DECLARE_DRIVER_FN(cuMemPoolDestroy); +DECLARE_DRIVER_FN(cuMemPoolCreate); 
+DECLARE_DRIVER_FN(cuDeviceGetMemPool); +DECLARE_DRIVER_FN(cuMemPoolImportFromShareableHandle); + +DECLARE_DRIVER_FN(cuMemAllocFromPoolAsync); +DECLARE_DRIVER_FN(cuMemAllocAsync); +DECLARE_DRIVER_FN(cuMemAlloc); +DECLARE_DRIVER_FN(cuMemAllocHost); + +DECLARE_DRIVER_FN(cuMemFreeAsync); +DECLARE_DRIVER_FN(cuMemFree); +DECLARE_DRIVER_FN(cuMemFreeHost); + +DECLARE_DRIVER_FN(cuMemPoolImportPointer); + +#undef DECLARE_DRIVER_FN + +static bool load_driver_api() noexcept { + struct CudaDriverApiV1 { + std::uint32_t abi_version; + std::uint32_t struct_size; + + std::uintptr_t cuDevicePrimaryCtxRetain; + std::uintptr_t cuDevicePrimaryCtxRelease; + std::uintptr_t cuCtxGetCurrent; + + std::uintptr_t cuStreamCreateWithPriority; + std::uintptr_t cuStreamDestroy; + + std::uintptr_t cuEventCreate; + std::uintptr_t cuEventDestroy; + std::uintptr_t cuIpcOpenEventHandle; + + std::uintptr_t cuDeviceGetCount; + + std::uintptr_t cuMemPoolSetAccess; + std::uintptr_t cuMemPoolDestroy; + std::uintptr_t cuMemPoolCreate; + std::uintptr_t cuDeviceGetMemPool; + std::uintptr_t cuMemPoolImportFromShareableHandle; + + std::uintptr_t cuMemAllocFromPoolAsync; + std::uintptr_t cuMemAllocAsync; + std::uintptr_t cuMemAlloc; + std::uintptr_t cuMemAllocHost; + + std::uintptr_t cuMemFreeAsync; + std::uintptr_t cuMemFree; + std::uintptr_t cuMemFreeHost; + + std::uintptr_t cuMemPoolImportPointer; + }; + + static constexpr const char* capsule_name = + "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; + + GILAcquireGuard gil; + if (!gil.acquired()) { + return false; + } + + // `_resource_handles` is already loaded (it exports the handle API capsule), + // so avoid import machinery and just grab the module object. 
+ PyObject* mod = PyImport_AddModule("cuda.core._resource_handles"); // borrowed + if (!mod) { + PyErr_Clear(); + return false; + } + + PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule"); // new ref + if (!fn) { + PyErr_Clear(); + return false; + } + + PyObject* cap = PyObject_CallFunctionObjArgs(fn, nullptr); + Py_DECREF(fn); + if (!cap) { + PyErr_Clear(); + return false; + } + + const auto* api = static_cast(PyCapsule_GetPointer(cap, capsule_name)); + Py_DECREF(cap); + + if (!api) { + PyErr_Clear(); + return false; + } + if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) { + return false; + } + +#define LOAD_ADDR(name) \ + do { \ + if (api->name == 0) { \ + return false; \ + } \ + p_##name = reinterpret_cast(api->name); \ + } while (0) + + LOAD_ADDR(cuDevicePrimaryCtxRetain); + LOAD_ADDR(cuDevicePrimaryCtxRelease); + LOAD_ADDR(cuCtxGetCurrent); + + LOAD_ADDR(cuStreamCreateWithPriority); + LOAD_ADDR(cuStreamDestroy); + + LOAD_ADDR(cuEventCreate); + LOAD_ADDR(cuEventDestroy); + LOAD_ADDR(cuIpcOpenEventHandle); + + LOAD_ADDR(cuDeviceGetCount); + + LOAD_ADDR(cuMemPoolSetAccess); + LOAD_ADDR(cuMemPoolDestroy); + LOAD_ADDR(cuMemPoolCreate); + LOAD_ADDR(cuDeviceGetMemPool); + LOAD_ADDR(cuMemPoolImportFromShareableHandle); + + LOAD_ADDR(cuMemAllocFromPoolAsync); + LOAD_ADDR(cuMemAllocAsync); + LOAD_ADDR(cuMemAlloc); + LOAD_ADDR(cuMemAllocHost); + + LOAD_ADDR(cuMemFreeAsync); + LOAD_ADDR(cuMemFree); + LOAD_ADDR(cuMemFreeHost); + + LOAD_ADDR(cuMemPoolImportPointer); + +#undef LOAD_ADDR + + return true; +} + +static bool ensure_driver_loaded() noexcept { + std::call_once(driver_load_once, []() { driver_loaded = load_driver_api(); }); + return driver_loaded; +} + +} // namespace + +// ============================================================================ +// Thread-local error handling +// ============================================================================ + +// Thread-local status of the most recent CUDA API 
call in this module. +thread_local CUresult err = CUDA_SUCCESS; + +CUresult get_last_error() noexcept { + CUresult e = err; + err = CUDA_SUCCESS; + return e; +} + +CUresult peek_last_error() noexcept { + return err; +} + +void clear_last_error() noexcept { + err = CUDA_SUCCESS; +} + +// ============================================================================ +// Context Handles +// ============================================================================ + +struct ContextBox { + CUcontext resource; +}; + +ContextHandle create_context_handle_ref(CUcontext ctx) { + auto box = std::make_shared(ContextBox{ctx}); + return ContextHandle(box, &box->resource); +} + +// Thread-local cache of primary contexts indexed by device ID +thread_local std::vector primary_context_cache; + +ContextHandle get_primary_context(int device_id) noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + // Check thread-local cache + if (static_cast(device_id) < primary_context_cache.size()) { + if (auto cached = primary_context_cache[device_id]) { + return cached; + } + } + + // Cache miss - acquire primary context from driver + GILReleaseGuard gil; + CUcontext ctx; + if (CUDA_SUCCESS != (err = p_cuDevicePrimaryCtxRetain(&ctx, device_id))) { + return {}; + } + + auto box = std::shared_ptr( + new ContextBox{ctx}, + [device_id](const ContextBox* b) { + GILReleaseGuard gil; + p_cuDevicePrimaryCtxRelease(device_id); + delete b; + } + ); + auto h = ContextHandle(box, &box->resource); + + // Update cache + if (static_cast(device_id) >= primary_context_cache.size()) { + primary_context_cache.resize(device_id + 1); + } + primary_context_cache[device_id] = h; + return h; +} + +ContextHandle get_current_context() noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUcontext ctx = nullptr; + if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) { + return {}; + } + if (!ctx) { + return 
{}; // No current context (not an error) + } + return create_context_handle_ref(ctx); +} + +// ============================================================================ +// Stream Handles +// ============================================================================ + +struct StreamBox { + CUstream resource; +}; + +StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUstream stream; + if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) { + return {}; + } + + auto box = std::shared_ptr( + new StreamBox{stream}, + [h_ctx](const StreamBox* b) { + GILReleaseGuard gil; + p_cuStreamDestroy(b->resource); + delete b; + } + ); + return StreamHandle(box, &box->resource); +} + +StreamHandle create_stream_handle_ref(CUstream stream) { + auto box = std::make_shared(StreamBox{stream}); + return StreamHandle(box, &box->resource); +} + +StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { + Py_XINCREF(owner); + auto box = std::shared_ptr( + new StreamBox{stream}, + [owner](const StreamBox* b) { + GILAcquireGuard gil; + if (gil.acquired()) { + Py_XDECREF(owner); + } + delete b; + } + ); + return StreamHandle(box, &box->resource); +} + +StreamHandle get_legacy_stream() noexcept { + static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY); + return handle; +} + +StreamHandle get_per_thread_stream() noexcept { + static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD); + return handle; +} + +// ============================================================================ +// Event Handles +// ============================================================================ + +struct EventBox { + CUevent resource; +}; + +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { + if (!ensure_driver_loaded()) { + err = 
CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUevent event; + if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { + return {}; + } + + auto box = std::shared_ptr( + new EventBox{event}, + [h_ctx](const EventBox* b) { + GILReleaseGuard gil; + p_cuEventDestroy(b->resource); + delete b; + } + ); + return EventHandle(box, &box->resource); +} + +EventHandle create_event_handle(unsigned int flags) { + return create_event_handle(ContextHandle{}, flags); +} + +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUevent event; + if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { + return {}; + } + + auto box = std::shared_ptr( + new EventBox{event}, + [](const EventBox* b) { + GILReleaseGuard gil; + p_cuEventDestroy(b->resource); + delete b; + } + ); + return EventHandle(box, &box->resource); +} + +// ============================================================================ +// Memory Pool Handles +// ============================================================================ + +struct MemoryPoolBox { + CUmemoryPool resource; +}; + +// Helper to clear peer access before destroying a memory pool. +// Works around nvbug 5698116: recycled pool handles inherit peer access state. 
+static void clear_mempool_peer_access(CUmemoryPool pool) { + int device_count = 0; + if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { + return; + } + + std::vector clear_access(device_count); + for (int i = 0; i < device_count; ++i) { + clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + clear_access[i].location.id = i; + clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; + } + p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort +} + +static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { + auto box = std::shared_ptr( + new MemoryPoolBox{pool}, + [](const MemoryPoolBox* b) { + GILReleaseGuard gil; + clear_mempool_peer_access(b->resource); + p_cuMemPoolDestroy(b->resource); + delete b; + } + ); + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUmemoryPool pool; + if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) { + return {}; + } + return wrap_mempool_owned(pool); +} + +MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { + auto box = std::make_shared(MemoryPoolBox{pool}); + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle get_device_mempool(int device_id) noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUmemoryPool pool; + if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) { + return {}; + } + return create_mempool_handle_ref(pool); +} + +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUmemoryPool pool; + auto handle_ptr = reinterpret_cast(static_cast(fd)); + if (CUDA_SUCCESS != (err = 
p_cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) { + return {}; + } + return wrap_mempool_owned(pool); +} + +// ============================================================================ +// Device Pointer Handles +// ============================================================================ + +struct DevicePtrBox { + CUdeviceptr resource; + mutable StreamHandle h_stream; +}; + +static DevicePtrBox* get_box(const DevicePtrHandle& h) { + const CUdeviceptr* p = h.get(); + return reinterpret_cast( + reinterpret_cast(const_cast(p)) - offsetof(DevicePtrBox, resource) + ); +} + +StreamHandle deallocation_stream(const DevicePtrHandle& h) { + return get_box(h)->h_stream; +} + +void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { + get_box(h)->h_stream = std::move(h_stream); +} + +DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, native(h_stream)))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle 
deviceptr_alloc(size_t size) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFree(b->resource); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_host(size_t size) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + void* ptr; + if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{reinterpret_cast(ptr), StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeHost(reinterpret_cast(b->resource)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) { + auto box = std::make_shared(DevicePtrBox{ptr, StreamHandle{}}); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) { + if (!owner) { + return deviceptr_create_ref(ptr); + } + Py_INCREF(owner); + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [owner](DevicePtrBox* b) { + GILAcquireGuard gil; + if (gil.acquired()) { + Py_DECREF(owner); + } + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +// ============================================================================ +// IPC Pointer Cache +// ============================================================================ +// This cache handles duplicate IPC imports, which behave differently depending +// on the memory type: +// +// 1. Memory pool allocations (DeviceMemoryResource): +// Multiple imports of the same allocation succeed and return duplicate +// pointers. 
However, the driver has a reference counting bug (nvbug 5570902) +// where the first cuMemFreeAsync incorrectly unmaps the memory even when +// imported multiple times. A driver fix is expected. +// +// 2. Pinned memory allocations (PinnedMemoryResource): +// Duplicate imports result in CUDA_ERROR_ALREADY_MAPPED. +// +// The cache solves both issues by checking the cache before calling +// cuMemPoolImportPointer and returning the existing handle for duplicate +// imports. This provides a consistent user experience where the same IPC +// descriptor can be imported multiple times regardless of memory type. +// +// The cache key is the export_data bytes (CUmemPoolPtrExportData), not the +// returned pointer, because we must check before calling the driver API. + +// TODO: When driver fix for nvbug 5570902 is available, consider whether +// the cache is still needed for memory pool allocations (it will still be +// needed for pinned memory). +static bool use_ipc_ptr_cache() { + return true; +} + +// Wrapper for CUmemPoolPtrExportData to use as map key +struct ExportDataKey { + CUmemPoolPtrExportData data; + + bool operator==(const ExportDataKey& other) const { + return std::memcmp(&data, &other.data, sizeof(data)) == 0; + } +}; + +struct ExportDataKeyHash { + std::size_t operator()(const ExportDataKey& key) const { + // Simple hash of the bytes + std::size_t h = 0; + const auto* bytes = reinterpret_cast(&key.data); + for (std::size_t i = 0; i < sizeof(key.data); ++i) { + h = h * 31 + bytes[i]; + } + return h; + } +}; + +static std::mutex ipc_ptr_cache_mutex; +static std::unordered_map, ExportDataKeyHash> ipc_ptr_cache; + +DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + + auto data = const_cast( + reinterpret_cast(export_data)); + + if (use_ipc_ptr_cache()) { + // Check cache before calling cuMemPoolImportPointer + 
ExportDataKey key; + std::memcpy(&key.data, data, sizeof(key.data)); + + std::lock_guard lock(ipc_ptr_cache_mutex); + + auto it = ipc_ptr_cache.find(key); + if (it != ipc_ptr_cache.end()) { + if (auto box = it->second.lock()) { + // Cache hit - return existing handle + return DevicePtrHandle(box, &box->resource); + } + ipc_ptr_cache.erase(it); // Expired entry + } + + // Cache miss - import the pointer + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) { + return {}; + } + + // Create new handle with cache-clearing deleter + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool, key](DevicePtrBox* b) { + GILReleaseGuard gil; + { + std::lock_guard lock(ipc_ptr_cache_mutex); + // Only erase if expired - avoids race where another thread + // replaced the entry with a new import before we acquired the lock. + auto it = ipc_ptr_cache.find(key); + if (it != ipc_ptr_cache.end() && it->second.expired()) { + ipc_ptr_cache.erase(it); + } + } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + ipc_ptr_cache[key] = box; + return DevicePtrHandle(box, &box->resource); + + } else { + // No caching - simple handle creation + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); + } +} + +// ============================================================================ +// Capsule C++ API table +// ============================================================================ + +const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept { + static const ResourceHandlesCxxApiV1 table = []() { + ResourceHandlesCxxApiV1 t{}; + t.abi_version = 
RESOURCE_HANDLES_CXX_API_VERSION; + t.struct_size = static_cast(sizeof(ResourceHandlesCxxApiV1)); + + // Error handling + t.get_last_error = &get_last_error; + t.peek_last_error = &peek_last_error; + t.clear_last_error = &clear_last_error; + + // Context + t.create_context_handle_ref = &create_context_handle_ref; + t.get_primary_context = &get_primary_context; + t.get_current_context = &get_current_context; + + // Stream + t.create_stream_handle = &create_stream_handle; + t.create_stream_handle_ref = &create_stream_handle_ref; + t.create_stream_handle_with_owner = &create_stream_handle_with_owner; + t.get_legacy_stream = &get_legacy_stream; + t.get_per_thread_stream = &get_per_thread_stream; + + // Event (resolve overloads explicitly) + t.create_event_handle = + static_cast(&create_event_handle); + t.create_event_handle_noctx = + static_cast(&create_event_handle); + t.create_event_handle_ipc = &create_event_handle_ipc; + + // Memory pool + t.create_mempool_handle = &create_mempool_handle; + t.create_mempool_handle_ref = &create_mempool_handle_ref; + t.get_device_mempool = &get_device_mempool; + t.create_mempool_handle_ipc = &create_mempool_handle_ipc; + + // Device pointer + t.deviceptr_alloc_from_pool = &deviceptr_alloc_from_pool; + t.deviceptr_alloc_async = &deviceptr_alloc_async; + t.deviceptr_alloc = &deviceptr_alloc; + t.deviceptr_alloc_host = &deviceptr_alloc_host; + t.deviceptr_create_ref = &deviceptr_create_ref; + t.deviceptr_create_with_owner = &deviceptr_create_with_owner; + t.deviceptr_import_ipc = &deviceptr_import_ipc; + t.deallocation_stream = &deallocation_stream; + t.set_deallocation_stream = &set_deallocation_stream; + + return t; + }(); + return &table; +} + +} // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp new file mode 100644 index 0000000000..7649788fdd --- /dev/null +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -0,0 +1,298 @@ +// SPDX-FileCopyrightText: 
Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +namespace cuda_core { + +// ============================================================================ +// Thread-local error handling +// ============================================================================ + +// Get and clear the last CUDA error (like cudaGetLastError) +CUresult get_last_error() noexcept; + +// Get the last CUDA error without clearing it (like cudaPeekAtLastError) +CUresult peek_last_error() noexcept; + +// Explicitly clear the last error +void clear_last_error() noexcept; + +// ============================================================================ +// Handle type aliases - expose only the raw CUDA resource +// ============================================================================ + +using ContextHandle = std::shared_ptr; +using StreamHandle = std::shared_ptr; +using EventHandle = std::shared_ptr; +using MemoryPoolHandle = std::shared_ptr; + +// ============================================================================ +// Context handle functions +// ============================================================================ + +// Function to create a non-owning context handle (references existing context). 
+ContextHandle create_context_handle_ref(CUcontext ctx); + +// Get handle to the primary context for a device (with thread-local caching) +// Returns empty handle on error (caller must check) +ContextHandle get_primary_context(int device_id) noexcept; + +// Get handle to the current CUDA context +// Returns empty handle if no context is current (caller must check) +ContextHandle get_current_context() noexcept; + +// ============================================================================ +// Stream handle functions +// ============================================================================ + +// Create an owning stream handle by calling cuStreamCreateWithPriority. +// The stream structurally depends on the provided context handle. +// When the last reference is released, cuStreamDestroy is called automatically. +// Returns empty handle on error (caller must check). +StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority); + +// Create a non-owning stream handle (references existing stream). +// Use for borrowed streams (from foreign code) or built-in streams. +// The stream will NOT be destroyed when the handle is released. +// Caller is responsible for keeping the stream's context alive. +StreamHandle create_stream_handle_ref(CUstream stream); + +// Create a non-owning stream handle that prevents a Python owner from being GC'd. +// The owner's refcount is incremented; decremented when handle is released. +// The owner is responsible for keeping the stream's context alive. +StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); + +// Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) +// Note: Legacy stream has no specific context dependency. +StreamHandle get_legacy_stream() noexcept; + +// Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD) +// Note: Per-thread stream has no specific context dependency. 
+StreamHandle get_per_thread_stream() noexcept; + +// ============================================================================ +// Event handle functions +// ============================================================================ + +// Create an owning event handle by calling cuEventCreate. +// The event structurally depends on the provided context handle. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); + +// Create an owning event handle without context dependency. +// Use for temporary events that are created and destroyed in the same scope. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(unsigned int flags); + +// Create an owning event handle from an IPC handle. +// The originating process owns the event and its context. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); + +// ============================================================================ +// Memory pool handle functions +// ============================================================================ + +// Create an owning memory pool handle by calling cuMemPoolCreate. +// Memory pools are device-scoped (not context-scoped). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props); + +// Create a non-owning memory pool handle (references existing pool). +// Use for device default/current pools that are managed by the driver. +// The pool will NOT be destroyed when the handle is released. 
+MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool); + +// Get non-owning handle to the current memory pool for a device. +// Returns empty handle on error (caller must check). +MemoryPoolHandle get_device_mempool(int device_id) noexcept; + +// Create an owning memory pool handle from an IPC import. +// The file descriptor is NOT owned by this handle (caller manages FD separately). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type); + +// ============================================================================ +// Device pointer handle functions +// ============================================================================ + +using DevicePtrHandle = std::shared_ptr; + +// Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync. +// The pointer structurally depends on the provided pool handle (captured in deleter). +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream); + +// Allocate device memory asynchronously via cuMemAllocAsync. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream); + +// Allocate device memory synchronously via cuMemAlloc. +// When the last reference is released, cuMemFree is called. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc(size_t size); + +// Allocate pinned host memory via cuMemAllocHost. +// When the last reference is released, cuMemFreeHost is called. +// Returns empty handle on error (caller must check). 
+DevicePtrHandle deviceptr_alloc_host(size_t size); + +// Create a non-owning device pointer handle (references existing pointer). +// Use for foreign pointers (e.g., from external libraries). +// The pointer will NOT be freed when the handle is released. +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); + +// Create a non-owning device pointer handle that prevents a Python owner from being GC'd. +// The owner's refcount is incremented; decremented when handle is released. +// The pointer will NOT be freed when the handle is released. +// If owner is nullptr, equivalent to deviceptr_create_ref. +DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner); + +// Import a device pointer from IPC via cuMemPoolImportPointer. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Note: Does not yet implement reference counting for nvbug 5570902. +// On error, returns empty handle and sets thread-local error (use get_last_error()). +DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream); + +// Access the deallocation stream for a device pointer handle (read-only). +// For non-owning handles, the stream is not used but can still be accessed. +StreamHandle deallocation_stream(const DevicePtrHandle& h); + +// Set the deallocation stream for a device pointer handle. +void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream); + +// ============================================================================ +// Overloaded helper functions to extract raw resources from handles +// ============================================================================ + +// native() - extract the raw CUDA handle +inline CUcontext native(const ContextHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUstream native(const StreamHandle& h) noexcept { + return h ? 
*h : nullptr; +} + +inline CUevent native(const EventHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUmemoryPool native(const MemoryPoolHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUdeviceptr native(const DevicePtrHandle& h) noexcept { + return h ? *h : 0; +} + +// intptr() - extract handle as intptr_t for Python interop +// Using signed intptr_t per C standard convention and issue #1342 +inline std::intptr_t intptr(const ContextHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const StreamHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const EventHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const MemoryPoolHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const DevicePtrHandle& h) noexcept { + return h ? static_cast(*h) : 0; +} + +// py() - convert handle to Python driver wrapper object +// Returns new reference. Caller must hold GIL. +inline PyObject* py(const ContextHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUcontext"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const StreamHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUstream"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? 
reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const EventHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUevent"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const MemoryPoolHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUmemoryPool"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const DevicePtrHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUdeviceptr"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? static_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +} // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp new file mode 100644 index 0000000000..6ff07a6ee0 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "resource_handles.hpp" + +namespace cuda_core { + +// C++ capsule API for cross-extension-module calls. 
+// +// The function-pointer table is exported from the Python extension module +// `cuda.core._resource_handles` as a PyCapsule named: +// +// "cuda.core._resource_handles._CXX_API" +// +// Other extension modules import the capsule and dispatch through the table to +// ensure there is a single owner of all correctness-critical static/thread_local +// state in resource_handles.cpp (caches, last-error state, etc.). + +static constexpr std::uint32_t RESOURCE_HANDLES_CXX_API_VERSION = 1; + +struct ResourceHandlesCxxApiV1 { + std::uint32_t abi_version; + std::uint32_t struct_size; + + // Thread-local error handling + CUresult (*get_last_error)() noexcept; + CUresult (*peek_last_error)() noexcept; + void (*clear_last_error)() noexcept; + + // Context handles + ContextHandle (*create_context_handle_ref)(CUcontext ctx); + ContextHandle (*get_primary_context)(int device_id) noexcept; + ContextHandle (*get_current_context)() noexcept; + + // Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority); + StreamHandle (*create_stream_handle_ref)(CUstream stream); + StreamHandle (*create_stream_handle_with_owner)(CUstream stream, PyObject* owner); + StreamHandle (*get_legacy_stream)() noexcept; + StreamHandle (*get_per_thread_stream)() noexcept; + + // Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags); + EventHandle (*create_event_handle_noctx)(unsigned int flags); + EventHandle (*create_event_handle_ipc)(const CUipcEventHandle& ipc_handle); + + // Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const CUmemPoolProps& props); + MemoryPoolHandle (*create_mempool_handle_ref)(CUmemoryPool pool); + MemoryPoolHandle (*get_device_mempool)(int device_id) noexcept; + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, CUmemAllocationHandleType handle_type); + + // Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + 
StreamHandle h_stream); + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream); + DevicePtrHandle (*deviceptr_alloc)(size_t size); + DevicePtrHandle (*deviceptr_alloc_host)(size_t size); + DevicePtrHandle (*deviceptr_create_ref)(CUdeviceptr ptr); + DevicePtrHandle (*deviceptr_create_with_owner)(CUdeviceptr ptr, PyObject* owner); + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream); + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h); + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream); +}; + +// Return pointer to a process-wide singleton table. +const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; + +} // namespace cuda_core diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 2d775b6580..014b7dae78 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -11,8 +11,19 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core._context import Context, ContextOptions +from cuda.core._context cimport Context +from cuda.core._context import ContextOptions +from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions +from cuda.core._resource_handles cimport ( + ContextHandle, + _init_handles_table, + create_context_handle_ref, + get_primary_context, + native, +) + +_init_handles_table() from cuda.core._graph import GraphBuilder from cuda.core._stream import IsStreamT, Stream, StreamOptions from cuda.core._utils.clear_error_support import assert_type @@ -908,20 +919,6 @@ cdef class DeviceProperties: ) -cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: - try: - primary_ctxs = _tls.primary_ctxs - except AttributeError: - total = len(_tls.devices) - primary_ctxs = _tls.primary_ctxs = [0] * total - cdef cydriver.CUcontext 
ctx = (primary_ctxs[dev_id]) - if ctx == NULL: - with nogil: - HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) - primary_ctxs[dev_id] = (ctx) - return ctx - - class Device: """Represent a GPU and act as an entry point for cuda.core features. @@ -948,7 +945,7 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_id", "_memory_resource", "_has_inited", "_properties", "_uuid") + __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid", "_context") def __new__(cls, device_id: Device | int | None = None): # Handle device_id argument. @@ -973,10 +970,9 @@ class Device: if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) - assert (ctx) == NULL - device_id = 0 # cudart behavior + # No context is current - verify and default to device 0 (cudart behavior) + assert cydriver.cuCtxGetCurrent(&ctx) == cydriver.CUresult.CUDA_SUCCESS and ctx == NULL + device_id = 0 else: HANDLE_RETURN(err) elif device_id < 0: @@ -990,13 +986,14 @@ class Device: with nogil: HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] - for dev_id in range(total): + for i in range(total): device = super().__new__(cls) - device._id = dev_id + device._device_id = i device._memory_resource = None device._has_inited = False device._properties = None device._uuid = None + device._context = None devices.append(device) try: @@ -1007,22 +1004,9 @@ class Device: def _check_context_initialized(self): if not self._has_inited: raise CUDAError( - f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" + f"Device {self._device_id} is not yet initialized, perhaps you forgot to call .set_current() first?" 
) - def _get_current_context(self, bint check_consistency=False) -> driver.CUcontext: - cdef cydriver.CUcontext ctx - cdef cydriver.CUdevice dev - cdef cydriver.CUdevice this_dev = self._id - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) - if ctx == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") - if check_consistency: - HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if dev != this_dev: - raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return driver.CUcontext(ctx) @classmethod def get_all_devices(cls): @@ -1041,12 +1025,12 @@ class Device: @property def device_id(self) -> int: """Return device ordinal.""" - return self._id + return self._device_id @property def pci_bus_id(self) -> str: """Return a PCI Bus Id string for this device.""" - bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._id)) + bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._device_id)) return bus_id[:12].decode() def can_access_peer(self, peer: Device | int) -> bool: @@ -1092,7 +1076,7 @@ class Device: cdef str uuid_hex if self._uuid is None: - dev = self._id + dev = self._device_id with nogil: IF CUDA_CORE_BUILD_MAJOR == 12: HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, dev)) @@ -1111,7 +1095,7 @@ class Device: cdef int LENGTH = 256 cdef bytes name = bytes(LENGTH) cdef char* name_ptr = name - cdef cydriver.CUdevice this_dev = self._id + cdef cydriver.CUdevice this_dev = self._device_id with nogil: HANDLE_RETURN(cydriver.cuDeviceGetName(name_ptr, LENGTH, this_dev)) name = name.split(b"\0")[0] @@ -1121,7 +1105,7 @@ class Device: def properties(self) -> DeviceProperties: """Return a :obj:`~_device.DeviceProperties` class with information about the device.""" if self._properties is None: - self._properties = DeviceProperties._init(self._id) + self._properties = DeviceProperties._init(self._device_id) return self._properties @@ -1142,7 +1126,7 @@ class Device: @property def context(self) -> 
Context: - """Return the current :obj:`~_context.Context` associated with this device. + """Return the :obj:`~_context.Context` associated with this device. Note ---- @@ -1150,8 +1134,7 @@ class Device: """ self._check_context_initialized() - ctx = self._get_current_context(check_consistency=True) - return Context._from_ctx(ctx, self._id) + return self._context @property def memory_resource(self) -> MemoryResource: @@ -1160,7 +1143,7 @@ class Device: if self._memory_resource is None: # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - device_id = self._id + device_id = self._device_id with nogil: HANDLE_RETURN( cydriver.cuDeviceGetAttribute( @@ -1169,10 +1152,10 @@ class Device: ) if attr == 1: from cuda.core._memory import DeviceMemoryResource - self._memory_resource = DeviceMemoryResource(self._id) + self._memory_resource = DeviceMemoryResource(self._device_id) else: from cuda.core._memory import _SynchronousMemoryResource - self._memory_resource = _SynchronousMemoryResource(self._id) + self._memory_resource = _SynchronousMemoryResource(self._device_id) return self._memory_resource @@ -1197,10 +1180,10 @@ class Device: def __int__(self): """Return device_id.""" - return self._id + return self._device_id def __repr__(self): - return f"" + return f"" def __hash__(self) -> int: return hash(self.uuid) @@ -1208,7 +1191,7 @@ class Device: def __eq__(self, other) -> bool: if not isinstance(other, Device): return NotImplemented - return self._id == other._id + return self._device_id == other._device_id def __reduce__(self): return Device, (self.device_id,) @@ -1243,30 +1226,36 @@ class Device: >>> # ... do work on device 0 ... 
""" - cdef cydriver.CUcontext prev_ctx - cdef cydriver.CUcontext curr_ctx + cdef ContextHandle h_context + cdef cydriver.CUcontext prev_ctx, curr_ctx + if ctx is not None: # TODO: revisit once Context is cythonized assert_type(ctx, Context) - if ctx._id != self._id: + if ctx._device_id != self._device_id: raise RuntimeError( "the provided context was created on the device with" - f" id={ctx._id}, which is different from the target id={self._id}" + f" id={ctx._device_id}, which is different from the target id={self._device_id}" ) # prev_ctx is the previous context - curr_ctx = (ctx._handle) + curr_ctx = native(ctx._h_context) + prev_ctx = NULL with nogil: HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx)) HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) self._has_inited = True + self._context = ctx # Store owning context reference if prev_ctx != NULL: - return Context._from_ctx((prev_ctx), self._id) + return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) else: # use primary ctx - curr_ctx = _get_primary_context(self._id) + h_context = get_primary_context(self._device_id) + if h_context.get() == NULL: + raise ValueError("Cannot set NULL context as current") with nogil: - HANDLE_RETURN(cydriver.cuCtxSetCurrent(curr_ctx)) + HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context))) self._has_inited = True + self._context = Context._from_handle(Context, h_context, self._device_id) # Store owning context def create_context(self, options: ContextOptions = None) -> Context: """Create a new :obj:`~_context.Context` object. @@ -1317,7 +1306,7 @@ class Device: """ self._check_context_initialized() - return Stream._init(obj=obj, options=options, device_id=self._id) + return Stream._init(obj=obj, options=options, device_id=self._device_id, ctx=self._context) def create_event(self, options: EventOptions | None = None) -> Event: """Create an Event object without recording it to a Stream. 
@@ -1338,8 +1327,8 @@ class Device: """ self._check_context_initialized() - ctx = self._get_current_context() - return Event._init(self._id, ctx, options, True) + cdef Context ctx = self._context + return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True) def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate device memory from a specified stream. diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index 1f586f18df..f52c505079 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -3,17 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ContextHandle, EventHandle cdef class Event: cdef: - cydriver.CUevent _handle + EventHandle _h_event + ContextHandle _h_context # Cached for fast access bint _timing_disabled bint _busy_waited bint _ipc_enabled object _ipc_descriptor int _device_id - object _ctx_handle + + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) cpdef close(self) diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index e97fdfbab4..1dec487665 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -5,9 +5,21 @@ from __future__ import annotations cimport cpython -from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver +from cuda.core._context cimport Context +from cuda.core._resource_handles cimport ( + ContextHandle, + EventHandle, + _init_handles_table, + create_event_handle, + create_event_handle_ipc, + intptr, + native, + py, +) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN @@ -18,11 +30,9 @@ from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional -from cuda.core._context import Context from 
cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, - driver, ) if TYPE_CHECKING: import cuda.bindings @@ -81,15 +91,13 @@ cdef class Event: and they should instead be created through a :obj:`~_stream.Stream` object. """ - def __cinit__(self): - self._handle = (NULL) def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).") - @classmethod - def _init(cls, device_id: int, ctx_handle: Context, options=None, is_free=False): - cdef Event self = Event.__new__(cls) + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free): + cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 self._timing_disabled = False @@ -111,23 +119,24 @@ cdef class Event: self._ipc_enabled = True if not self._timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") - with nogil: - HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) + # C++ creates the event and returns owning handle with context dependency + cdef EventHandle h_event = create_event_handle(h_context, flags) + if not h_event: + raise RuntimeError("Failed to create CUDA event") + self._h_event = h_event + self._h_context = h_context self._device_id = device_id - self._ctx_handle = ctx_handle if opts.ipc_enabled: self.get_ipc_descriptor() return self cpdef close(self): - """Destroy the event.""" - if self._handle != NULL: - with nogil: - HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) - self._handle = (NULL) + """Destroy the event. - def __dealloc__(self): - self.close() + Releases the event handle. The underlying CUDA event is destroyed + when the last reference is released. 
+ """ + self._h_event.reset() def __isub__(self, other): return NotImplemented @@ -139,7 +148,7 @@ cdef class Event: # return self - other (in milliseconds) cdef float timing with nogil: - err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + err = cydriver.cuEventElapsedTime(&timing, native((other)._h_event), native(self._h_event)) if err == 0: return timing else: @@ -165,14 +174,14 @@ cdef class Event: raise RuntimeError(explanation) def __hash__(self) -> int: - return hash((self._ctx_handle, (self._handle))) + return hash((type(self), intptr(self._h_context), intptr(self._h_event))) def __eq__(self, other) -> bool: # Note: using isinstance because `Event` can be subclassed. if not isinstance(other, Event): return NotImplemented cdef Event _other = other - return (self._handle) == (_other._handle) + return intptr(self._h_event) == intptr(_other._h_event) def get_ipc_descriptor(self) -> IPCEventDescriptor: """Export an event allocated for sharing between processes.""" @@ -182,7 +191,7 @@ cdef class Event: raise RuntimeError("Event is not IPC-enabled") cdef cydriver.CUipcEventHandle data with nogil: - HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, (self._handle))) + HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, native(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) self._ipc_descriptor = IPCEventDescriptor._init(data_b, self._busy_waited) return self._ipc_descriptor @@ -193,14 +202,17 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - with nogil: - HANDLE_RETURN(cydriver.cuIpcOpenEventHandle(&self._handle, data)) + # IPC events: the originating process owns the event and its context + cdef EventHandle h_event = create_event_handle_ipc(data) + if not h_event: + raise RuntimeError("Failed to open IPC event handle") + self._h_event = h_event + self._h_context = 
ContextHandle() self._timing_disabled = True self._busy_waited = ipc_descriptor._busy_waited self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor - self._device_id = -1 # ?? - self._ctx_handle = None # ?? + self._device_id = -1 return self @property @@ -229,13 +241,13 @@ cdef class Event: """ with nogil: - HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuEventSynchronize(native(self._h_event))) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" with nogil: - result = cydriver.cuEventQuery(self._handle) + result = cydriver.cuEventQuery(native(self._h_event)) if result == cydriver.CUresult.CUDA_SUCCESS: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: @@ -251,7 +263,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. """ - return driver.CUevent((self._handle)) + return py(self._h_event) @property def device(self) -> Device: @@ -271,8 +283,8 @@ cdef class Event: @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this event.""" - if self._ctx_handle is not None and self._device_id >= 0: - return Context._from_ctx(self._ctx_handle, self._device_id) + if self._h_context and self._device_id >= 0: + return Context._from_handle(Context, self._h_context, self._device_id) cdef class IPCEventDescriptor: diff --git a/cuda_core/cuda/core/_graph.py b/cuda_core/cuda/core/_graph.py index df51126bb0..b6e266a9a8 100644 --- a/cuda_core/cuda/core/_graph.py +++ b/cuda_core/cuda/core/_graph.py @@ -453,7 +453,7 @@ def __cuda_stream__(self) -> tuple[int, int]: return self.stream.__cuda_stream__() def _get_conditional_context(self) -> driver.CUcontext: - return self._mnff.stream.context._handle + return self._mnff.stream.context.handle def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditionalHandle: """Creates a 
conditional handle for the graph builder. diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index 94dc5d02b4..61e867744e 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver from cuda.core._launch_config cimport LaunchConfig from cuda.core._kernel_arg_handler cimport ParamHolder +from cuda.core._resource_handles cimport native from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport ( check_or_create_options, @@ -87,7 +88,7 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern # rich. if _use_ex: drv_cfg = conf._to_native_launch_config() - drv_cfg.hStream = s._handle + drv_cfg.hStream = native(s._h_stream) if conf.cooperative_launch: _check_cooperative_launch(kernel, conf, s) with nogil: @@ -99,7 +100,7 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern func_handle, conf.grid[0], conf.grid[1], conf.grid[2], conf.block[0], conf.block[1], conf.block[2], - conf.shmem_size, s._handle, args_ptr, NULL + conf.shmem_size, native(s._h_stream), args_ptr, NULL ) ) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 730e448f63..4238bd8d82 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.core._resource_handles cimport DevicePtrHandle from cuda.core._stream cimport Stream @@ -15,16 +16,23 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - uintptr_t _ptr - size_t _size - MemoryResource _memory_resource - object _ipc_data - object _owner - object _ptr_obj - Stream _alloc_stream - _MemAttrs _mem_attrs - bint _mem_attrs_inited + DevicePtrHandle _h_ptr + size_t _size + MemoryResource _memory_resource + object _ipc_data + object _owner + _MemAttrs _mem_attrs + bint _mem_attrs_inited cdef class MemoryResource: 
pass + + +# Helper function to create a Buffer from a DevicePtrHandle +cdef Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = * +) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index b92c9d51ce..32fe28bab4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -13,6 +13,17 @@ from cuda.core._memory._device_memory_resource import DeviceMemoryResource from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource from cuda.core._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer from cuda.core._memory cimport _ipc +from cuda.core._resource_handles cimport ( + DevicePtrHandle, + StreamHandle, + _init_handles_table, + deviceptr_create_with_owner, + intptr, + native, + set_deallocation_stream, +) + +_init_handles_table() from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -50,12 +61,10 @@ cdef class Buffer: self._clear() def _clear(self): - self._ptr = 0 + self._h_ptr.reset() # Release the handle self._size = 0 self._memory_resource = None self._ipc_data = None - self._ptr_obj = None - self._alloc_stream = None self._owner = None self._mem_attrs_inited = False @@ -69,20 +78,23 @@ cdef class Buffer: stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, owner : object | None = None ): - cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) - self._ptr_obj = ptr - self._size = size + """Legacy init for compatibility - creates a non-owning ref handle. + + Note: The stream parameter is accepted for API compatibility but is + ignored since non-owning refs are never freed by the handle. 
+ """ if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") + cdef Buffer self = Buffer.__new__(cls) + self._h_ptr = deviceptr_create_with_owner((int(ptr)), owner) + self._size = size self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None - self._alloc_stream = (stream) if stream is not None else None self._owner = owner + self._mem_attrs_inited = False return self - def __dealloc__(self): - self.close(self._alloc_stream) + # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor def __reduce__(self): # Must not serialize the parent's stream! @@ -107,8 +119,12 @@ cdef class Buffer: An object holding external allocation that the ``ptr`` points to. The reference is kept as long as the buffer is alive. The ``owner`` and ``mr`` cannot be specified together. + + Note + ---- + This creates a non-owning reference. The pointer will NOT be freed + when the Buffer is closed or garbage collected. """ - # TODO: It is better to take a stream for latter deallocation return Buffer._init(ptr, size, mr=mr, owner=owner) @classmethod @@ -135,7 +151,7 @@ cdef class Buffer: ---------- stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream object to use for asynchronous deallocation. If None, - the behavior depends on the underlying memory resource. + the deallocation stream stored in the handle is used. 
""" Buffer_close(self, stream) @@ -155,29 +171,23 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) - cdef Stream s_stream = stream + cdef Stream s = Stream_accept(stream) cdef size_t src_size = self._size if dst is None: if self._memory_resource is None: raise ValueError("a destination buffer must be provided (this " "buffer does not have a memory_resource)") - dst = self._memory_resource.allocate(src_size, stream) + dst = self._memory_resource.allocate(src_size, s) cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle with nogil: HANDLE_RETURN(cydriver.cuMemcpyAsync( - dst._ptr, - self._ptr, - src_size, - s - )) + native(dst._h_ptr), native(self._h_ptr), src_size, native(s._h_stream))) return dst def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): @@ -192,8 +202,7 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) - cdef Stream s_stream = stream + cdef Stream s = Stream_accept(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -201,14 +210,9 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle with nogil: HANDLE_RETURN(cydriver.cuMemcpyAsync( - self._ptr, - src._ptr, - dst_size, - s - )) + native(self._h_ptr), native(src._h_ptr), dst_size, native(s._h_stream))) def fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder): """Fill this buffer with a repeating byte pattern. @@ -236,12 +240,12 @@ cdef class Buffer: # Handle int case: 1-byte fill with automatic overflow checking. 
if isinstance(value, int): - Buffer_fill_uint8(self, value, s_stream._handle) + Buffer_fill_uint8(self, value, s_stream._h_stream) return # Handle bytes case: direct pointer access without intermediate objects. if isinstance(value, bytes): - Buffer_fill_from_ptr(self, value, len(value), s_stream._handle) + Buffer_fill_from_ptr(self, value, len(value), s_stream._h_stream) return # General buffer protocol path using C buffer API. @@ -251,7 +255,7 @@ cdef class Buffer: f"value must be an int or support the buffer protocol, got {type(value).__name__}" ) try: - Buffer_fill_from_ptr(self, buf.buf, buf.len, s_stream._handle) + Buffer_fill_from_ptr(self, buf.buf, buf.len, s_stream._h_stream) finally: PyBuffer_Release(&buf) @@ -306,9 +310,8 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.device_id + _init_mem_attrs(self) + return self._mem_attrs.device_id @property def handle(self) -> DevicePointerT: @@ -319,31 +322,25 @@ cdef class Buffer: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Buffer.handle)``. 
""" - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 + # Return raw integer for compatibility with ctypes and other tools + # that expect a raw pointer value + return intptr(self._h_ptr) @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_device_accessible + _init_mem_attrs(self) + return self._mem_attrs.is_device_accessible @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_host_accessible + _init_mem_attrs(self) + return self._mem_attrs.is_host_accessible @property def is_mapped(self) -> bool: @@ -367,85 +364,52 @@ cdef class Buffer: return self._owner -# Buffer Implementation -# --------------------- -cdef inline void Buffer_close(Buffer self, stream): - cdef Stream s - if self._ptr: - if self._memory_resource is not None: - s = Stream_accept(stream) if stream is not None else self._alloc_stream - self._memory_resource.deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._memory_resource = None - self._owner = None - self._ptr_obj = None - self._alloc_stream = None - - -cdef inline int Buffer_fill_uint8(Buffer self, uint8_t value, cydriver.CUstream s) except? -1: - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async(self._ptr, value, self._size, s)) - return 0 - - -cdef inline int Buffer_fill_from_ptr( - Buffer self, const char* ptr, size_t width, cydriver.CUstream s -) except? 
-1: - cdef size_t buffer_size = self._size - - if width == 1: - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async( - self._ptr, (ptr)[0], buffer_size, s)) - elif width == 2: - if buffer_size & 0x1: - raise ValueError(f"buffer size ({buffer_size}) must be divisible by 2") - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD16Async( - self._ptr, (ptr)[0], buffer_size // 2, s)) - elif width == 4: - if buffer_size & 0x3: - raise ValueError(f"buffer size ({buffer_size}) must be divisible by 4") - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD32Async( - self._ptr, (ptr)[0], buffer_size // 4, s)) - else: - raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}") - return 0 - - -cdef Buffer_init_mem_attrs(Buffer self): +# Memory Attribute Query Helpers +# ------------------------------ +cdef inline void _init_mem_attrs(Buffer self): + """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: - query_memory_attrs(self._mem_attrs, self._ptr) + _query_memory_attrs(self._mem_attrs, native(self._h_ptr)) self._mem_attrs_inited = True -cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: +cdef inline int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr +) except -1 nogil: + """Query memory attributes for a device pointer.""" cdef unsigned int memory_type = 0 cdef int is_managed = 0 cdef int device_id = 0 - _query_memory_attrs(memory_type, is_managed, device_id, ptr) + cdef cydriver.CUpointer_attribute attrs[3] + cdef uintptr_t vals[3] + + attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE + attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED + attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + vals[0] = &memory_type + vals[1] = &is_managed + vals[2] = &device_id + + cdef cydriver.CUresult ret + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: + with cython.gil: + # 
Device class handles the cuInit call internally + Device() + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + HANDLE_RETURN(ret) if memory_type == 0: # unregistered host pointer out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 - # for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE, - # so we need to check it first not to falsely claim it is not - # host accessible. elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST ): - # For pinned memory allocated with cudaMallocHost or paged-locked - # with cudaHostRegister, the memory_type is - # cydriver.CUmemorytype.CU_MEMORYTYPE_HOST. - # TODO(ktokarski): In some cases, the registered memory requires - # using different ptr for device and host, we could check - # cuMemHostGetDevicePointer and - # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM - # to double check the device accessibility. + # Managed memory or pinned host memory out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id @@ -454,28 +418,8 @@ cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: out.is_device_accessible = True out.device_id = device_id else: - raise ValueError(f"Unsupported memory type: {memory_type}") - return 0 - - -cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil: - cdef cydriver.CUpointer_attribute attrs[3] - cdef uintptr_t vals[3] - attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE - attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED - attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - vals[0] = &memory_type - vals[1] = &is_managed - vals[2] = &device_id - - cdef cydriver.CUresult ret - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: with cython.gil: - # Device class 
handles the cuInit call internally - Device() - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - HANDLE_RETURN(ret) + raise ValueError(f"Unsupported memory type: {memory_type}") return 0 @@ -541,3 +485,72 @@ cdef class MemoryResource: def device_id(self) -> int: """Device ID associated with this memory resource, or -1 if not applicable.""" raise TypeError("MemoryResource.device_id must be implemented by subclasses.") + + +# Buffer Implementation Helpers +# ----------------------------- +cdef inline Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = None +): + """Create a Buffer from an existing DevicePtrHandle.""" + cdef Buffer buf = Buffer.__new__(Buffer) + buf._h_ptr = h_ptr + buf._size = size + buf._memory_resource = mr + buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None + buf._owner = None + buf._mem_attrs_inited = False + return buf + + +cdef inline void Buffer_close(Buffer self, object stream): + """Close a buffer, freeing its memory.""" + cdef Stream s + if not self._h_ptr: + return + # Update deallocation stream if provided + if stream is not None: + s = Stream_accept(stream) + set_deallocation_stream(self._h_ptr, s._h_stream) + # Reset handle - RAII deleter will free the memory (and release owner ref in C++) + self._h_ptr.reset() + self._size = 0 + self._memory_resource = None + self._ipc_data = None + self._owner = None + + +cdef inline int Buffer_fill_uint8(Buffer self, uint8_t value, StreamHandle h_stream) except? -1: + cdef cydriver.CUdeviceptr ptr = native(self._h_ptr) + cdef cydriver.CUstream s = native(h_stream) + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD8Async(ptr, value, self._size, s)) + return 0 + + +cdef inline int Buffer_fill_from_ptr( + Buffer self, const char* ptr, size_t width, StreamHandle h_stream +) except? 
-1: + cdef size_t buffer_size = self._size + cdef cydriver.CUdeviceptr dst = native(self._h_ptr) + cdef cydriver.CUstream s = native(h_stream) + + if width == 1: + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD8Async(dst, (ptr)[0], buffer_size, s)) + elif width == 2: + if buffer_size & 0x1: + raise ValueError(f"buffer size ({buffer_size}) must be divisible by 2") + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD16Async(dst, (ptr)[0], buffer_size // 2, s)) + elif width == 4: + if buffer_size & 0x3: + raise ValueError(f"buffer size ({buffer_size}) must be divisible by 4") + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD32Async(dst, (ptr)[0], buffer_size // 4, s)) + else: + raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}") + return 0 diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index d0cc82184a..bec16b993c 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -20,6 +20,7 @@ import platform # no-cython-lint import uuid from cuda.core._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._resource_handles cimport native if TYPE_CHECKING: from .._device import Device @@ -254,7 +255,7 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): location.id = dev_id with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, dmr._handle, &location)) + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, native(dmr._h_pool), &location)) if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: return "rw" diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd index 2f6c35d72e..492aa23cd3 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd @@ -7,4 +7,4 @@ from cuda.core._memory._buffer cimport 
MemoryResource cdef class cyGraphMemoryResource(MemoryResource): cdef: - int _dev_id + int _device_id diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index bda075c201..daa38a1216 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -7,7 +7,15 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource +from cuda.core._resource_handles cimport ( + DevicePtrHandle, + _init_handles_table, + deviceptr_alloc_async, + native, +) + +_init_handles_table() from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -22,7 +30,7 @@ __all__ = ['GraphMemoryResource'] cdef class GraphMemoryResourceAttributes: cdef: - int _dev_id + int _device_id def __init__(self, *args, **kwargs): raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. 
Please use MemoryResource APIs.") @@ -30,7 +38,7 @@ cdef class GraphMemoryResourceAttributes: @classmethod def _init(cls, device_id: int): cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) - self._dev_id = device_id + self._device_id = device_id return self def __repr__(self): @@ -41,12 +49,12 @@ cdef class GraphMemoryResourceAttributes: cdef int _getattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 cdef int _setattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 @property @@ -100,7 +108,7 @@ cdef class GraphMemoryResourceAttributes: cdef class cyGraphMemoryResource(MemoryResource): def __cinit__(self, int device_id): - self._dev_id = device_id + self._device_id = device_id def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """ @@ -123,17 +131,17 @@ cdef class cyGraphMemoryResource(MemoryResource): def trim(self): """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" with nogil: - HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._device_id)) @property def attributes(self) -> GraphMemoryResourceAttributes: """Asynchronous allocation attributes related to graphs.""" - return GraphMemoryResourceAttributes._init(self._dev_id) + return GraphMemoryResourceAttributes._init(self._device_id) @property def device_id(self) -> int: """The associated device ordinal.""" - return self._dev_id + return self._device_id @property def 
is_device_accessible(self) -> bool: @@ -186,22 +194,18 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr + cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + h_ptr = deviceptr_alloc_async(size, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory asynchronously") + return Buffer_from_deviceptr_handle(h_ptr, size, self, None) cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: - cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUstream s = native(stream._h_stream) cdef cydriver.CUdeviceptr devptr = ptr with nogil: HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/cuda/core/_memory/_ipc.pxd b/cuda_core/cuda/core/_memory/_ipc.pxd index 0c7375efdb..5166aa8748 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/_memory/_ipc.pxd @@ -41,6 +41,8 @@ cdef class IPCBufferDescriptor: bytes _payload size_t _size + cdef const void* payload_ptr(self) noexcept + cdef class IPCAllocationHandle: cdef: diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx index 793e4168d7..99608f55db 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -3,11 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport uintptr_t -from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer +from cuda.core._memory._buffer cimport Buffer, 
Buffer_from_deviceptr_handle +from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._stream cimport Stream +from cuda.core._resource_handles cimport ( + DevicePtrHandle, + _init_handles_table, + create_mempool_handle_ipc, + deviceptr_import_ipc, + get_last_error, + native, +) + +_init_handles_table() from cuda.core._stream cimport default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import check_multiprocessing_start_method @@ -87,6 +97,10 @@ cdef class IPCBufferDescriptor: def size(self): return self._size + cdef const void* payload_ptr(self) noexcept: + """Return the payload as a const void* for C API calls.""" + return (self._payload) + cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" @@ -150,7 +164,7 @@ cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): cdef cydriver.CUmemPoolPtrExportData data with nogil: HANDLE_RETURN( - cydriver.cuMemPoolExportPointer(&data, (self._ptr)) + cydriver.cuMemPoolExportPointer(&data, native(self._h_ptr)) ) cdef bytes data_b = cpython.PyBytes_FromStringAndSize( (data.reserved), sizeof(data.reserved) @@ -166,16 +180,15 @@ cdef Buffer Buffer_from_ipc_descriptor( if stream is None: # Note: match this behavior to _MemPool.allocate() stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy( - data.reserved, - (ipc_descriptor._payload), - sizeof(data.reserved) + cdef Stream s = stream + cdef DevicePtrHandle h_ptr = deviceptr_import_ipc( + mr._h_pool, + ipc_descriptor.payload_ptr(), + s._h_stream ) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) - return Buffer._init(ptr, ipc_descriptor.size, mr, stream, ipc_descriptor) + if not h_ptr: + HANDLE_RETURN(get_last_error()) + return Buffer_from_deviceptr_handle(h_ptr, ipc_descriptor.size, mr, ipc_descriptor) # _MemPool IPC Implementation @@ -198,18 +211,15 @@ cdef _MemPool 
MP_from_allocation_handle(cls, alloc_handle): os.close(fd) raise - # Construct a new mempool + # Construct a new mempool. cdef _MemPool self = <_MemPool>(cls.__new__(cls)) self._mempool_owned = True + cdef int ipc_fd = int(alloc_handle) + self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE) + if not self._h_pool: + raise RuntimeError("Failed to import memory pool from IPC handle") self._ipc_data = IPCDataForMR(alloc_handle, True) - # Map the mempool into this process. - cdef int handle = int(alloc_handle) - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._handle), (handle), IPC_HANDLE_TYPE, 0) - ) - # Register it. if uuid is not None: registered = self.register(uuid) @@ -240,7 +250,7 @@ cdef IPCAllocationHandle MP_export_mempool(_MemPool self): cdef int fd with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &fd, self._handle, IPC_HANDLE_TYPE, 0) + &fd, native(self._h_pool), IPC_HANDLE_TYPE, 0) ) try: return IPCAllocationHandle._init(fd, uuid.uuid4()) diff --git a/cuda_core/cuda/core/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py index 317494ea9e..9250819610 100644 --- a/cuda_core/cuda/core/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -84,12 +84,12 @@ def device_id(self) -> int: class _SynchronousMemoryResource(MemoryResource): - __slots__ = ("_dev_id",) + __slots__ = ("_device_id",) def __init__(self, device_id): from .._device import Device - self._dev_id = Device(device_id).device_id + self._device_id = Device(device_id).device_id def allocate(self, size, stream=None) -> Buffer: if stream is None: @@ -116,4 +116,4 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: - return self._dev_id + return self._device_id diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd index 8d9961b68b..eaff8e4bab 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pxd +++ 
b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -5,12 +5,13 @@ from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport MemoryResource from cuda.core._memory._ipc cimport IPCDataForMR +from cuda.core._resource_handles cimport MemoryPoolHandle cdef class _MemPool(MemoryResource): cdef: int _dev_id - cydriver.CUmemoryPool _handle + MemoryPoolHandle _h_pool bint _mempool_owned IPCDataForMR _ipc_data object _attributes diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index f1b72d47b5..7a255ebb3d 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -10,9 +10,22 @@ from libc.string cimport memset from cpython.mem cimport PyMem_Malloc, PyMem_Free from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core._memory cimport _ipc from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._resource_handles cimport ( + MemoryPoolHandle, + DevicePtrHandle, + _init_handles_table, + create_mempool_handle, + create_mempool_handle_ref, + get_device_mempool, + deviceptr_alloc_from_pool, + native, + py, +) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -61,7 +74,7 @@ cdef class _MemPoolAttributes: cdef _MemPool mr = <_MemPool>(self._mr_weakref()) if mr is None: raise RuntimeError("_MemPool is expired") - cdef cydriver.CUmemoryPool pool_handle = mr._handle + cdef cydriver.CUmemoryPool pool_handle = native(mr._h_pool) with nogil: HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value)) return 0 @@ -127,7 +140,6 @@ cdef class _MemPool(MemoryResource): def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL self._mempool_owned = False self._ipc_data = None self._attributes = None @@ -202,9 +214,9 
@@ cdef class _MemPool(MemoryResource): return self._dev_id @property - def handle(self) -> driver.CUmemoryPool: + def handle(self) -> object: """Handle to the underlying memory pool.""" - return driver.CUmemoryPool((self._handle)) + return py(self._h_pool) @property def is_handle_owned(self) -> bool: @@ -271,7 +283,7 @@ cdef class _MemPool(MemoryResource): i += 1 with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(self._handle, access_desc, count)) + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(native(self._h_pool), access_desc, count)) finally: if access_desc != NULL: PyMem_Free(access_desc) @@ -308,64 +320,69 @@ cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) excep cdef cydriver.cuuint64_t current_threshold cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX cdef cydriver.CUmemLocation loc + cdef cydriver.CUmemoryPool pool self._dev_id = dev_id self._mempool_owned = False - with nogil: - if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: - assert dev_id >= 0 - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) - - # Set a higher release threshold to improve performance when there are - # no active allocations. By default, the release threshold is 0, which - # means memory is immediately released back to the OS when there are no - # active suballocations, causing performance issues. + if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ + and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + assert dev_id >= 0 + self._h_pool = get_device_mempool(dev_id) + + # Set a higher release threshold to improve performance when there are + # no active allocations. By default, the release threshold is 0, which + # means memory is immediately released back to the OS when there are no + # active suballocations, causing performance issues. 
+ with nogil: HANDLE_RETURN( cydriver.cuMemPoolGetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold ) ) - - # If threshold is 0 (default), set it to maximum to retain memory in the pool. if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &max_threshold )) - elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST: - IF CUDA_CORE_BUILD_MAJOR >= 13: - assert dev_id == -1 + elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ + and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST: + IF CUDA_CORE_BUILD_MAJOR >= 13: + assert dev_id == -1 + loc.id = dev_id + loc.type = opts._location + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) + self._h_pool = create_mempool_handle_ref(pool) + ELSE: + raise RuntimeError("not supported") + elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ + and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA: + IF CUDA_CORE_BUILD_MAJOR >= 13: + assert dev_id == 0 + loc.id = 0 + loc.type = opts._location + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) + self._h_pool = create_mempool_handle_ref(pool) + ELSE: + raise RuntimeError("not supported") + else: + IF CUDA_CORE_BUILD_MAJOR >= 13: + if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED: + # Managed memory pools loc.id = dev_id loc.type = opts._location - HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type)) - ELSE: - raise RuntimeError("not supported") - elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == 
cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA: - IF CUDA_CORE_BUILD_MAJOR >= 13: - assert dev_id == 0 - loc.id = 0 - loc.type = opts._location - HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type)) - ELSE: - raise RuntimeError("not supported") - else: - IF CUDA_CORE_BUILD_MAJOR >= 13: - if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED: - # Managed memory pools - loc.id = dev_id - loc.type = opts._location - HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type)) - else: - assert False - ELSE: + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) + self._h_pool = create_mempool_handle_ref(pool) + else: assert False + ELSE: + assert False return 0 @@ -389,9 +406,7 @@ cdef int _MP_init_create(_MemPool self, int dev_id, _MemPoolOptions opts) except self._dev_id = dev_id self._mempool_owned = True - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties)) - # TODO: should we also set the threshold here? 
+ self._h_pool = create_mempool_handle(properties) if ipc_enabled: alloc_handle = _ipc.MP_export_mempool(self) @@ -411,24 +426,20 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr + cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_not_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory from pool") + return Buffer_from_deviceptr_handle(h_ptr, size, self, None) cdef inline void _MP_deallocate( _MemPool self, uintptr_t ptr, size_t size, Stream stream ) noexcept nogil: - cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUstream s = native(stream._h_stream) cdef cydriver.CUdeviceptr devptr = ptr cdef cydriver.CUresult r with nogil: @@ -438,7 +449,7 @@ cdef inline void _MP_deallocate( cdef inline _MP_close(_MemPool self): - if self._handle == NULL: + if not self._h_pool: return # This works around nvbug 5698116. When a memory pool handle is recycled @@ -446,14 +457,12 @@ cdef inline _MP_close(_MemPool self): if self._peer_accessible_by: self.peer_accessible_by = [] - try: - if self._mempool_owned: - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) - finally: - self._dev_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL - self._attributes = None - self._mempool_owned = False - self._ipc_data = None - self._peer_accessible_by = () + # Reset members in declaration order. 
+ # The RAII deleter handles nvbug 5698116 workaround (clears peer access) + # and calls cuMemPoolDestroy if this is an owning handle. + self._h_pool.reset() + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_owned = False + self._ipc_data = None + self._attributes = None + self._peer_accessible_by = () diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index c12cbbaa8a..41321c8722 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -13,7 +13,17 @@ from typing import Optional import numpy +from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ( + EventHandle, + _init_handles_table, + create_event_handle_noctx, + native, +) + +_init_handles_table() from cuda.core._utils.cuda_utils import handle_return, driver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._memory import Buffer @@ -591,6 +601,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): buf.device_id = handle_return(driver.cuCtxGetDevice()) cdef intptr_t producer_s, consumer_s + cdef EventHandle h_event stream_ptr = int(stream_ptr) if stream_ptr != -1: stream = cai_data.get("stream") @@ -600,11 +611,12 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - e = handle_return(driver.cuEventCreate( - driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(e, producer_s)) - handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0)) - handle_return(driver.cuEventDestroy(e)) + with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + HANDLE_RETURN(cydriver.cuEventRecord( + native(h_event), producer_s)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent( + consumer_s, native(h_event), 0)) return buf diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd new file mode 100644 
index 0000000000..801d354958 --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -0,0 +1,246 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stddef cimport size_t +from libc.stdint cimport intptr_t, uint32_t +from libcpp.memory cimport shared_ptr + +from cpython.pycapsule cimport PyCapsule_Import + +from cuda.bindings cimport cydriver + +# Declare the C++ namespace and types (inline helpers live in the header). +cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle + ctypedef shared_ptr[const cydriver.CUstream] StreamHandle + ctypedef shared_ptr[const cydriver.CUevent] EventHandle + ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle + ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle + + # native() - extract the raw CUDA handle (inline C++) + cydriver.CUcontext native(ContextHandle h) nogil + cydriver.CUstream native(StreamHandle h) nogil + cydriver.CUevent native(EventHandle h) nogil + cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil + cydriver.CUdeviceptr native(DevicePtrHandle h) nogil + + # intptr() - extract handle as intptr_t for Python interop (inline C++) + intptr_t intptr(ContextHandle h) nogil + intptr_t intptr(StreamHandle h) nogil + intptr_t intptr(EventHandle h) nogil + intptr_t intptr(MemoryPoolHandle h) nogil + intptr_t intptr(DevicePtrHandle h) nogil + + # py() - convert handle to Python driver wrapper object (inline C++; requires GIL) + object py(ContextHandle h) + object py(StreamHandle h) + object py(EventHandle h) + object py(MemoryPoolHandle h) + object py(DevicePtrHandle h) + + +# The resource handles API table is exported from `cuda.core._resource_handles` +# as a PyCapsule named: +# +# "cuda.core._resource_handles._CXX_API" +# +# Consumers dispatch through this table to avoid relying on RTLD_GLOBAL and to 
+# ensure a single owner of correctness-critical static/thread_local state. +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle 
(*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil + + +cdef const ResourceHandlesCxxApiV1* _handles_table = NULL + + +cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogil: + global _handles_table + if _handles_table == NULL: + with gil: + if _handles_table == NULL: + _handles_table = PyCapsule_Import( + b"cuda.core._resource_handles._CXX_API", 0 + ) + if _handles_table == NULL: + raise ImportError("Failed to import cuda.core._resource_handles._CXX_API capsule") + if _handles_table.abi_version != 1: + raise ImportError("Unsupported resource handles C++ API version") + if _handles_table.struct_size < sizeof(ResourceHandlesCxxApiV1): + raise ImportError("Resource handles C++ API table is too small") + return _handles_table + + +# ----------------------------------------------------------------------------- +# Dispatch wrappers +# +# These wrappers assume _handles_table has been initialized. Consumers must call +# _init_handles_table() at module level before using these functions in nogil blocks. +# ----------------------------------------------------------------------------- + +cdef inline void _init_handles_table() except *: + """Initialize the handles table. 
Call at module level before using wrappers.""" + _get_handles_table() + + +cdef inline cydriver.CUresult get_last_error() noexcept nogil: + return _handles_table.get_last_error() + + +cdef inline cydriver.CUresult peek_last_error() noexcept nogil: + return _handles_table.peek_last_error() + + +cdef inline void clear_last_error() noexcept nogil: + _handles_table.clear_last_error() + + +cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) noexcept nogil: + return _handles_table.create_context_handle_ref(ctx) + + +cdef inline ContextHandle get_primary_context(int device_id) noexcept nogil: + return _handles_table.get_primary_context(device_id) + + +cdef inline ContextHandle get_current_context() noexcept nogil: + return _handles_table.get_current_context() + + +cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil: + return _handles_table.create_stream_handle(h_ctx, flags, priority) + + +cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) noexcept nogil: + return _handles_table.create_stream_handle_ref(stream) + + +cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner): + return _handles_table.create_stream_handle_with_owner(stream, owner) + + +cdef inline StreamHandle get_legacy_stream() noexcept nogil: + return _handles_table.get_legacy_stream() + + +cdef inline StreamHandle get_per_thread_stream() noexcept nogil: + return _handles_table.get_per_thread_stream() + + +cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle(h_ctx, flags) + + +cdef inline EventHandle create_event_handle_noctx(unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle_noctx(flags) + + +cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) noexcept nogil: + return 
_handles_table.create_event_handle_ipc(ipc_handle) + + +cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) noexcept nogil: + return _handles_table.create_mempool_handle(props) + + +cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) noexcept nogil: + return _handles_table.create_mempool_handle_ref(pool) + + +cdef inline MemoryPoolHandle get_device_mempool(int device_id) noexcept nogil: + return _handles_table.get_device_mempool(device_id) + + +cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) noexcept nogil: + return _handles_table.create_mempool_handle_ipc(fd, handle_type) + + +cdef inline DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_alloc_from_pool(size, h_pool, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_alloc_async(size, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc(size) + + +cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc_host(size) + + +cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) noexcept nogil: + return _handles_table.deviceptr_create_ref(ptr) + + +cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner): + return _handles_table.deviceptr_create_with_owner(ptr, owner) + + +cdef inline DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_import_ipc(h_pool, export_data, h_stream) + + +cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil: + return 
_handles_table.deallocation_stream(h) + + +cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil: + _handles_table.set_deallocation_stream(h, h_stream) diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx new file mode 100644 index 0000000000..48f790581e --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# This module exists to compile _cpp/resource_handles.cpp into a shared library. +# The helper functions (native, intptr, py) are implemented as inline C++ functions +# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. + +from cpython.pycapsule cimport PyCapsule_New +from libc.stdint cimport uint32_t, uint64_t, uintptr_t + +from ._resource_handles_cxx_api cimport ( + ResourceHandlesCxxApiV1, + get_resource_handles_cxx_api_v1, +) + +import cython + + +cdef const char* _CXX_API_NAME = b"cuda.core._resource_handles._CXX_API" +cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core._resource_handles._CUDA_DRIVER_API_V1" + +# Export the C++ handles dispatch table as a PyCapsule. +# Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. 
+cdef const ResourceHandlesCxxApiV1* _handles_table = get_resource_handles_cxx_api_v1() +if _handles_table == NULL: + raise RuntimeError("Failed to initialize resource handles C++ API table") + +_CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) +if _CXX_API is None: + raise RuntimeError("Failed to create _CXX_API capsule") + + +cdef struct CudaDriverApiV1: + uint32_t abi_version + uint32_t struct_size + + uintptr_t cuDevicePrimaryCtxRetain + uintptr_t cuDevicePrimaryCtxRelease + uintptr_t cuCtxGetCurrent + + uintptr_t cuStreamCreateWithPriority + uintptr_t cuStreamDestroy + + uintptr_t cuEventCreate + uintptr_t cuEventDestroy + uintptr_t cuIpcOpenEventHandle + + uintptr_t cuDeviceGetCount + + uintptr_t cuMemPoolSetAccess + uintptr_t cuMemPoolDestroy + uintptr_t cuMemPoolCreate + uintptr_t cuDeviceGetMemPool + uintptr_t cuMemPoolImportFromShareableHandle + + uintptr_t cuMemAllocFromPoolAsync + uintptr_t cuMemAllocAsync + uintptr_t cuMemAlloc + uintptr_t cuMemAllocHost + + uintptr_t cuMemFreeAsync + uintptr_t cuMemFree + uintptr_t cuMemFreeHost + + uintptr_t cuMemPoolImportPointer + + +cdef CudaDriverApiV1 _cuda_driver_api_v1 +cdef bint _cuda_driver_api_v1_inited = False + + +cdef inline uintptr_t _as_addr(object pfn) except 0: + return int(pfn) + + +cdef inline uintptr_t _resolve(object d, int driver_ver, uint64_t flags, bytes sym) except 0: + err, pfn, status = d.cuGetProcAddress(sym, driver_ver, flags) + if int(err) != 0 or pfn is None: + raise RuntimeError(f"cuGetProcAddress failed for {sym!r}, err={err}, status={status}") + return _as_addr(pfn) + + +def _get_cuda_driver_api_v1_capsule(): + """Return a PyCapsule containing cached CUDA driver entrypoints. + + This is evaluated lazily on first use so cuda-core remains importable on + CPU-only machines. 
+ """ + global _cuda_driver_api_v1_inited, _cuda_driver_api_v1 + if not _cuda_driver_api_v1_inited: + import cuda.bindings.driver as d + + err, ver = d.cuDriverGetVersion() + if int(err) != 0: + raise RuntimeError(f"cuDriverGetVersion failed: {err}") + driver_ver = int(ver) + + flags = 0 # CU_GET_PROC_ADDRESS_DEFAULT + + _cuda_driver_api_v1.cuDevicePrimaryCtxRetain = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRetain") + _cuda_driver_api_v1.cuDevicePrimaryCtxRelease = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRelease") + _cuda_driver_api_v1.cuCtxGetCurrent = _resolve(d, driver_ver, flags, b"cuCtxGetCurrent") + + _cuda_driver_api_v1.cuStreamCreateWithPriority = _resolve(d, driver_ver, flags, b"cuStreamCreateWithPriority") + _cuda_driver_api_v1.cuStreamDestroy = _resolve(d, driver_ver, flags, b"cuStreamDestroy") + + _cuda_driver_api_v1.cuEventCreate = _resolve(d, driver_ver, flags, b"cuEventCreate") + _cuda_driver_api_v1.cuEventDestroy = _resolve(d, driver_ver, flags, b"cuEventDestroy") + _cuda_driver_api_v1.cuIpcOpenEventHandle = _resolve(d, driver_ver, flags, b"cuIpcOpenEventHandle") + + _cuda_driver_api_v1.cuDeviceGetCount = _resolve(d, driver_ver, flags, b"cuDeviceGetCount") + + _cuda_driver_api_v1.cuMemPoolSetAccess = _resolve(d, driver_ver, flags, b"cuMemPoolSetAccess") + _cuda_driver_api_v1.cuMemPoolDestroy = _resolve(d, driver_ver, flags, b"cuMemPoolDestroy") + _cuda_driver_api_v1.cuMemPoolCreate = _resolve(d, driver_ver, flags, b"cuMemPoolCreate") + _cuda_driver_api_v1.cuDeviceGetMemPool = _resolve(d, driver_ver, flags, b"cuDeviceGetMemPool") + _cuda_driver_api_v1.cuMemPoolImportFromShareableHandle = _resolve( + d, driver_ver, flags, b"cuMemPoolImportFromShareableHandle" + ) + + _cuda_driver_api_v1.cuMemAllocFromPoolAsync = _resolve(d, driver_ver, flags, b"cuMemAllocFromPoolAsync") + _cuda_driver_api_v1.cuMemAllocAsync = _resolve(d, driver_ver, flags, b"cuMemAllocAsync") + _cuda_driver_api_v1.cuMemAlloc = _resolve(d, driver_ver, flags, 
b"cuMemAlloc") + _cuda_driver_api_v1.cuMemAllocHost = _resolve(d, driver_ver, flags, b"cuMemAllocHost") + + _cuda_driver_api_v1.cuMemFreeAsync = _resolve(d, driver_ver, flags, b"cuMemFreeAsync") + _cuda_driver_api_v1.cuMemFree = _resolve(d, driver_ver, flags, b"cuMemFree") + _cuda_driver_api_v1.cuMemFreeHost = _resolve(d, driver_ver, flags, b"cuMemFreeHost") + + _cuda_driver_api_v1.cuMemPoolImportPointer = _resolve(d, driver_ver, flags, b"cuMemPoolImportPointer") + + _cuda_driver_api_v1.abi_version = 1 + _cuda_driver_api_v1.struct_size = cython.sizeof(CudaDriverApiV1) + _cuda_driver_api_v1_inited = True + + return PyCapsule_New(&_cuda_driver_api_v1, _CUDA_DRIVER_API_V1_NAME, NULL) diff --git a/cuda_core/cuda/core/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/_resource_handles_cxx_api.pxd new file mode 100644 index 0000000000..da3d8d4fd3 --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles_cxx_api.pxd @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uint32_t +from libc.stddef cimport size_t + +from cuda.bindings cimport cydriver +from ._resource_handles cimport ( + ContextHandle, + DevicePtrHandle, + EventHandle, + MemoryPoolHandle, + StreamHandle, +) + + +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle 
(*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil diff --git a/cuda_core/cuda/core/_stream.pxd b/cuda_core/cuda/core/_stream.pxd index edc25e2ba7..69bd5821ad 100644 --- a/cuda_core/cuda/core/_stream.pxd +++ b/cuda_core/cuda/core/_stream.pxd @@ -2,23 +2,22 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ContextHandle, StreamHandle cdef class Stream: cdef: - cydriver.CUstream _handle - object _owner - bint _builtin + StreamHandle _h_stream + ContextHandle _h_context + int _device_id int _nonblocking int _priority - cydriver.CUdevice _device_id - cydriver.CUcontext _ctx_handle + + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream) cpdef close(self) - cdef int _get_context(self) except?-1 nogil - cdef int _get_device_and_context(self) except?-1 cpdef Stream default_stream() diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index b724f9aee3..aecf24b06e 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -12,8 +12,6 @@ from cuda.bindings cimport cydriver from cuda.core._event cimport Event as cyEvent from cuda.core._utils.cuda_utils cimport ( check_or_create_options, - CU_CONTEXT_INVALID, - get_device_from_ctx, HANDLE_RETURN, ) @@ -25,13 +23,28 @@ from typing import 
TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings from cuda.core._device import Device -from cuda.core._context import Context +from cuda.core._context cimport Context from cuda.core._event import Event, EventOptions -from cuda.core._graph import GraphBuilder -from cuda.core._utils.cuda_utils import ( - driver, +from cuda.core._resource_handles cimport ( + ContextHandle, + EventHandle, + StreamHandle, + _init_handles_table, + create_context_handle_ref, + create_event_handle_noctx, + create_stream_handle, + create_stream_handle_with_owner, + get_current_context, + get_legacy_stream, + get_per_thread_stream, + intptr, + native, + py, ) +_init_handles_table() +from cuda.core._graph import GraphBuilder + @dataclass cdef class StreamOptions: @@ -78,52 +91,61 @@ cdef class Stream: object, or created directly through using an existing handle using Stream.from_handle(). """ - def __cinit__(self): - self._handle = (NULL) - self._owner = None - self._builtin = False - self._nonblocking = -1 # lazy init'd - self._priority = INT32_MIN # lazy init'd - self._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd - self._ctx_handle = CU_CONTEXT_INVALID # lazy init'd - def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. " "Please use Device APIs (create_stream) or other Stream APIs (from_handle)." 
) + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream): + """Create a Stream from an existing StreamHandle (cdef-only factory).""" + cdef Stream s = cls.__new__(cls) + s._h_stream = h_stream + # _h_context is default-initialized to empty ContextHandle by C++ + s._device_id = -1 # lazy init'd (invalid sentinel) + s._nonblocking = -1 # lazy init'd + s._priority = INT32_MIN # lazy init'd + return s + @classmethod def _legacy_default(cls): - cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_LEGACY) - self._builtin = True - return self + """Return the legacy default stream (supports subclassing).""" + return Stream._from_handle(cls, get_legacy_stream()) @classmethod def _per_thread_default(cls): - cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_PER_THREAD) - self._builtin = True - return self + """Return the per-thread default stream (supports subclassing).""" + return Stream._from_handle(cls, get_per_thread_stream()) @classmethod - def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): - cdef Stream self = Stream.__new__(cls) + def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None, + ctx: Context = None): + cdef StreamHandle h_stream + cdef cydriver.CUstream borrowed + cdef ContextHandle h_context + cdef Stream self + + # Extract context handle if provided + if ctx is not None: + h_context = (ctx)._h_context if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _handle_from_stream_protocol(obj) - # TODO: check if obj is created under the current context/device - self._owner = obj - return self + # Borrowed stream from foreign object + # C++ handle prevents owner from being GC'd until handle is released + # Owner is responsible for keeping the stream's context alive + borrowed = _handle_from_stream_protocol(obj) + h_stream = 
create_stream_handle_with_owner(borrowed, obj) + return Stream._from_handle(cls, h_stream) cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") nonblocking = opts.nonblocking priority = opts.priority - flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + cdef unsigned int flags = (cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking + else cydriver.CUstream_flags.CU_STREAM_DEFAULT) # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low with nogil: @@ -136,57 +158,47 @@ cdef class Stream: else: prio = high - cdef cydriver.CUstream s - with nogil: - HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) - self._handle = s + # C++ creates the stream and returns owning handle with context dependency + h_stream = create_stream_handle(h_context, flags, prio) + if not h_stream: + raise RuntimeError("Failed to create CUDA stream") + self = Stream._from_handle(cls, h_stream) self._nonblocking = int(nonblocking) self._priority = prio - self._device_id = device_id if device_id is not None else self._device_id + if device_id is not None: + self._device_id = device_id return self - def __dealloc__(self): - self.close() - cpdef close(self): """Destroy the stream. - Destroy the stream if we own it. Borrowed foreign stream - object will instead have their references released. - + Releases the stream handle. For owned streams, this destroys the + underlying CUDA stream. For borrowed streams, this releases the + reference and allows the Python owner to be GC'd. 
""" - if self._owner is None: - if self._handle and not self._builtin: - with nogil: - HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) - else: - self._owner = None - self._handle = (NULL) + self._h_stream.reset() def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, (self._handle)) + return (0, intptr(self._h_stream)) def __hash__(self) -> int: # Ensure context is initialized for hash consistency - if self._ctx_handle == CU_CONTEXT_INVALID: - self._get_context() - return hash(((self._ctx_handle), (self._handle))) + Stream_ensure_ctx(self) + return hash((intptr(self._h_context), intptr(self._h_stream))) def __eq__(self, other) -> bool: if not isinstance(other, Stream): return NotImplemented cdef Stream _other = other # Fast path: compare handles first - if (self._handle) != ((_other)._handle): + if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams - if self._ctx_handle == CU_CONTEXT_INVALID: - self._get_context() - if _other._ctx_handle == CU_CONTEXT_INVALID: - _other._get_context() + Stream_ensure_ctx(self) + Stream_ensure_ctx(_other) # Compare contexts as well - return (self._ctx_handle) == ((_other)._ctx_handle) + return intptr(self._h_context) == intptr(_other._h_context) @property def handle(self) -> cuda.bindings.driver.CUstream: @@ -197,7 +209,7 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. 
""" - return driver.CUstream((self._handle)) + return py(self._h_stream) @property def is_nonblocking(self) -> bool: @@ -205,11 +217,8 @@ cdef class Stream: cdef unsigned int flags if self._nonblocking == -1: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) - if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: - self._nonblocking = True - else: - self._nonblocking = False + HANDLE_RETURN(cydriver.cuStreamGetFlags(native(self._h_stream), &flags)) + self._nonblocking = flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING return bool(self._nonblocking) @property @@ -218,14 +227,14 @@ cdef class Stream: cdef int prio if self._priority == INT32_MIN: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) + HANDLE_RETURN(cydriver.cuStreamGetPriority(native(self._h_stream), &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" with nogil: - HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuStreamSynchronize(native(self._h_stream))) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -250,17 +259,17 @@ cdef class Stream: # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: - self._get_device_and_context() - event = Event._init((self._device_id), (self._ctx_handle), options, False) + Stream_ensure_ctx_device(self) + event = cyEvent._init(cyEvent, self._device_id, self._h_context, options, False) elif event.is_ipc_enabled: raise TypeError( "IPC-enabled events should not be re-recorded, instead create a " "new event by supplying options." 
) - cdef cydriver.CUevent e = ((event))._handle + cdef cydriver.CUevent e = native(((event))._h_event) with nogil: - HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) + HANDLE_RETURN(cydriver.cuEventRecord(e, native(self._h_stream))) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -273,32 +282,35 @@ cdef class Stream: on the stream and then waiting on it. """ - cdef cydriver.CUevent event - cdef cydriver.CUstream stream + cdef Stream stream + cdef EventHandle h_event + # Handle Event directly if isinstance(event_or_stream, Event): - event = (event_or_stream.handle) with nogil: # TODO: support flags other than 0? - HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent( + native(self._h_stream), native((event_or_stream)._h_event), 0)) + return + + # Convert to Stream if needed + if isinstance(event_or_stream, Stream): + stream = event_or_stream else: - if isinstance(event_or_stream, Stream): - stream = (event_or_stream.handle) - else: - try: - s = Stream._init(obj=event_or_stream) - except Exception as e: - raise ValueError( - "only an Event, Stream, or object supporting __cuda_stream__ can be waited," - f" got {type(event_or_stream)}" - ) from e - stream = (s.handle) - with nogil: - HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) - # TODO: support flags other than 0? 
- HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) - HANDLE_RETURN(cydriver.cuEventDestroy(event)) + try: + stream = Stream._init(obj=event_or_stream) + except Exception as e: + raise ValueError( + "only an Event, Stream, or object supporting __cuda_stream__ can be waited," + f" got {type(event_or_stream)}" + ) from e + + # Wait on stream via temporary event + with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) + # TODO: support flags other than 0? + HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), native(h_event), 0)) @property def device(self) -> Device: @@ -312,32 +324,15 @@ cdef class Stream: """ from cuda.core._device import Device # avoid circular import - self._get_device_and_context() - return Device((self._device_id)) - - cdef int _get_context(self) except?-1 nogil: - if self._ctx_handle == CU_CONTEXT_INVALID: - HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &(self._ctx_handle))) - return 0 - - cdef int _get_device_and_context(self) except?-1: - cdef cydriver.CUcontext curr_ctx - if self._device_id == cydriver.CU_DEVICE_INVALID: - with nogil: - # Get the current context - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&curr_ctx)) - # Get the stream's context (self.ctx_handle is populated) - self._get_context() - # Get the stream's device (may require a context-switching dance) - self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) - return 0 + Stream_ensure_ctx_device(self) + return Device(self._device_id) @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this stream.""" - self._get_context() - self._get_device_and_context() - return Context._from_ctx((self._ctx_handle), (self._device_id)) + Stream_ensure_ctx(self) + Stream_ensure_ctx_device(self) + return Context._from_handle(Context, self._h_context, self._device_id) @staticmethod def 
from_handle(handle: int) -> Stream: @@ -417,6 +412,36 @@ cpdef Stream default_stream(): return C_LEGACY_DEFAULT_STREAM +cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: + """Ensure the stream's context handle is populated.""" + cdef cydriver.CUcontext ctx + if not self._h_context: + HANDLE_RETURN(cydriver.cuStreamGetCtx(native(self._h_stream), &ctx)) + with gil: + self._h_context = create_context_handle_ref(ctx) + return 0 + + +cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: + """Ensure the stream's context and device_id are populated.""" + cdef cydriver.CUcontext ctx + cdef cydriver.CUdevice target_dev + cdef bint switch_context + + if self._device_id < 0: + with nogil: + # Get device ID from context, switching context temporarily if needed + Stream_ensure_ctx(self) + switch_context = (get_current_context() != self._h_context) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPushCurrent(native(self._h_context))) + HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + self._device_id = target_dev + return 0 + + cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: if isinstance(obj, Stream): return (obj.handle) diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd index ce30285aa5..9b5044beda 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd @@ -22,10 +22,6 @@ ctypedef fused integer_t: cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil - - cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 0c3f6521a4..c7f867a0d5 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ 
b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -197,25 +197,6 @@ def precondition(checker: Callable[..., None], str what="") -> Callable: return outer -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil: - """Get device ID from the given ctx.""" - cdef bint switch_context = (curr_ctx != target_ctx) - cdef cydriver.CUcontext ctx - cdef cydriver.CUdevice target_dev - with nogil: - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert curr_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(target_ctx)) - HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert target_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) - return target_dev - - def is_sequence(obj): """ Check if the given object is a sequence (list or tuple). diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd new file mode 100644 index 0000000000..58ca887908 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# Backward compatibility stub - use cuda.core._context instead +from cuda.core._context cimport Context diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 94a9e931cc..bf509dc1c3 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -70,6 +70,7 @@ include = ["cuda.core*"] [tool.setuptools.package-data] "cuda.core._include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core._cpp" = ["*.cpp", "*.hpp"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py new file mode 100644 index 0000000000..ca4ecc0749 --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Test for duplicate IPC buffer imports. + +Verifies that importing the same buffer descriptor multiple times returns the +same underlying handle, and that closing all imports works correctly without +crashing. This tests the workaround for nvbug 5570902 where IPC-imported +pointers are not correctly reference counted by the driver. 
+""" + +import contextlib +import multiprocessing as mp + +import pytest +from cuda.core import Buffer, Device +from helpers.logging import TimestampedLogger + +CHILD_TIMEOUT_SEC = 20 +NBYTES = 64 +POOL_SIZE = 2097152 + +ENABLE_LOGGING = False # Set True for test debugging and development + + +def child_main(log, queue): + log.prefix = " child: " + log("ready") + device = Device() + device.set_current() + mr = queue.get() + buffer_desc1 = queue.get() + buffer_desc2 = queue.get() + + # Import the same buffer twice - should return same handle due to cache + buffer1 = Buffer.from_ipc_descriptor(mr, buffer_desc1) + buffer2 = Buffer.from_ipc_descriptor(mr, buffer_desc2) + + log(f"buffer1.handle = {buffer1.handle}") + log(f"buffer2.handle = {buffer2.handle}") + log(f"same handle: {buffer1.handle == buffer2.handle}") + + # Close both - should not crash + buffer1.close() + log("buffer1 closed") + + buffer2.close() + log("buffer2 closed") + + device.sync() + log("done") + + +class TestIpcDuplicateImport: + """Test that duplicate IPC imports return the same handle and close safely.""" + + @pytest.fixture(autouse=True) + def _set_start_method(self): + # Ensure spawn is used for multiprocessing + with contextlib.suppress(RuntimeError): + mp.set_start_method("spawn", force=True) + + def test_main(self, ipc_device, ipc_memory_resource): + log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) + ipc_device.set_current() + mr = ipc_memory_resource + + log("allocating buffer") + buffer = mr.allocate(NBYTES) + + # Start the child process. + log("starting child") + queue = mp.Queue() + process = mp.Process(target=child_main, args=(log, queue)) + process.start() + + # Send the memory resource and buffer descriptor twice. 
+ log("sending mr and buffer descriptors") + queue.put(mr) + queue.put(buffer.get_ipc_descriptor()) + queue.put(buffer.get_ipc_descriptor()) + + log("waiting for child") + process.join(timeout=CHILD_TIMEOUT_SEC) + log(f"child exit code: {process.exitcode}") + assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" + log("done") diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index a93e49e4e8..281ed4ab1c 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -9,8 +9,6 @@ """ from cuda.core import Device, Stream -from cuda.core._context import Context -from cuda.core._event import Event, EventOptions from cuda.core._stream import StreamOptions # ============================================================================ @@ -105,50 +103,34 @@ def test_event_subclass_equality(init_cuda): Event uses isinstance() for equality checking, similar to Stream. """ - - class MyEvent(Event): - pass - device = Device(0) device.set_current() - # Create two different events - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() + event3 = device.create_event() # Different events should not be equal (different handles) - assert event != my_event, "Different Event instances are not equal" + assert event1 != event2, "Different Event instances are not equal" + assert event2 != event3, "Different Event instances are not equal" - # Same subclass type with different handles - my_event2 = MyEvent._init(device.device_id, device.context, options=EventOptions()) - assert my_event != my_event2, "Different MyEvent instances are not equal" - - -def test_context_subclass_equality(init_cuda): - """Test Context subclass equality behavior.""" - - class MyContext(Context): - pass +def 
test_context_equality(init_cuda): + """Test Context equality behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - - # MyContext._from_ctx() returns a Context instance, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context, not subclass" - assert type(my_context) is not MyContext - - # Since both are Context instances with same handle, they're equal - assert context == my_context, "Context instances with same handle are equal" - # Create another context from different stream + # Get context from different sources + stream1 = device.create_stream() stream2 = device.create_stream() + context1 = stream1.context context2 = stream2.context + device_context = device.context # Same device, same primary context, should be equal - assert context == context2, "Contexts from same device are equal" + assert context1 == context2, "Contexts from same device are equal" + assert context1 == device_context, "Stream context equals device context" def test_subclass_type_safety(init_cuda): diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 0d8f3a3c2d..ef075d8580 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -148,14 +148,12 @@ def test_event_context(init_cuda): assert context is not None -def test_event_subclassing(): - class MyEvent(Event): - pass - +def test_event_creation(): + """Test Event creation via public API.""" dev = Device() dev.set_current() - event = MyEvent._init(dev.device_id, dev.context) - assert isinstance(event, MyEvent) + event = dev.create_event() + assert isinstance(event, Event) # ============================================================================ diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 9bc89969a2..feeae9b07b 100644 --- a/cuda_core/tests/test_hashable.py +++ 
b/cuda_core/tests/test_hashable.py @@ -13,8 +13,6 @@ """ from cuda.core import Device -from cuda.core._context import Context -from cuda.core._event import Event, EventOptions from cuda.core._stream import Stream, StreamOptions # ============================================================================ @@ -128,65 +126,51 @@ class MyStream(Stream): assert hash(my_stream) != hash(my_stream2), "Different streams have different hashes" -def test_event_subclass_hash(init_cuda): - """Test Event subclass hash behavior.""" - - class MyEvent(Event): - pass - +def test_event_hash(init_cuda): + """Test Event hash behavior.""" device = Device(0) device.set_current() - # Create events with different handles - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() # Different events (different handles) -> different hashes - assert hash(event) != hash(my_event), "Different events have different hashes" - assert event != my_event, "Different handles means not equal" + assert hash(event1) != hash(event2), "Different events have different hashes" + assert event1 != event2, "Different handles means not equal" # Verify hash consistency - hash1 = hash(event) - hash2 = hash(event) + hash1 = hash(event1) + hash2 = hash(event1) assert hash1 == hash2, "Hash is consistent across multiple calls" # Both should be usable as dict keys - cache = {event: "base", my_event: "subclass"} + cache = {event1: "first", event2: "second"} assert len(cache) == 2, "Different events are distinct dict keys" - assert cache[event] == "base" - assert cache[my_event] == "subclass" - - -def test_context_subclass_hash(init_cuda): - """Test Context subclass hash behavior. 
+ assert cache[event1] == "first" + assert cache[event2] == "second" - Context._from_ctx() always returns Context instances, even when called - as MyContext._from_ctx(). This means we can't create actual MyContext - instances in practice. - """ - - class MyContext(Context): - pass +def test_context_hash(init_cuda): + """Test Context hash behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - # MyContext._from_ctx() returns Context, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context type" + # Get context from different sources + stream1 = device.create_stream() + stream2 = device.create_stream() + context1 = stream1.context + context2 = stream2.context - # Same handle -> same hash - assert hash(context) == hash(my_context), "Contexts with same handle have same hash" + # Same underlying context -> same hash + assert hash(context1) == hash(context2), "Contexts with same handle have same hash" # Verify equality matches hash - assert context == my_context, "Contexts with same handle are equal" - assert hash(context) == hash(my_context), "Equal contexts have equal hashes" + assert context1 == context2, "Contexts with same handle are equal" # Verify hash consistency - hash1 = hash(context) - hash2 = hash(context) + hash1 = hash(context1) + hash2 = hash(context1) assert hash1 == hash2, "Hash is consistent across multiple calls" @@ -200,33 +184,24 @@ def test_hash_equality_contract_maintained(init_cuda): allowing cross-type equality with consistent hashing. 
""" - class MyStream(Stream): - pass - - class MyEvent(Event): - pass - - class MyContext(Context): - pass - device = Device(0) device.set_current() - # Test Stream: base and subclass with same handle - my_stream = MyStream._init(options=StreamOptions(), device_id=device.device_id) - stream = Stream.from_handle(int(my_stream.handle)) + # Test Stream: two references to same handle + stream1 = device.create_stream() + stream2 = Stream.from_handle(int(stream1.handle)) - assert my_stream == stream, "Equal due to isinstance() check and same handle" - assert hash(my_stream) == hash(stream), "Equal objects have equal hashes" + assert stream1 == stream2, "Equal due to same handle" + assert hash(stream1) == hash(stream2), "Equal objects have equal hashes" - # Test Context: always returns base type from _from_ctx - ctx = device.context - my_ctx = MyContext._from_ctx(ctx._handle, device.device_id) + # Test Context: contexts from same device share same underlying context + ctx1 = device.context + ctx2 = device.create_stream().context - assert ctx == my_ctx, "Equal contexts with same handle" - assert hash(ctx) == hash(my_ctx), "Equal objects have equal hashes" + assert ctx1 == ctx2, "Equal contexts with same handle" + assert hash(ctx1) == hash(ctx2), "Equal objects have equal hashes" # Test that different handles still produce different hashes - my_stream2 = MyStream._init(options=StreamOptions(), device_id=device.device_id) - assert my_stream != my_stream2, "Different handles means not equal" - assert hash(my_stream) != hash(my_stream2), "Different objects have different hashes" + stream3 = device.create_stream() + assert stream1 != stream3, "Different handles means not equal" + assert hash(stream1) != hash(stream3), "Different objects have different hashes" diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 01b0b861af..925daa7cd5 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -74,7 +74,7 @@ def 
test_stream_context(init_cuda): stream = Device().create_stream(options=StreamOptions()) context = stream.context assert context is not None - assert context._handle is not None + assert context.handle is not None def test_stream_from_foreign_stream(init_cuda):