diff --git a/.gitattributes b/.gitattributes index 6a3ee0fe72..81f2361d4c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,9 @@ cuda/_version.py export-subst # we do not own any headers checked in, don't touch them *.h binary *.hpp binary +# Exception: headers we own (cuda_core C++ implementation) +cuda_core/cuda/core/_cpp/*.h -binary text diff +cuda_core/cuda/core/_cpp/*.hpp -binary text diff # git should not convert line endings in PNG files *.png binary *.svg binary diff --git a/.gitignore b/.gitignore index d3d7c31208..fb40fae6d0 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ __pycache__/ .pytest_cache/ .benchmarks/ *.cpp +!*_impl.cpp !cuda_bindings/cuda/bindings/_lib/param_packer.cpp !cuda_bindings/cuda/bindings/_bindings/loader.cpp cache_driver diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 87e2df13a7..e5320e9142 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -150,15 +150,21 @@ def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool "__init__.py", "_version.py", "_include", + "_cpp", # Headers for Cython development "cu12", "cu13", ) + # _resource_handles is shared (not CUDA-version-specific) and must stay + # at top level. It's imported early in __init__.py before versioned code. 
+ items_to_keep_prefix = ("_resource_handles",) all_items = os.scandir(base_wheel / base_dir) removed_count = 0 for f in all_items: f_abspath = f.path if f.name in items_to_keep: continue + if any(f.name.startswith(prefix) for prefix in items_to_keep_prefix): + continue if f.is_dir(): print(f" Removing directory: {f.name}", file=sys.stderr) shutil.rmtree(f_abspath) diff --git a/cuda_core/MANIFEST.in b/cuda_core/MANIFEST.in index 43d3815901..0bf6530caf 100644 --- a/cuda_core/MANIFEST.in +++ b/cuda_core/MANIFEST.in @@ -2,4 +2,4 @@ # # SPDX-License-Identifier: Apache-2.0 -recursive-include cuda/core *.pyx *.pxd +recursive-include cuda/core *.pyx *.pxd *.cpp *.hpp diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 4337783563..1f51b99112 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -86,7 +86,21 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH - all_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) + def get_sources(mod_name): + """Get source files for a module, including any .cpp files.""" + sources = [f"cuda/core/{mod_name}.pyx"] + + # Add module-specific .cpp file from _cpp/ directory if it exists + cpp_file = f"cuda/core/_cpp/{mod_name.lstrip('_')}.cpp" + if os.path.exists(cpp_file): + sources.append(cpp_file) + + return sources + + def get_extension_kwargs(mod_name): + """Return Extension kwargs (libraries, etc.) per module.""" + return {"extra_compile_args": extra_compile_args} + extra_compile_args = [] if COMPILE_FOR_COVERAGE: # CYTHON_TRACE_NOGIL indicates to trace nogil functions. 
It is not @@ -96,10 +110,14 @@ def get_cuda_paths(): ext_modules = tuple( Extension( f"cuda.core.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/{mod}.pyx"], - include_dirs=all_include_dirs, + sources=get_sources(mod), + include_dirs=[ + "cuda/core/_include", + "cuda/core/_cpp", + ] + + list(os.path.join(root, "include") for root in get_cuda_paths()), language="c++", - extra_compile_args=extra_compile_args, + **get_extension_kwargs(mod), ) for mod in module_names ) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index a10812606e..6bebb13b4e 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -15,6 +15,15 @@ import importlib +# The _resource_handles module exports a PyCapsule dispatch table that other +# extension modules access via PyCapsule_Import. We import it here to ensure +# it's loaded before other modules try to use it. +# +# We use importlib.import_module with the full path to avoid triggering +# circular import issues that can occur with relative imports during +# package initialization. +_resource_handles = importlib.import_module("cuda.core._resource_handles") + subdir = f"cu{cuda_major}" try: versioned_mod = importlib.import_module(f".{subdir}", __package__) diff --git a/cuda_core/cuda/core/_context.pxd b/cuda_core/cuda/core/_context.pxd new file mode 100644 index 0000000000..dc853fc75d --- /dev/null +++ b/cuda_core/cuda/core/_context.pxd @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core._resource_handles cimport ContextHandle + +cdef class Context: + """Cython declaration for Context class. + + This class provides access to CUDA contexts. Context objects cannot be + instantiated directly - use factory methods or Device/Stream APIs. 
+ """ + + cdef: + ContextHandle _h_context + int _device_id + + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id) diff --git a/cuda_core/cuda/core/_context.pyx b/cuda_core/cuda/core/_context.pyx index c1c28b3389..64663aadf3 100644 --- a/cuda_core/cuda/core/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -4,35 +4,55 @@ from dataclasses import dataclass -from cuda.core._utils.cuda_utils import driver +from cuda.core._resource_handles cimport ( + ContextHandle, + intptr, + py, +) -@dataclass -class ContextOptions: - pass # TODO +__all__ = ['Context', 'ContextOptions'] cdef class Context: + """CUDA context wrapper. - cdef: - readonly object _handle - int _device_id + Context objects represent CUDA contexts and cannot be instantiated directly. + Use Device or Stream APIs to obtain context objects. + """ def __init__(self, *args, **kwargs): raise RuntimeError("Context objects cannot be instantiated directly. Please use Device or Stream APIs.") - @classmethod - def _from_ctx(cls, handle: driver.CUcontext, int device_id): - cdef Context ctx = Context.__new__(Context) - ctx._handle = handle + @staticmethod + cdef Context _from_handle(type cls, ContextHandle h_context, int device_id): + """Create Context from existing ContextHandle (cdef-only factory).""" + cdef Context ctx = cls.__new__(cls) + ctx._h_context = h_context ctx._device_id = device_id return ctx + @property + def handle(self): + """Return the underlying CUcontext handle.""" + if self._h_context.get() == NULL: + return None + return py(self._h_context) + def __eq__(self, other): if not isinstance(other, Context): return NotImplemented cdef Context _other = other - return int(self._handle) == int(_other._handle) + return intptr(self._h_context) == intptr(_other._h_context) def __hash__(self) -> int: - return hash(int(self._handle)) + return hash((type(self), intptr(self._h_context))) + + +@dataclass +class ContextOptions: + """Options for context creation. 
+ + Currently unused, reserved for future use. + """ + pass # TODO diff --git a/cuda_core/cuda/core/_cpp/DESIGN.md b/cuda_core/cuda/core/_cpp/DESIGN.md new file mode 100644 index 0000000000..003dcfd945 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/DESIGN.md @@ -0,0 +1,286 @@ +# Resource Handles Design + +This document describes the resource handle abstraction in cuda.core, which provides +robust lifetime management for CUDA resources. + +## Overview + +The cuda-core Python library provides a high-level interface to CUDA resources such as +Context, Device, Stream, and Event. These objects correspond to resources managed by +the CUDA Driver API, each having explicit creation and destruction routines. Several +of these CUDA resources also participate in non-trivial ownership hierarchies (e.g., +a stream belongs to a context), and releasing them may require additional arguments +or other resources (e.g., a device pointer freed through a specific stream). + +### Goals + +The goal of the handle abstraction is to provide a robust, explicit, and Python-agnostic +layer for ownership and lifetime management of CUDA resources. The intent is to use +handles as the backbone of the cuda-core resource hierarchy, enabling cuda-core Python +objects to manipulate handles rather than work directly with raw CUDA resources. + +While Python-facing objects expose convenient APIs and additional behaviors, the handle +layer isolates all concerns related to resource lifetime. 
By cleanly separating these +responsibilities, we achieve: + +- **Clearer architecture** with minimal cross-layer coupling +- **Safe transfer of resource ownership** between Python and other domains, including C++ +- **Ability to preserve resource validity** independent of Python +- **Well-specified semantics** for immutability, ownership, and reachability +- **Simplified reasoning about resource lifetimes**, especially with nested or dependent resources + +### Handle Semantics + +Resource handles provide **referentially transparent** wrappers around CUDA resources: + +- **No rebinding**: A handle always refers to the same resource. +- **No invalidation**: If a handle exists, its resource is valid. +- **Structural dependencies**: If resource A depends on resource B, A's handle + embeds B's handle, automatically extending B's lifetime. + +This eliminates global lifetime analysis. Correctness is enforced structurally—if you +have a handle, you have a valid resource. + +## Handle Types + +All handles are `std::shared_ptr` aliases that expose only the raw CUDA resource: + +```cpp +using ContextHandle = std::shared_ptr; +using StreamHandle = std::shared_ptr; +using EventHandle = std::shared_ptr; +using MemoryPoolHandle = std::shared_ptr; +using DevicePtrHandle = std::shared_ptr; +``` + +Internally, handles use **shared pointer aliasing**: the actual managed object is a +"box" containing the resource, its dependencies, and any state needed for destruction. +The public handle points only to the raw resource field, keeping the API minimal. + +### Why shared_ptr? + +- **Automatic reference counting**: Resources are released when the last reference + disappears. +- **Cross-language stability**: Works across Python/C++ boundaries without relying + on Python's garbage collector. +- **Interpreter independence**: Resources remain valid even during Python shutdown. 
+- **Type-erased deleters**: Destruction logic is captured at creation time, supporting + diverse lifetime strategies. + +## Accessing Handle Values + +Handles can be accessed in three ways via overloaded helper functions: + +| Function | Returns | Use Case | Notes | +|----------|---------|----------|-------| +| `native(h)` | Raw CUDA type (e.g., `CUstream`) | Passing to CUDA APIs | An attribute of `cuda.bindings.cydriver` | +| `intptr(h)` | `intptr_t` | Python interop, foreign code | | +| `py(h)` | Python wrapper object | Returning to Python callers | An attribute of `cuda.bindings.driver` | + +These overloads exist because `std::shared_ptr` cannot have additional attributes. +Wrapping handles in Python objects would be superfluous overhead for internal use, +so we provide these helpers instead. + +Example usage from Cython: + +```cython +# Get raw handle for CUDA API calls +cdef CUstream raw_stream = native(h_stream) # cuda.bindings.cydriver.CUstream + +# Get as integer for other use cases +return hash(intptr(h_stream)) + +# Get Python wrapper for returning to user +return py(h_stream) # cuda.bindings.driver.CUstream +``` + +## Code Structure + +### Directory Layout + +``` +cuda/core/ +├── _resource_handles.pyx # Cython module (compiles resource_handles.cpp) +├── _resource_handles.pxd # Cython declarations and dispatch wrappers +└── _cpp/ + ├── resource_handles.hpp # C++ API declarations + ├── resource_handles.cpp # C++ implementation + └── resource_handles_cxx_api.hpp # Capsule struct definition +``` + +### Build Implications + +The `_cpp/` subdirectory contains C++ source files that are compiled into the +`_resource_handles` extension module. Other Cython modules in cuda.core do **not** +link against this code directly—they access it through a capsule mechanism +(explained below). 
+ +## Capsule Architecture + +The implementation uses **two separate capsule mechanisms** for different purposes: + +### Capsule 1: C++ API Table (`_CXX_API`) + +**Problem**: Cython extension modules compile independently. If multiple modules +(`_memory.pyx`, `_ipc.pyx`, etc.) each linked `resource_handles.cpp`, they would +each have their own copies of: + +- Static driver function pointers +- Thread-local error state +- Other static data, including global caches + +**Solution**: Only `_resource_handles.so` links the C++ code. It exports a capsule +containing function pointers: + +```cpp +struct ResourceHandlesCxxApiV1 { + uint32_t abi_version; + uint32_t struct_size; + + // Thread-local error handling + CUresult (*get_last_error)() noexcept; + CUresult (*peek_last_error)() noexcept; + void (*clear_last_error)() noexcept; + + // Handle creation functions + ContextHandle (*get_primary_context)(int device_id) noexcept; + StreamHandle (*create_stream_handle)(...) noexcept; + // ... etc +}; +``` + +Other Cython modules import this capsule at runtime and call through the function +pointers. The `.pxd` file provides inline wrappers that hide this indirection: + +```cython +cdef inline StreamHandle create_stream_handle(...) except * nogil: + return _handles_table.create_stream_handle(...) +``` + +Importing modules are expected to call `_init_handles_table()` prior to calling +any wrapper functions. + +### Capsule 2: CUDA Driver API (`_CUDA_DRIVER_API_V1`) + +**Problem**: cuda.core cannot directly call CUDA driver functions because: + +1. We don't want to link against `libcuda.so` at build time. +2. The driver symbols must be resolved dynamically through cuda-bindings. 
+ +**Solution**: `_resource_handles.pyx` creates a capsule containing CUDA driver +function pointers obtained from cuda-bindings: + +```cpp +struct CudaDriverApiV1 { + uint32_t abi_version; + uint32_t struct_size; + + uintptr_t cuDevicePrimaryCtxRetain; + uintptr_t cuDevicePrimaryCtxRelease; + uintptr_t cuStreamCreateWithPriority; + uintptr_t cuStreamDestroy; + // ... etc +}; +``` + +The C++ code retrieves this capsule once (via `load_driver_api()`) and caches the +function pointers for subsequent use. + +### Why Two Capsules? + +| Capsule | Direction | Purpose | +|---------|-----------|---------| +| `_CXX_API` | C++ → Cython | Share handle functions across modules | +| `_CUDA_DRIVER_API_V1` | Cython → C++ | Provide resolved driver symbols | + +## Key Implementation Details + +### Structural Dependencies + +When a resource depends on another, its handle embeds the dependency: + +```cpp +struct StreamBox { + CUstream resource; + ContextHandle h_context; // Keeps context alive +}; +``` + +The shared pointer's custom deleter captures any additional state needed for +destruction. This ensures resources are always destroyed in the correct order. + +### GIL Management + +Handle destructors may run from any thread. The implementation includes RAII guards +(`GILReleaseGuard`, `GILAcquireGuard`) that: + +- Release the GIL before calling CUDA APIs (for parallelism) +- Handle Python finalization gracefully (avoid GIL operations during shutdown) +- Ensure Python object manipulation happens with GIL held + +The handle API functions are safe to call with or without the GIL held. They +will release the GIL (if necessary) before calling CUDA driver API functions. + +### Error Handling + +Handle API functions do not raise Python exceptions. Instead, they return an empty +handle (null `shared_ptr`) on failure and store the error code in thread-local state. 
+Callers should check for failure and retrieve the error using `get_last_error()`: + +```cython +cdef StreamHandle h = create_stream_handle(h_ctx, flags, priority) +if not h: + # Handle creation failed - get the CUDA error code + cdef CUresult err = get_last_error() + # ... handle error (e.g., raise Python exception) +``` + +This design allows handle functions to be called from `nogil` blocks without requiring +GIL acquisition for exception handling on the success path. The error state is +thread-local, so concurrent calls from different threads do not interfere. + +Related functions: +- `get_last_error()`: Returns and clears the most recent error +- `peek_last_error()`: Returns the error without clearing it +- `clear_last_error()`: Clears the error state + +## Usage from Cython + +```cython +from cuda.core._resource_handles cimport ( + StreamHandle, + create_stream_handle, + native, + intptr, + get_last_error, + _init_handles_table, +) + +_init_handles_table() # prerequisite before calling handle API functions + +# Create a stream +cdef StreamHandle h_stream = create_stream_handle(h_ctx, flags, priority) +if not h_stream: + HANDLE_RETURN(get_last_error()) + +# Use in CUDA API +cuStreamSynchronize(native(h_stream)) + +# Return to Python +return py(h_stream) +``` + +## Summary + +The resource handle design: + +1. **Separates resource management** into its own layer, independent of Python objects. +2. **Encodes lifetimes structurally** via embedded handle dependencies. +3. **Uses capsules** to solve two distinct problems: + - Sharing C++ code across Cython modules without duplicate statics. + - Resolving CUDA driver symbols dynamically through cuda-bindings. +4. **Provides overloaded accessors** (`native`, `intptr`, `py`) since handles cannot + have attributes without unnecessary Python object wrappers. 
+ +This architecture ensures CUDA resources are managed correctly regardless of Python +garbage collection timing, interpreter shutdown, or cross-language usage patterns. diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp new file mode 100644 index 0000000000..5ffc84145c --- /dev/null +++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp @@ -0,0 +1,877 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "resource_handles.hpp" +#include "resource_handles_cxx_api.hpp" +#include +#include +#include +#include +#include +#include + +namespace cuda_core { + +// ============================================================================ +// CUDA driver lazy resolution via cuda-bindings (CPU-only import + MVC) +// ============================================================================ + +namespace { + +static std::once_flag driver_load_once; +static bool driver_loaded = false; + +#if PY_VERSION_HEX < 0x030D0000 +extern "C" int _Py_IsFinalizing(void); +#endif + +static inline bool py_is_finalizing() noexcept { +#if PY_VERSION_HEX >= 0x030D0000 + return Py_IsFinalizing(); +#else + // Python < 3.13 does not expose Py_IsFinalizing() publicly. Use the private + // API that exists in those versions. + return _Py_IsFinalizing() != 0; +#endif +} + +// ============================================================================ +// GIL management helpers +// ============================================================================ + +// Helper to release the GIL while calling into the CUDA driver. +// This guard is *conditional*: if the caller already dropped the GIL, +// we avoid calling PyEval_SaveThread (which requires holding the GIL). +// It also handles the case where Python is finalizing and GIL operations +// are no longer safe. 
+class GILReleaseGuard { +public: + GILReleaseGuard() : tstate_(nullptr), released_(false) { + // Don't try to manipulate GIL if Python is finalizing + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + // PyGILState_Check() returns 1 if the GIL is held by this thread. + if (PyGILState_Check()) { + tstate_ = PyEval_SaveThread(); + released_ = true; + } + } + + ~GILReleaseGuard() { + if (released_) { + PyEval_RestoreThread(tstate_); + } + } + + // Non-copyable, non-movable + GILReleaseGuard(const GILReleaseGuard&) = delete; + GILReleaseGuard& operator=(const GILReleaseGuard&) = delete; + +private: + PyThreadState* tstate_; + bool released_; +}; + +// Helper to acquire the GIL when we might not hold it. +// Use in C++ destructors that need to manipulate Python objects. +class GILAcquireGuard { +public: + GILAcquireGuard() : acquired_(false) { + // Don't try to acquire GIL if Python is finalizing + if (!Py_IsInitialized() || py_is_finalizing()) { + return; + } + gstate_ = PyGILState_Ensure(); + acquired_ = true; + } + + ~GILAcquireGuard() { + if (acquired_) { + PyGILState_Release(gstate_); + } + } + + bool acquired() const { return acquired_; } + + // Non-copyable, non-movable + GILAcquireGuard(const GILAcquireGuard&) = delete; + GILAcquireGuard& operator=(const GILAcquireGuard&) = delete; + +private: + PyGILState_STATE gstate_; + bool acquired_; +}; + + +#define DECLARE_DRIVER_FN(name) using name##_t = decltype(&name); static name##_t p_##name = nullptr + +DECLARE_DRIVER_FN(cuDevicePrimaryCtxRetain); +DECLARE_DRIVER_FN(cuDevicePrimaryCtxRelease); +DECLARE_DRIVER_FN(cuCtxGetCurrent); + +DECLARE_DRIVER_FN(cuStreamCreateWithPriority); +DECLARE_DRIVER_FN(cuStreamDestroy); + +DECLARE_DRIVER_FN(cuEventCreate); +DECLARE_DRIVER_FN(cuEventDestroy); +DECLARE_DRIVER_FN(cuIpcOpenEventHandle); + +DECLARE_DRIVER_FN(cuDeviceGetCount); + +DECLARE_DRIVER_FN(cuMemPoolSetAccess); +DECLARE_DRIVER_FN(cuMemPoolDestroy); +DECLARE_DRIVER_FN(cuMemPoolCreate); 
+DECLARE_DRIVER_FN(cuDeviceGetMemPool); +DECLARE_DRIVER_FN(cuMemPoolImportFromShareableHandle); + +DECLARE_DRIVER_FN(cuMemAllocFromPoolAsync); +DECLARE_DRIVER_FN(cuMemAllocAsync); +DECLARE_DRIVER_FN(cuMemAlloc); +DECLARE_DRIVER_FN(cuMemAllocHost); + +DECLARE_DRIVER_FN(cuMemFreeAsync); +DECLARE_DRIVER_FN(cuMemFree); +DECLARE_DRIVER_FN(cuMemFreeHost); + +DECLARE_DRIVER_FN(cuMemPoolImportPointer); + +#undef DECLARE_DRIVER_FN + +static bool load_driver_api() noexcept { + struct CudaDriverApiV1 { + std::uint32_t abi_version; + std::uint32_t struct_size; + + std::uintptr_t cuDevicePrimaryCtxRetain; + std::uintptr_t cuDevicePrimaryCtxRelease; + std::uintptr_t cuCtxGetCurrent; + + std::uintptr_t cuStreamCreateWithPriority; + std::uintptr_t cuStreamDestroy; + + std::uintptr_t cuEventCreate; + std::uintptr_t cuEventDestroy; + std::uintptr_t cuIpcOpenEventHandle; + + std::uintptr_t cuDeviceGetCount; + + std::uintptr_t cuMemPoolSetAccess; + std::uintptr_t cuMemPoolDestroy; + std::uintptr_t cuMemPoolCreate; + std::uintptr_t cuDeviceGetMemPool; + std::uintptr_t cuMemPoolImportFromShareableHandle; + + std::uintptr_t cuMemAllocFromPoolAsync; + std::uintptr_t cuMemAllocAsync; + std::uintptr_t cuMemAlloc; + std::uintptr_t cuMemAllocHost; + + std::uintptr_t cuMemFreeAsync; + std::uintptr_t cuMemFree; + std::uintptr_t cuMemFreeHost; + + std::uintptr_t cuMemPoolImportPointer; + }; + + static constexpr const char* capsule_name = + "cuda.core._resource_handles._CUDA_DRIVER_API_V1"; + + GILAcquireGuard gil; + if (!gil.acquired()) { + return false; + } + + // `_resource_handles` is already loaded (it exports the handle API capsule), + // so avoid import machinery and just grab the module object. 
+ PyObject* mod = PyImport_AddModule("cuda.core._resource_handles"); // borrowed + if (!mod) { + PyErr_Clear(); + return false; + } + + PyObject* fn = PyObject_GetAttrString(mod, "_get_cuda_driver_api_v1_capsule"); // new ref + if (!fn) { + PyErr_Clear(); + return false; + } + + PyObject* cap = PyObject_CallFunctionObjArgs(fn, nullptr); + Py_DECREF(fn); + if (!cap) { + PyErr_Clear(); + return false; + } + + const auto* api = static_cast(PyCapsule_GetPointer(cap, capsule_name)); + Py_DECREF(cap); + + if (!api) { + PyErr_Clear(); + return false; + } + if (api->abi_version != 1 || api->struct_size < sizeof(CudaDriverApiV1)) { + return false; + } + +#define LOAD_ADDR(name) \ + do { \ + if (api->name == 0) { \ + return false; \ + } \ + p_##name = reinterpret_cast(api->name); \ + } while (0) + + LOAD_ADDR(cuDevicePrimaryCtxRetain); + LOAD_ADDR(cuDevicePrimaryCtxRelease); + LOAD_ADDR(cuCtxGetCurrent); + + LOAD_ADDR(cuStreamCreateWithPriority); + LOAD_ADDR(cuStreamDestroy); + + LOAD_ADDR(cuEventCreate); + LOAD_ADDR(cuEventDestroy); + LOAD_ADDR(cuIpcOpenEventHandle); + + LOAD_ADDR(cuDeviceGetCount); + + LOAD_ADDR(cuMemPoolSetAccess); + LOAD_ADDR(cuMemPoolDestroy); + LOAD_ADDR(cuMemPoolCreate); + LOAD_ADDR(cuDeviceGetMemPool); + LOAD_ADDR(cuMemPoolImportFromShareableHandle); + + LOAD_ADDR(cuMemAllocFromPoolAsync); + LOAD_ADDR(cuMemAllocAsync); + LOAD_ADDR(cuMemAlloc); + LOAD_ADDR(cuMemAllocHost); + + LOAD_ADDR(cuMemFreeAsync); + LOAD_ADDR(cuMemFree); + LOAD_ADDR(cuMemFreeHost); + + LOAD_ADDR(cuMemPoolImportPointer); + +#undef LOAD_ADDR + + return true; +} + +static bool ensure_driver_loaded() noexcept { + std::call_once(driver_load_once, []() { driver_loaded = load_driver_api(); }); + return driver_loaded; +} + +} // namespace + +// ============================================================================ +// Thread-local error handling +// ============================================================================ + +// Thread-local status of the most recent CUDA API 
call in this module. +thread_local CUresult err = CUDA_SUCCESS; + +CUresult get_last_error() noexcept { + CUresult e = err; + err = CUDA_SUCCESS; + return e; +} + +CUresult peek_last_error() noexcept { + return err; +} + +void clear_last_error() noexcept { + err = CUDA_SUCCESS; +} + +// ============================================================================ +// Context Handles +// ============================================================================ + +struct ContextBox { + CUcontext resource; +}; + +ContextHandle create_context_handle_ref(CUcontext ctx) { + auto box = std::make_shared(ContextBox{ctx}); + return ContextHandle(box, &box->resource); +} + +// Thread-local cache of primary contexts indexed by device ID +thread_local std::vector primary_context_cache; + +ContextHandle get_primary_context(int device_id) noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + // Check thread-local cache + if (static_cast(device_id) < primary_context_cache.size()) { + if (auto cached = primary_context_cache[device_id]) { + return cached; + } + } + + // Cache miss - acquire primary context from driver + GILReleaseGuard gil; + CUcontext ctx; + if (CUDA_SUCCESS != (err = p_cuDevicePrimaryCtxRetain(&ctx, device_id))) { + return {}; + } + + auto box = std::shared_ptr( + new ContextBox{ctx}, + [device_id](const ContextBox* b) { + GILReleaseGuard gil; + p_cuDevicePrimaryCtxRelease(device_id); + delete b; + } + ); + auto h = ContextHandle(box, &box->resource); + + // Update cache + if (static_cast(device_id) >= primary_context_cache.size()) { + primary_context_cache.resize(device_id + 1); + } + primary_context_cache[device_id] = h; + return h; +} + +ContextHandle get_current_context() noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUcontext ctx = nullptr; + if (CUDA_SUCCESS != (err = p_cuCtxGetCurrent(&ctx))) { + return {}; + } + if (!ctx) { + return 
{}; // No current context (not an error) + } + return create_context_handle_ref(ctx); +} + +// ============================================================================ +// Stream Handles +// ============================================================================ + +struct StreamBox { + CUstream resource; +}; + +StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUstream stream; + if (CUDA_SUCCESS != (err = p_cuStreamCreateWithPriority(&stream, flags, priority))) { + return {}; + } + + auto box = std::shared_ptr( + new StreamBox{stream}, + [h_ctx](const StreamBox* b) { + GILReleaseGuard gil; + p_cuStreamDestroy(b->resource); + delete b; + } + ); + return StreamHandle(box, &box->resource); +} + +StreamHandle create_stream_handle_ref(CUstream stream) { + auto box = std::make_shared(StreamBox{stream}); + return StreamHandle(box, &box->resource); +} + +StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner) { + Py_XINCREF(owner); + auto box = std::shared_ptr( + new StreamBox{stream}, + [owner](const StreamBox* b) { + GILAcquireGuard gil; + if (gil.acquired()) { + Py_XDECREF(owner); + } + delete b; + } + ); + return StreamHandle(box, &box->resource); +} + +StreamHandle get_legacy_stream() noexcept { + static StreamHandle handle = create_stream_handle_ref(CU_STREAM_LEGACY); + return handle; +} + +StreamHandle get_per_thread_stream() noexcept { + static StreamHandle handle = create_stream_handle_ref(CU_STREAM_PER_THREAD); + return handle; +} + +// ============================================================================ +// Event Handles +// ============================================================================ + +struct EventBox { + CUevent resource; +}; + +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) { + if (!ensure_driver_loaded()) { + err = 
CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUevent event; + if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) { + return {}; + } + + auto box = std::shared_ptr( + new EventBox{event}, + [h_ctx](const EventBox* b) { + GILReleaseGuard gil; + p_cuEventDestroy(b->resource); + delete b; + } + ); + return EventHandle(box, &box->resource); +} + +EventHandle create_event_handle(unsigned int flags) { + return create_event_handle(ContextHandle{}, flags); +} + +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUevent event; + if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) { + return {}; + } + + auto box = std::shared_ptr( + new EventBox{event}, + [](const EventBox* b) { + GILReleaseGuard gil; + p_cuEventDestroy(b->resource); + delete b; + } + ); + return EventHandle(box, &box->resource); +} + +// ============================================================================ +// Memory Pool Handles +// ============================================================================ + +struct MemoryPoolBox { + CUmemoryPool resource; +}; + +// Helper to clear peer access before destroying a memory pool. +// Works around nvbug 5698116: recycled pool handles inherit peer access state. 
+static void clear_mempool_peer_access(CUmemoryPool pool) { + int device_count = 0; + if (p_cuDeviceGetCount(&device_count) != CUDA_SUCCESS || device_count <= 0) { + return; + } + + std::vector clear_access(device_count); + for (int i = 0; i < device_count; ++i) { + clear_access[i].location.type = CU_MEM_LOCATION_TYPE_DEVICE; + clear_access[i].location.id = i; + clear_access[i].flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; + } + p_cuMemPoolSetAccess(pool, clear_access.data(), device_count); // Best effort +} + +static MemoryPoolHandle wrap_mempool_owned(CUmemoryPool pool) { + auto box = std::shared_ptr( + new MemoryPoolBox{pool}, + [](const MemoryPoolBox* b) { + GILReleaseGuard gil; + clear_mempool_peer_access(b->resource); + p_cuMemPoolDestroy(b->resource); + delete b; + } + ); + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUmemoryPool pool; + if (CUDA_SUCCESS != (err = p_cuMemPoolCreate(&pool, &props))) { + return {}; + } + return wrap_mempool_owned(pool); +} + +MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool) { + auto box = std::make_shared(MemoryPoolBox{pool}); + return MemoryPoolHandle(box, &box->resource); +} + +MemoryPoolHandle get_device_mempool(int device_id) noexcept { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUmemoryPool pool; + if (CUDA_SUCCESS != (err = p_cuDeviceGetMemPool(&pool, device_id))) { + return {}; + } + return create_mempool_handle_ref(pool); +} + +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUmemoryPool pool; + auto handle_ptr = reinterpret_cast(static_cast(fd)); + if (CUDA_SUCCESS != (err = 
p_cuMemPoolImportFromShareableHandle(&pool, handle_ptr, handle_type, 0))) { + return {}; + } + return wrap_mempool_owned(pool); +} + +// ============================================================================ +// Device Pointer Handles +// ============================================================================ + +struct DevicePtrBox { + CUdeviceptr resource; + mutable StreamHandle h_stream; +}; + +static DevicePtrBox* get_box(const DevicePtrHandle& h) { + const CUdeviceptr* p = h.get(); + return reinterpret_cast( + reinterpret_cast(const_cast(p)) - offsetof(DevicePtrBox, resource) + ); +} + +StreamHandle deallocation_stream(const DevicePtrHandle& h) { + return get_box(h)->h_stream; +} + +void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) { + get_box(h)->h_stream = std::move(h_stream); +} + +DevicePtrHandle deviceptr_alloc_from_pool(size_t size, MemoryPoolHandle h_pool, StreamHandle h_stream) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemAllocFromPoolAsync(&ptr, size, *h_pool, native(h_stream)))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemAllocAsync(&ptr, size, native(h_stream)))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle 
deviceptr_alloc(size_t size) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemAlloc(&ptr, size))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFree(b->resource); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_alloc_host(size_t size) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + GILReleaseGuard gil; + void* ptr; + if (CUDA_SUCCESS != (err = p_cuMemAllocHost(&ptr, size))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{reinterpret_cast(ptr), StreamHandle{}}, + [](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeHost(reinterpret_cast(b->resource)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr) { + auto box = std::make_shared(DevicePtrBox{ptr, StreamHandle{}}); + return DevicePtrHandle(box, &box->resource); +} + +DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner) { + if (!owner) { + return deviceptr_create_ref(ptr); + } + Py_INCREF(owner); + auto box = std::shared_ptr( + new DevicePtrBox{ptr, StreamHandle{}}, + [owner](DevicePtrBox* b) { + GILAcquireGuard gil; + if (gil.acquired()) { + Py_DECREF(owner); + } + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); +} + +// ============================================================================ +// IPC Pointer Cache +// ============================================================================ +// This cache handles duplicate IPC imports, which behave differently depending +// on the memory type: +// +// 1. Memory pool allocations (DeviceMemoryResource): +// Multiple imports of the same allocation succeed and return duplicate +// pointers. 
However, the driver has a reference counting bug (nvbug 5570902) +// where the first cuMemFreeAsync incorrectly unmaps the memory even when +// imported multiple times. A driver fix is expected. +// +// 2. Pinned memory allocations (PinnedMemoryResource): +// Duplicate imports result in CUDA_ERROR_ALREADY_MAPPED. +// +// The cache solves both issues by checking the cache before calling +// cuMemPoolImportPointer and returning the existing handle for duplicate +// imports. This provides a consistent user experience where the same IPC +// descriptor can be imported multiple times regardless of memory type. +// +// The cache key is the export_data bytes (CUmemPoolPtrExportData), not the +// returned pointer, because we must check before calling the driver API. + +// TODO: When driver fix for nvbug 5570902 is available, consider whether +// the cache is still needed for memory pool allocations (it will still be +// needed for pinned memory). +static bool use_ipc_ptr_cache() { + return true; +} + +// Wrapper for CUmemPoolPtrExportData to use as map key +struct ExportDataKey { + CUmemPoolPtrExportData data; + + bool operator==(const ExportDataKey& other) const { + return std::memcmp(&data, &other.data, sizeof(data)) == 0; + } +}; + +struct ExportDataKeyHash { + std::size_t operator()(const ExportDataKey& key) const { + // Simple hash of the bytes + std::size_t h = 0; + const auto* bytes = reinterpret_cast(&key.data); + for (std::size_t i = 0; i < sizeof(key.data); ++i) { + h = h * 31 + bytes[i]; + } + return h; + } +}; + +static std::mutex ipc_ptr_cache_mutex; +static std::unordered_map, ExportDataKeyHash> ipc_ptr_cache; + +DevicePtrHandle deviceptr_import_ipc(MemoryPoolHandle h_pool, const void* export_data, StreamHandle h_stream) { + if (!ensure_driver_loaded()) { + err = CUDA_ERROR_NOT_INITIALIZED; + return {}; + } + + auto data = const_cast( + reinterpret_cast(export_data)); + + if (use_ipc_ptr_cache()) { + // Check cache before calling cuMemPoolImportPointer + 
ExportDataKey key; + std::memcpy(&key.data, data, sizeof(key.data)); + + std::lock_guard lock(ipc_ptr_cache_mutex); + + auto it = ipc_ptr_cache.find(key); + if (it != ipc_ptr_cache.end()) { + if (auto box = it->second.lock()) { + // Cache hit - return existing handle + return DevicePtrHandle(box, &box->resource); + } + ipc_ptr_cache.erase(it); // Expired entry + } + + // Cache miss - import the pointer + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) { + return {}; + } + + // Create new handle with cache-clearing deleter + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool, key](DevicePtrBox* b) { + GILReleaseGuard gil; + { + std::lock_guard lock(ipc_ptr_cache_mutex); + // Only erase if expired - avoids race where another thread + // replaced the entry with a new import before we acquired the lock. + auto it = ipc_ptr_cache.find(key); + if (it != ipc_ptr_cache.end() && it->second.expired()) { + ipc_ptr_cache.erase(it); + } + } + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + ipc_ptr_cache[key] = box; + return DevicePtrHandle(box, &box->resource); + + } else { + // No caching - simple handle creation + GILReleaseGuard gil; + CUdeviceptr ptr; + if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) { + return {}; + } + + auto box = std::shared_ptr( + new DevicePtrBox{ptr, h_stream}, + [h_pool](DevicePtrBox* b) { + GILReleaseGuard gil; + p_cuMemFreeAsync(b->resource, native(b->h_stream)); + delete b; + } + ); + return DevicePtrHandle(box, &box->resource); + } +} + +// ============================================================================ +// Capsule C++ API table +// ============================================================================ + +const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept { + static const ResourceHandlesCxxApiV1 table = []() { + ResourceHandlesCxxApiV1 t{}; + t.abi_version = 
RESOURCE_HANDLES_CXX_API_VERSION; + t.struct_size = static_cast(sizeof(ResourceHandlesCxxApiV1)); + + // Error handling + t.get_last_error = &get_last_error; + t.peek_last_error = &peek_last_error; + t.clear_last_error = &clear_last_error; + + // Context + t.create_context_handle_ref = &create_context_handle_ref; + t.get_primary_context = &get_primary_context; + t.get_current_context = &get_current_context; + + // Stream + t.create_stream_handle = &create_stream_handle; + t.create_stream_handle_ref = &create_stream_handle_ref; + t.create_stream_handle_with_owner = &create_stream_handle_with_owner; + t.get_legacy_stream = &get_legacy_stream; + t.get_per_thread_stream = &get_per_thread_stream; + + // Event (resolve overloads explicitly) + t.create_event_handle = + static_cast(&create_event_handle); + t.create_event_handle_noctx = + static_cast(&create_event_handle); + t.create_event_handle_ipc = &create_event_handle_ipc; + + // Memory pool + t.create_mempool_handle = &create_mempool_handle; + t.create_mempool_handle_ref = &create_mempool_handle_ref; + t.get_device_mempool = &get_device_mempool; + t.create_mempool_handle_ipc = &create_mempool_handle_ipc; + + // Device pointer + t.deviceptr_alloc_from_pool = &deviceptr_alloc_from_pool; + t.deviceptr_alloc_async = &deviceptr_alloc_async; + t.deviceptr_alloc = &deviceptr_alloc; + t.deviceptr_alloc_host = &deviceptr_alloc_host; + t.deviceptr_create_ref = &deviceptr_create_ref; + t.deviceptr_create_with_owner = &deviceptr_create_with_owner; + t.deviceptr_import_ipc = &deviceptr_import_ipc; + t.deallocation_stream = &deallocation_stream; + t.set_deallocation_stream = &set_deallocation_stream; + + return t; + }(); + return &table; +} + +} // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp new file mode 100644 index 0000000000..7649788fdd --- /dev/null +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -0,0 +1,298 @@ +// SPDX-FileCopyrightText: 
Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +namespace cuda_core { + +// ============================================================================ +// Thread-local error handling +// ============================================================================ + +// Get and clear the last CUDA error (like cudaGetLastError) +CUresult get_last_error() noexcept; + +// Get the last CUDA error without clearing it (like cudaPeekAtLastError) +CUresult peek_last_error() noexcept; + +// Explicitly clear the last error +void clear_last_error() noexcept; + +// ============================================================================ +// Handle type aliases - expose only the raw CUDA resource +// ============================================================================ + +using ContextHandle = std::shared_ptr; +using StreamHandle = std::shared_ptr; +using EventHandle = std::shared_ptr; +using MemoryPoolHandle = std::shared_ptr; + +// ============================================================================ +// Context handle functions +// ============================================================================ + +// Function to create a non-owning context handle (references existing context). 
+ContextHandle create_context_handle_ref(CUcontext ctx); + +// Get handle to the primary context for a device (with thread-local caching) +// Returns empty handle on error (caller must check) +ContextHandle get_primary_context(int device_id) noexcept; + +// Get handle to the current CUDA context +// Returns empty handle if no context is current (caller must check) +ContextHandle get_current_context() noexcept; + +// ============================================================================ +// Stream handle functions +// ============================================================================ + +// Create an owning stream handle by calling cuStreamCreateWithPriority. +// The stream structurally depends on the provided context handle. +// When the last reference is released, cuStreamDestroy is called automatically. +// Returns empty handle on error (caller must check). +StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority); + +// Create a non-owning stream handle (references existing stream). +// Use for borrowed streams (from foreign code) or built-in streams. +// The stream will NOT be destroyed when the handle is released. +// Caller is responsible for keeping the stream's context alive. +StreamHandle create_stream_handle_ref(CUstream stream); + +// Create a non-owning stream handle that prevents a Python owner from being GC'd. +// The owner's refcount is incremented; decremented when handle is released. +// The owner is responsible for keeping the stream's context alive. +StreamHandle create_stream_handle_with_owner(CUstream stream, PyObject* owner); + +// Get non-owning handle to the legacy default stream (CU_STREAM_LEGACY) +// Note: Legacy stream has no specific context dependency. +StreamHandle get_legacy_stream() noexcept; + +// Get non-owning handle to the per-thread default stream (CU_STREAM_PER_THREAD) +// Note: Per-thread stream has no specific context dependency. 
+StreamHandle get_per_thread_stream() noexcept; + +// ============================================================================ +// Event handle functions +// ============================================================================ + +// Create an owning event handle by calling cuEventCreate. +// The event structurally depends on the provided context handle. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags); + +// Create an owning event handle without context dependency. +// Use for temporary events that are created and destroyed in the same scope. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle(unsigned int flags); + +// Create an owning event handle from an IPC handle. +// The originating process owns the event and its context. +// When the last reference is released, cuEventDestroy is called automatically. +// Returns empty handle on error (caller must check). +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); + +// ============================================================================ +// Memory pool handle functions +// ============================================================================ + +// Create an owning memory pool handle by calling cuMemPoolCreate. +// Memory pools are device-scoped (not context-scoped). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle(const CUmemPoolProps& props); + +// Create a non-owning memory pool handle (references existing pool). +// Use for device default/current pools that are managed by the driver. +// The pool will NOT be destroyed when the handle is released. 
+MemoryPoolHandle create_mempool_handle_ref(CUmemoryPool pool); + +// Get non-owning handle to the current memory pool for a device. +// Returns empty handle on error (caller must check). +MemoryPoolHandle get_device_mempool(int device_id) noexcept; + +// Create an owning memory pool handle from an IPC import. +// The file descriptor is NOT owned by this handle (caller manages FD separately). +// When the last reference is released, cuMemPoolDestroy is called automatically. +// Returns empty handle on error (caller must check). +MemoryPoolHandle create_mempool_handle_ipc(int fd, CUmemAllocationHandleType handle_type); + +// ============================================================================ +// Device pointer handle functions +// ============================================================================ + +using DevicePtrHandle = std::shared_ptr; + +// Allocate device memory from a pool asynchronously via cuMemAllocFromPoolAsync. +// The pointer structurally depends on the provided pool handle (captured in deleter). +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream); + +// Allocate device memory asynchronously via cuMemAllocAsync. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream); + +// Allocate device memory synchronously via cuMemAlloc. +// When the last reference is released, cuMemFree is called. +// Returns empty handle on error (caller must check). +DevicePtrHandle deviceptr_alloc(size_t size); + +// Allocate pinned host memory via cuMemAllocHost. +// When the last reference is released, cuMemFreeHost is called. +// Returns empty handle on error (caller must check). 
+DevicePtrHandle deviceptr_alloc_host(size_t size); + +// Create a non-owning device pointer handle (references existing pointer). +// Use for foreign pointers (e.g., from external libraries). +// The pointer will NOT be freed when the handle is released. +DevicePtrHandle deviceptr_create_ref(CUdeviceptr ptr); + +// Create a non-owning device pointer handle that prevents a Python owner from being GC'd. +// The owner's refcount is incremented; decremented when handle is released. +// The pointer will NOT be freed when the handle is released. +// If owner is nullptr, equivalent to deviceptr_create_ref. +DevicePtrHandle deviceptr_create_with_owner(CUdeviceptr ptr, PyObject* owner); + +// Import a device pointer from IPC via cuMemPoolImportPointer. +// When the last reference is released, cuMemFreeAsync is called on the stored stream. +// Note: Does not yet implement reference counting for nvbug 5570902. +// On error, returns empty handle and sets thread-local error (use get_last_error()). +DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream); + +// Access the deallocation stream for a device pointer handle (read-only). +// For non-owning handles, the stream is not used but can still be accessed. +StreamHandle deallocation_stream(const DevicePtrHandle& h); + +// Set the deallocation stream for a device pointer handle. +void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream); + +// ============================================================================ +// Overloaded helper functions to extract raw resources from handles +// ============================================================================ + +// native() - extract the raw CUDA handle +inline CUcontext native(const ContextHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUstream native(const StreamHandle& h) noexcept { + return h ? 
*h : nullptr; +} + +inline CUevent native(const EventHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUmemoryPool native(const MemoryPoolHandle& h) noexcept { + return h ? *h : nullptr; +} + +inline CUdeviceptr native(const DevicePtrHandle& h) noexcept { + return h ? *h : 0; +} + +// intptr() - extract handle as intptr_t for Python interop +// Using signed intptr_t per C standard convention and issue #1342 +inline std::intptr_t intptr(const ContextHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const StreamHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const EventHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const MemoryPoolHandle& h) noexcept { + return reinterpret_cast(h ? *h : nullptr); +} + +inline std::intptr_t intptr(const DevicePtrHandle& h) noexcept { + return h ? static_cast(*h) : 0; +} + +// py() - convert handle to Python driver wrapper object +// Returns new reference. Caller must hold GIL. +inline PyObject* py(const ContextHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUcontext"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const StreamHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUstream"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? 
reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const EventHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUevent"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const MemoryPoolHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUmemoryPool"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? reinterpret_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +inline PyObject* py(const DevicePtrHandle& h) { + static PyObject* cls = nullptr; + if (!cls) { + PyObject* mod = PyImport_ImportModule("cuda.bindings.driver"); + if (!mod) return nullptr; + cls = PyObject_GetAttrString(mod, "CUdeviceptr"); + Py_DECREF(mod); + if (!cls) return nullptr; + } + std::uintptr_t val = h ? static_cast(*h) : 0; + return PyObject_CallFunction(cls, "K", val); +} + +} // namespace cuda_core diff --git a/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp b/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp new file mode 100644 index 0000000000..6ff07a6ee0 --- /dev/null +++ b/cuda_core/cuda/core/_cpp/resource_handles_cxx_api.hpp @@ -0,0 +1,79 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "resource_handles.hpp" + +namespace cuda_core { + +// C++ capsule API for cross-extension-module calls. 
+// +// The function-pointer table is exported from the Python extension module +// `cuda.core._resource_handles` as a PyCapsule named: +// +// "cuda.core._resource_handles._CXX_API" +// +// Other extension modules import the capsule and dispatch through the table to +// ensure there is a single owner of all correctness-critical static/thread_local +// state in resource_handles.cpp (caches, last-error state, etc.). + +static constexpr std::uint32_t RESOURCE_HANDLES_CXX_API_VERSION = 1; + +struct ResourceHandlesCxxApiV1 { + std::uint32_t abi_version; + std::uint32_t struct_size; + + // Thread-local error handling + CUresult (*get_last_error)() noexcept; + CUresult (*peek_last_error)() noexcept; + void (*clear_last_error)() noexcept; + + // Context handles + ContextHandle (*create_context_handle_ref)(CUcontext ctx); + ContextHandle (*get_primary_context)(int device_id) noexcept; + ContextHandle (*get_current_context)() noexcept; + + // Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority); + StreamHandle (*create_stream_handle_ref)(CUstream stream); + StreamHandle (*create_stream_handle_with_owner)(CUstream stream, PyObject* owner); + StreamHandle (*get_legacy_stream)() noexcept; + StreamHandle (*get_per_thread_stream)() noexcept; + + // Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags); + EventHandle (*create_event_handle_noctx)(unsigned int flags); + EventHandle (*create_event_handle_ipc)(const CUipcEventHandle& ipc_handle); + + // Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const CUmemPoolProps& props); + MemoryPoolHandle (*create_mempool_handle_ref)(CUmemoryPool pool); + MemoryPoolHandle (*get_device_mempool)(int device_id) noexcept; + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, CUmemAllocationHandleType handle_type); + + // Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + 
StreamHandle h_stream); + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream); + DevicePtrHandle (*deviceptr_alloc)(size_t size); + DevicePtrHandle (*deviceptr_alloc_host)(size_t size); + DevicePtrHandle (*deviceptr_create_ref)(CUdeviceptr ptr); + DevicePtrHandle (*deviceptr_create_with_owner)(CUdeviceptr ptr, PyObject* owner); + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream); + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h); + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream); +}; + +// Return pointer to a process-wide singleton table. +const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() noexcept; + +} // namespace cuda_core diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index 2d775b6580..014b7dae78 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -11,8 +11,19 @@ from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core._context import Context, ContextOptions +from cuda.core._context cimport Context +from cuda.core._context import ContextOptions +from cuda.core._event cimport Event as cyEvent from cuda.core._event import Event, EventOptions +from cuda.core._resource_handles cimport ( + ContextHandle, + _init_handles_table, + create_context_handle_ref, + get_primary_context, + native, +) + +_init_handles_table() from cuda.core._graph import GraphBuilder from cuda.core._stream import IsStreamT, Stream, StreamOptions from cuda.core._utils.clear_error_support import assert_type @@ -908,20 +919,6 @@ cdef class DeviceProperties: ) -cdef cydriver.CUcontext _get_primary_context(int dev_id) except?NULL: - try: - primary_ctxs = _tls.primary_ctxs - except AttributeError: - total = len(_tls.devices) - primary_ctxs = _tls.primary_ctxs = [0] * total - cdef cydriver.CUcontext 
ctx = (primary_ctxs[dev_id]) - if ctx == NULL: - with nogil: - HANDLE_RETURN(cydriver.cuDevicePrimaryCtxRetain(&ctx, dev_id)) - primary_ctxs[dev_id] = (ctx) - return ctx - - class Device: """Represent a GPU and act as an entry point for cuda.core features. @@ -948,7 +945,7 @@ class Device: Default value of `None` return the currently used device. """ - __slots__ = ("_id", "_memory_resource", "_has_inited", "_properties", "_uuid") + __slots__ = ("_device_id", "_memory_resource", "_has_inited", "_properties", "_uuid", "_context") def __new__(cls, device_id: Device | int | None = None): # Handle device_id argument. @@ -973,10 +970,9 @@ class Device: if err == cydriver.CUresult.CUDA_SUCCESS: device_id = int(dev) elif err == cydriver.CUresult.CUDA_ERROR_INVALID_CONTEXT: - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) - assert (ctx) == NULL - device_id = 0 # cudart behavior + # No context is current - verify and default to device 0 (cudart behavior) + assert cydriver.cuCtxGetCurrent(&ctx) == cydriver.CUresult.CUDA_SUCCESS and ctx == NULL + device_id = 0 else: HANDLE_RETURN(err) elif device_id < 0: @@ -990,13 +986,14 @@ class Device: with nogil: HANDLE_RETURN(cydriver.cuDeviceGetCount(&total)) devices = _tls.devices = [] - for dev_id in range(total): + for i in range(total): device = super().__new__(cls) - device._id = dev_id + device._device_id = i device._memory_resource = None device._has_inited = False device._properties = None device._uuid = None + device._context = None devices.append(device) try: @@ -1007,22 +1004,9 @@ class Device: def _check_context_initialized(self): if not self._has_inited: raise CUDAError( - f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?" + f"Device {self._device_id} is not yet initialized, perhaps you forgot to call .set_current() first?" 
) - def _get_current_context(self, bint check_consistency=False) -> driver.CUcontext: - cdef cydriver.CUcontext ctx - cdef cydriver.CUdevice dev - cdef cydriver.CUdevice this_dev = self._id - with nogil: - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&ctx)) - if ctx == NULL: - raise CUDAError("No context is bound to the calling CPU thread.") - if check_consistency: - HANDLE_RETURN(cydriver.cuCtxGetDevice(&dev)) - if dev != this_dev: - raise CUDAError("Internal error (current device is not equal to Device.device_id)") - return driver.CUcontext(ctx) @classmethod def get_all_devices(cls): @@ -1041,12 +1025,12 @@ class Device: @property def device_id(self) -> int: """Return device ordinal.""" - return self._id + return self._device_id @property def pci_bus_id(self) -> str: """Return a PCI Bus Id string for this device.""" - bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._id)) + bus_id = handle_return(runtime.cudaDeviceGetPCIBusId(13, self._device_id)) return bus_id[:12].decode() def can_access_peer(self, peer: Device | int) -> bool: @@ -1092,7 +1076,7 @@ class Device: cdef str uuid_hex if self._uuid is None: - dev = self._id + dev = self._device_id with nogil: IF CUDA_CORE_BUILD_MAJOR == 12: HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, dev)) @@ -1111,7 +1095,7 @@ class Device: cdef int LENGTH = 256 cdef bytes name = bytes(LENGTH) cdef char* name_ptr = name - cdef cydriver.CUdevice this_dev = self._id + cdef cydriver.CUdevice this_dev = self._device_id with nogil: HANDLE_RETURN(cydriver.cuDeviceGetName(name_ptr, LENGTH, this_dev)) name = name.split(b"\0")[0] @@ -1121,7 +1105,7 @@ class Device: def properties(self) -> DeviceProperties: """Return a :obj:`~_device.DeviceProperties` class with information about the device.""" if self._properties is None: - self._properties = DeviceProperties._init(self._id) + self._properties = DeviceProperties._init(self._device_id) return self._properties @@ -1142,7 +1126,7 @@ class Device: @property def context(self) -> 
Context: - """Return the current :obj:`~_context.Context` associated with this device. + """Return the :obj:`~_context.Context` associated with this device. Note ---- @@ -1150,8 +1134,7 @@ class Device: """ self._check_context_initialized() - ctx = self._get_current_context(check_consistency=True) - return Context._from_ctx(ctx, self._id) + return self._context @property def memory_resource(self) -> MemoryResource: @@ -1160,7 +1143,7 @@ class Device: if self._memory_resource is None: # If the device is in TCC mode, or does not support memory pools for some other reason, # use the SynchronousMemoryResource which does not use memory pools. - device_id = self._id + device_id = self._device_id with nogil: HANDLE_RETURN( cydriver.cuDeviceGetAttribute( @@ -1169,10 +1152,10 @@ class Device: ) if attr == 1: from cuda.core._memory import DeviceMemoryResource - self._memory_resource = DeviceMemoryResource(self._id) + self._memory_resource = DeviceMemoryResource(self._device_id) else: from cuda.core._memory import _SynchronousMemoryResource - self._memory_resource = _SynchronousMemoryResource(self._id) + self._memory_resource = _SynchronousMemoryResource(self._device_id) return self._memory_resource @@ -1197,10 +1180,10 @@ class Device: def __int__(self): """Return device_id.""" - return self._id + return self._device_id def __repr__(self): - return f"" + return f"" def __hash__(self) -> int: return hash(self.uuid) @@ -1208,7 +1191,7 @@ class Device: def __eq__(self, other) -> bool: if not isinstance(other, Device): return NotImplemented - return self._id == other._id + return self._device_id == other._device_id def __reduce__(self): return Device, (self.device_id,) @@ -1243,30 +1226,36 @@ class Device: >>> # ... do work on device 0 ... 
""" - cdef cydriver.CUcontext prev_ctx - cdef cydriver.CUcontext curr_ctx + cdef ContextHandle h_context + cdef cydriver.CUcontext prev_ctx, curr_ctx + if ctx is not None: # TODO: revisit once Context is cythonized assert_type(ctx, Context) - if ctx._id != self._id: + if ctx._device_id != self._device_id: raise RuntimeError( "the provided context was created on the device with" - f" id={ctx._id}, which is different from the target id={self._id}" + f" id={ctx._device_id}, which is different from the target id={self._device_id}" ) # prev_ctx is the previous context - curr_ctx = (ctx._handle) + curr_ctx = native(ctx._h_context) + prev_ctx = NULL with nogil: HANDLE_RETURN(cydriver.cuCtxPopCurrent(&prev_ctx)) HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) self._has_inited = True + self._context = ctx # Store owning context reference if prev_ctx != NULL: - return Context._from_ctx((prev_ctx), self._id) + return Context._from_handle(Context, create_context_handle_ref(prev_ctx), self._device_id) else: # use primary ctx - curr_ctx = _get_primary_context(self._id) + h_context = get_primary_context(self._device_id) + if h_context.get() == NULL: + raise ValueError("Cannot set NULL context as current") with nogil: - HANDLE_RETURN(cydriver.cuCtxSetCurrent(curr_ctx)) + HANDLE_RETURN(cydriver.cuCtxSetCurrent(native(h_context))) self._has_inited = True + self._context = Context._from_handle(Context, h_context, self._device_id) # Store owning context def create_context(self, options: ContextOptions = None) -> Context: """Create a new :obj:`~_context.Context` object. @@ -1317,7 +1306,7 @@ class Device: """ self._check_context_initialized() - return Stream._init(obj=obj, options=options, device_id=self._id) + return Stream._init(obj=obj, options=options, device_id=self._device_id, ctx=self._context) def create_event(self, options: EventOptions | None = None) -> Event: """Create an Event object without recording it to a Stream. 
@@ -1338,8 +1327,8 @@ class Device: """ self._check_context_initialized() - ctx = self._get_current_context() - return Event._init(self._id, ctx, options, True) + cdef Context ctx = self._context + return cyEvent._init(cyEvent, self._device_id, ctx._h_context, options, True) def allocate(self, size, stream: Stream | GraphBuilder | None = None) -> Buffer: """Allocate device memory from a specified stream. diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index 1f586f18df..f52c505079 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -3,17 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ContextHandle, EventHandle cdef class Event: cdef: - cydriver.CUevent _handle + EventHandle _h_event + ContextHandle _h_context # Cached for fast access bint _timing_disabled bint _busy_waited bint _ipc_enabled object _ipc_descriptor int _device_id - object _ctx_handle + + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) cpdef close(self) diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index e97fdfbab4..1dec487665 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -5,9 +5,21 @@ from __future__ import annotations cimport cpython -from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver +from cuda.core._context cimport Context +from cuda.core._resource_handles cimport ( + ContextHandle, + EventHandle, + _init_handles_table, + create_event_handle, + create_event_handle_ipc, + intptr, + native, + py, +) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN @@ -18,11 +30,9 @@ from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional -from cuda.core._context import Context from 
cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, - driver, ) if TYPE_CHECKING: import cuda.bindings @@ -81,15 +91,13 @@ cdef class Event: and they should instead be created through a :obj:`~_stream.Stream` object. """ - def __cinit__(self): - self._handle = (NULL) def __init__(self, *args, **kwargs): raise RuntimeError("Event objects cannot be instantiated directly. Please use Stream APIs (record).") - @classmethod - def _init(cls, device_id: int, ctx_handle: Context, options=None, is_free=False): - cdef Event self = Event.__new__(cls) + @staticmethod + cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free): + cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 self._timing_disabled = False @@ -111,23 +119,24 @@ cdef class Event: self._ipc_enabled = True if not self._timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") - with nogil: - HANDLE_RETURN(cydriver.cuEventCreate(&self._handle, flags)) + # C++ creates the event and returns owning handle with context dependency + cdef EventHandle h_event = create_event_handle(h_context, flags) + if not h_event: + raise RuntimeError("Failed to create CUDA event") + self._h_event = h_event + self._h_context = h_context self._device_id = device_id - self._ctx_handle = ctx_handle if opts.ipc_enabled: self.get_ipc_descriptor() return self cpdef close(self): - """Destroy the event.""" - if self._handle != NULL: - with nogil: - HANDLE_RETURN(cydriver.cuEventDestroy(self._handle)) - self._handle = (NULL) + """Destroy the event. - def __dealloc__(self): - self.close() + Releases the event handle. The underlying CUDA event is destroyed + when the last reference is released. 
+ """ + self._h_event.reset() def __isub__(self, other): return NotImplemented @@ -139,7 +148,7 @@ cdef class Event: # return self - other (in milliseconds) cdef float timing with nogil: - err = cydriver.cuEventElapsedTime(&timing, other._handle, self._handle) + err = cydriver.cuEventElapsedTime(&timing, native((other)._h_event), native(self._h_event)) if err == 0: return timing else: @@ -165,14 +174,14 @@ cdef class Event: raise RuntimeError(explanation) def __hash__(self) -> int: - return hash((self._ctx_handle, (self._handle))) + return hash((type(self), intptr(self._h_context), intptr(self._h_event))) def __eq__(self, other) -> bool: # Note: using isinstance because `Event` can be subclassed. if not isinstance(other, Event): return NotImplemented cdef Event _other = other - return (self._handle) == (_other._handle) + return intptr(self._h_event) == intptr(_other._h_event) def get_ipc_descriptor(self) -> IPCEventDescriptor: """Export an event allocated for sharing between processes.""" @@ -182,7 +191,7 @@ cdef class Event: raise RuntimeError("Event is not IPC-enabled") cdef cydriver.CUipcEventHandle data with nogil: - HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, (self._handle))) + HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, native(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) self._ipc_descriptor = IPCEventDescriptor._init(data_b, self._busy_waited) return self._ipc_descriptor @@ -193,14 +202,17 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - with nogil: - HANDLE_RETURN(cydriver.cuIpcOpenEventHandle(&self._handle, data)) + # IPC events: the originating process owns the event and its context + cdef EventHandle h_event = create_event_handle_ipc(data) + if not h_event: + raise RuntimeError("Failed to open IPC event handle") + self._h_event = h_event + self._h_context = 
ContextHandle() self._timing_disabled = True self._busy_waited = ipc_descriptor._busy_waited self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor - self._device_id = -1 # ?? - self._ctx_handle = None # ?? + self._device_id = -1 return self @property @@ -229,13 +241,13 @@ cdef class Event: """ with nogil: - HANDLE_RETURN(cydriver.cuEventSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuEventSynchronize(native(self._h_event))) @property def is_done(self) -> bool: """Return True if all captured works have been completed, otherwise False.""" with nogil: - result = cydriver.cuEventQuery(self._handle) + result = cydriver.cuEventQuery(native(self._h_event)) if result == cydriver.CUresult.CUDA_SUCCESS: return True if result == cydriver.CUresult.CUDA_ERROR_NOT_READY: @@ -251,7 +263,7 @@ cdef class Event: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Event.handle)``. """ - return driver.CUevent((self._handle)) + return py(self._h_event) @property def device(self) -> Device: @@ -271,8 +283,8 @@ cdef class Event: @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this event.""" - if self._ctx_handle is not None and self._device_id >= 0: - return Context._from_ctx(self._ctx_handle, self._device_id) + if self._h_context and self._device_id >= 0: + return Context._from_handle(Context, self._h_context, self._device_id) cdef class IPCEventDescriptor: diff --git a/cuda_core/cuda/core/_graph.py b/cuda_core/cuda/core/_graph.py index df51126bb0..b6e266a9a8 100644 --- a/cuda_core/cuda/core/_graph.py +++ b/cuda_core/cuda/core/_graph.py @@ -453,7 +453,7 @@ def __cuda_stream__(self) -> tuple[int, int]: return self.stream.__cuda_stream__() def _get_conditional_context(self) -> driver.CUcontext: - return self._mnff.stream.context._handle + return self._mnff.stream.context.handle def create_conditional_handle(self, default_value=None) -> driver.CUgraphConditionalHandle: """Creates a 
conditional handle for the graph builder. diff --git a/cuda_core/cuda/core/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx index 94dc5d02b4..61e867744e 100644 --- a/cuda_core/cuda/core/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -8,6 +8,7 @@ from cuda.bindings cimport cydriver from cuda.core._launch_config cimport LaunchConfig from cuda.core._kernel_arg_handler cimport ParamHolder +from cuda.core._resource_handles cimport native from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport ( check_or_create_options, @@ -87,7 +88,7 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern # rich. if _use_ex: drv_cfg = conf._to_native_launch_config() - drv_cfg.hStream = s._handle + drv_cfg.hStream = native(s._h_stream) if conf.cooperative_launch: _check_cooperative_launch(kernel, conf, s) with nogil: @@ -99,7 +100,7 @@ def launch(stream: Stream | GraphBuilder | IsStreamT, config: LaunchConfig, kern func_handle, conf.grid[0], conf.grid[1], conf.grid[2], conf.block[0], conf.block[1], conf.block[2], - conf.shmem_size, s._handle, args_ptr, NULL + conf.shmem_size, native(s._h_stream), args_ptr, NULL ) ) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 730e448f63..4238bd8d82 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,6 +4,7 @@ from libc.stdint cimport uintptr_t +from cuda.core._resource_handles cimport DevicePtrHandle from cuda.core._stream cimport Stream @@ -15,16 +16,23 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - uintptr_t _ptr - size_t _size - MemoryResource _memory_resource - object _ipc_data - object _owner - object _ptr_obj - Stream _alloc_stream - _MemAttrs _mem_attrs - bint _mem_attrs_inited + DevicePtrHandle _h_ptr + size_t _size + MemoryResource _memory_resource + object _ipc_data + object _owner + _MemAttrs _mem_attrs + bint _mem_attrs_inited cdef class MemoryResource: 
pass + + +# Helper function to create a Buffer from a DevicePtrHandle +cdef Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = * +) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index b92c9d51ce..32fe28bab4 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -13,6 +13,17 @@ from cuda.core._memory._device_memory_resource import DeviceMemoryResource from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource from cuda.core._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer from cuda.core._memory cimport _ipc +from cuda.core._resource_handles cimport ( + DevicePtrHandle, + StreamHandle, + _init_handles_table, + deviceptr_create_with_owner, + intptr, + native, + set_deallocation_stream, +) + +_init_handles_table() from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -50,12 +61,10 @@ cdef class Buffer: self._clear() def _clear(self): - self._ptr = 0 + self._h_ptr.reset() # Release the handle self._size = 0 self._memory_resource = None self._ipc_data = None - self._ptr_obj = None - self._alloc_stream = None self._owner = None self._mem_attrs_inited = False @@ -69,20 +78,23 @@ cdef class Buffer: stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, owner : object | None = None ): - cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) - self._ptr_obj = ptr - self._size = size + """Legacy init for compatibility - creates a non-owning ref handle. + + Note: The stream parameter is accepted for API compatibility but is + ignored since non-owning refs are never freed by the handle. 
+ """ if mr is not None and owner is not None: raise ValueError("owner and memory resource cannot be both specified together") + cdef Buffer self = Buffer.__new__(cls) + self._h_ptr = deviceptr_create_with_owner((int(ptr)), owner) + self._size = size self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None - self._alloc_stream = (stream) if stream is not None else None self._owner = owner + self._mem_attrs_inited = False return self - def __dealloc__(self): - self.close(self._alloc_stream) + # No __dealloc__ needed - RAII handles cleanup via _h_ptr destructor def __reduce__(self): # Must not serialize the parent's stream! @@ -107,8 +119,12 @@ cdef class Buffer: An object holding external allocation that the ``ptr`` points to. The reference is kept as long as the buffer is alive. The ``owner`` and ``mr`` cannot be specified together. + + Note + ---- + This creates a non-owning reference. The pointer will NOT be freed + when the Buffer is closed or garbage collected. """ - # TODO: It is better to take a stream for latter deallocation return Buffer._init(ptr, size, mr=mr, owner=owner) @classmethod @@ -135,7 +151,7 @@ cdef class Buffer: ---------- stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional The stream object to use for asynchronous deallocation. If None, - the behavior depends on the underlying memory resource. + the deallocation stream stored in the handle is used. 
""" Buffer_close(self, stream) @@ -155,29 +171,23 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) - cdef Stream s_stream = stream + cdef Stream s = Stream_accept(stream) cdef size_t src_size = self._size if dst is None: if self._memory_resource is None: raise ValueError("a destination buffer must be provided (this " "buffer does not have a memory_resource)") - dst = self._memory_resource.allocate(src_size, stream) + dst = self._memory_resource.allocate(src_size, s) cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle with nogil: HANDLE_RETURN(cydriver.cuMemcpyAsync( - dst._ptr, - self._ptr, - src_size, - s - )) + native(dst._h_ptr), native(self._h_ptr), src_size, native(s._h_stream))) return dst def copy_from(self, src: Buffer, *, stream: Stream | GraphBuilder): @@ -192,8 +202,7 @@ cdef class Buffer: asynchronous copy """ - stream = Stream_accept(stream) - cdef Stream s_stream = stream + cdef Stream s = Stream_accept(stream) cdef size_t dst_size = self._size cdef size_t src_size = src._size @@ -201,14 +210,9 @@ cdef class Buffer: raise ValueError( "buffer sizes mismatch between src and dst (sizes " f"are: src={src_size}, dst={dst_size})" ) - cdef cydriver.CUstream s = s_stream._handle with nogil: HANDLE_RETURN(cydriver.cuMemcpyAsync( - self._ptr, - src._ptr, - dst_size, - s - )) + native(self._h_ptr), native(src._h_ptr), dst_size, native(s._h_stream))) def fill(self, value: int | BufferProtocol, *, stream: Stream | GraphBuilder): """Fill this buffer with a repeating byte pattern. @@ -236,12 +240,12 @@ cdef class Buffer: # Handle int case: 1-byte fill with automatic overflow checking. 
if isinstance(value, int): - Buffer_fill_uint8(self, value, s_stream._handle) + Buffer_fill_uint8(self, value, s_stream._h_stream) return # Handle bytes case: direct pointer access without intermediate objects. if isinstance(value, bytes): - Buffer_fill_from_ptr(self, value, len(value), s_stream._handle) + Buffer_fill_from_ptr(self, value, len(value), s_stream._h_stream) return # General buffer protocol path using C buffer API. @@ -251,7 +255,7 @@ cdef class Buffer: f"value must be an int or support the buffer protocol, got {type(value).__name__}" ) try: - Buffer_fill_from_ptr(self, buf.buf, buf.len, s_stream._handle) + Buffer_fill_from_ptr(self, buf.buf, buf.len, s_stream._h_stream) finally: PyBuffer_Release(&buf) @@ -306,9 +310,8 @@ cdef class Buffer: """Return the device ordinal of this buffer.""" if self._memory_resource is not None: return self._memory_resource.device_id - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.device_id + _init_mem_attrs(self) + return self._mem_attrs.device_id @property def handle(self) -> DevicePointerT: @@ -319,31 +322,25 @@ cdef class Buffer: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Buffer.handle)``. 
""" - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 + # Return raw integer for compatibility with ctypes and other tools + # that expect a raw pointer value + return intptr(self._h_ptr) @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_device_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_device_accessible + _init_mem_attrs(self) + return self._mem_attrs.is_device_accessible @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" if self._memory_resource is not None: return self._memory_resource.is_host_accessible - else: - Buffer_init_mem_attrs(self) - return self._mem_attrs.is_host_accessible + _init_mem_attrs(self) + return self._mem_attrs.is_host_accessible @property def is_mapped(self) -> bool: @@ -367,85 +364,52 @@ cdef class Buffer: return self._owner -# Buffer Implementation -# --------------------- -cdef inline void Buffer_close(Buffer self, stream): - cdef Stream s - if self._ptr: - if self._memory_resource is not None: - s = Stream_accept(stream) if stream is not None else self._alloc_stream - self._memory_resource.deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._memory_resource = None - self._owner = None - self._ptr_obj = None - self._alloc_stream = None - - -cdef inline int Buffer_fill_uint8(Buffer self, uint8_t value, cydriver.CUstream s) except? -1: - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async(self._ptr, value, self._size, s)) - return 0 - - -cdef inline int Buffer_fill_from_ptr( - Buffer self, const char* ptr, size_t width, cydriver.CUstream s -) except? 
-1: - cdef size_t buffer_size = self._size - - if width == 1: - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD8Async( - self._ptr, (ptr)[0], buffer_size, s)) - elif width == 2: - if buffer_size & 0x1: - raise ValueError(f"buffer size ({buffer_size}) must be divisible by 2") - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD16Async( - self._ptr, (ptr)[0], buffer_size // 2, s)) - elif width == 4: - if buffer_size & 0x3: - raise ValueError(f"buffer size ({buffer_size}) must be divisible by 4") - with nogil: - HANDLE_RETURN(cydriver.cuMemsetD32Async( - self._ptr, (ptr)[0], buffer_size // 4, s)) - else: - raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}") - return 0 - - -cdef Buffer_init_mem_attrs(Buffer self): +# Memory Attribute Query Helpers +# ------------------------------ +cdef inline void _init_mem_attrs(Buffer self): + """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited: - query_memory_attrs(self._mem_attrs, self._ptr) + _query_memory_attrs(self._mem_attrs, native(self._h_ptr)) self._mem_attrs_inited = True -cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: +cdef inline int _query_memory_attrs( + _MemAttrs& out, + cydriver.CUdeviceptr ptr +) except -1 nogil: + """Query memory attributes for a device pointer.""" cdef unsigned int memory_type = 0 cdef int is_managed = 0 cdef int device_id = 0 - _query_memory_attrs(memory_type, is_managed, device_id, ptr) + cdef cydriver.CUpointer_attribute attrs[3] + cdef uintptr_t vals[3] + + attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE + attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED + attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + vals[0] = &memory_type + vals[1] = &is_managed + vals[2] = &device_id + + cdef cydriver.CUresult ret + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: + with cython.gil: + # 
Device class handles the cuInit call internally + Device() + ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) + HANDLE_RETURN(ret) if memory_type == 0: # unregistered host pointer out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 - # for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE, - # so we need to check it first not to falsely claim it is not - # host accessible. elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST ): - # For pinned memory allocated with cudaMallocHost or paged-locked - # with cudaHostRegister, the memory_type is - # cydriver.CUmemorytype.CU_MEMORYTYPE_HOST. - # TODO(ktokarski): In some cases, the registered memory requires - # using different ptr for device and host, we could check - # cuMemHostGetDevicePointer and - # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM - # to double check the device accessibility. + # Managed memory or pinned host memory out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id @@ -454,28 +418,8 @@ cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: out.is_device_accessible = True out.device_id = device_id else: - raise ValueError(f"Unsupported memory type: {memory_type}") - return 0 - - -cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil: - cdef cydriver.CUpointer_attribute attrs[3] - cdef uintptr_t vals[3] - attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE - attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED - attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL - vals[0] = &memory_type - vals[1] = &is_managed - vals[2] = &device_id - - cdef cydriver.CUresult ret - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: with cython.gil: - # Device class 
handles the cuInit call internally - Device() - ret = cydriver.cuPointerGetAttributes(3, attrs, vals, ptr) - HANDLE_RETURN(ret) + raise ValueError(f"Unsupported memory type: {memory_type}") return 0 @@ -541,3 +485,72 @@ cdef class MemoryResource: def device_id(self) -> int: """Device ID associated with this memory resource, or -1 if not applicable.""" raise TypeError("MemoryResource.device_id must be implemented by subclasses.") + + +# Buffer Implementation Helpers +# ----------------------------- +cdef inline Buffer Buffer_from_deviceptr_handle( + DevicePtrHandle h_ptr, + size_t size, + MemoryResource mr, + object ipc_descriptor = None +): + """Create a Buffer from an existing DevicePtrHandle.""" + cdef Buffer buf = Buffer.__new__(Buffer) + buf._h_ptr = h_ptr + buf._size = size + buf._memory_resource = mr + buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None + buf._owner = None + buf._mem_attrs_inited = False + return buf + + +cdef inline void Buffer_close(Buffer self, object stream): + """Close a buffer, freeing its memory.""" + cdef Stream s + if not self._h_ptr: + return + # Update deallocation stream if provided + if stream is not None: + s = Stream_accept(stream) + set_deallocation_stream(self._h_ptr, s._h_stream) + # Reset handle - RAII deleter will free the memory (and release owner ref in C++) + self._h_ptr.reset() + self._size = 0 + self._memory_resource = None + self._ipc_data = None + self._owner = None + + +cdef inline int Buffer_fill_uint8(Buffer self, uint8_t value, StreamHandle h_stream) except? -1: + cdef cydriver.CUdeviceptr ptr = native(self._h_ptr) + cdef cydriver.CUstream s = native(h_stream) + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD8Async(ptr, value, self._size, s)) + return 0 + + +cdef inline int Buffer_fill_from_ptr( + Buffer self, const char* ptr, size_t width, StreamHandle h_stream +) except? 
-1: + cdef size_t buffer_size = self._size + cdef cydriver.CUdeviceptr dst = native(self._h_ptr) + cdef cydriver.CUstream s = native(h_stream) + + if width == 1: + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD8Async(dst, (ptr)[0], buffer_size, s)) + elif width == 2: + if buffer_size & 0x1: + raise ValueError(f"buffer size ({buffer_size}) must be divisible by 2") + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD16Async(dst, (ptr)[0], buffer_size // 2, s)) + elif width == 4: + if buffer_size & 0x3: + raise ValueError(f"buffer size ({buffer_size}) must be divisible by 4") + with nogil: + HANDLE_RETURN(cydriver.cuMemsetD32Async(dst, (ptr)[0], buffer_size // 4, s)) + else: + raise ValueError(f"value must be 1, 2, or 4 bytes, got {width}") + return 0 diff --git a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx index d0cc82184a..bec16b993c 100644 --- a/cuda_core/cuda/core/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -20,6 +20,7 @@ import platform # no-cython-lint import uuid from cuda.core._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._resource_handles cimport native if TYPE_CHECKING: from .._device import Device @@ -254,7 +255,7 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id): location.id = dev_id with nogil: - HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, dmr._handle, &location)) + HANDLE_RETURN(cydriver.cuMemPoolGetAccess(&flags, native(dmr._h_pool), &location)) if flags == cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE: return "rw" diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd index 2f6c35d72e..492aa23cd3 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd @@ -7,4 +7,4 @@ from cuda.core._memory._buffer cimport 
MemoryResource cdef class cyGraphMemoryResource(MemoryResource): cdef: - int _dev_id + int _device_id diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index bda075c201..daa38a1216 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -7,7 +7,15 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource +from cuda.core._resource_handles cimport ( + DevicePtrHandle, + _init_handles_table, + deviceptr_alloc_async, + native, +) + +_init_handles_table() from cuda.core._stream cimport default_stream, Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN @@ -22,7 +30,7 @@ __all__ = ['GraphMemoryResource'] cdef class GraphMemoryResourceAttributes: cdef: - int _dev_id + int _device_id def __init__(self, *args, **kwargs): raise RuntimeError("GraphMemoryResourceAttributes cannot be instantiated directly. 
Please use MemoryResource APIs.") @@ -30,7 +38,7 @@ cdef class GraphMemoryResourceAttributes: @classmethod def _init(cls, device_id: int): cdef GraphMemoryResourceAttributes self = GraphMemoryResourceAttributes.__new__(cls) - self._dev_id = device_id + self._device_id = device_id return self def __repr__(self): @@ -41,12 +49,12 @@ cdef class GraphMemoryResourceAttributes: cdef int _getattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceGetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 cdef int _setattribute(self, cydriver.CUgraphMem_attribute attr_enum, void* value) except?-1: with nogil: - HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._dev_id, attr_enum, value)) + HANDLE_RETURN(cydriver.cuDeviceSetGraphMemAttribute(self._device_id, attr_enum, value)) return 0 @property @@ -100,7 +108,7 @@ cdef class GraphMemoryResourceAttributes: cdef class cyGraphMemoryResource(MemoryResource): def __cinit__(self, int device_id): - self._dev_id = device_id + self._device_id = device_id def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer: """ @@ -123,17 +131,17 @@ cdef class cyGraphMemoryResource(MemoryResource): def trim(self): """Free unused memory that was cached on the specified device for use with graphs back to the OS.""" with nogil: - HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGraphMemTrim(self._device_id)) @property def attributes(self) -> GraphMemoryResourceAttributes: """Asynchronous allocation attributes related to graphs.""" - return GraphMemoryResourceAttributes._init(self._dev_id) + return GraphMemoryResourceAttributes._init(self._device_id) @property def device_id(self) -> int: """The associated device ordinal.""" - return self._dev_id + return self._device_id @property def 
is_device_accessible(self) -> bool: @@ -186,22 +194,18 @@ cdef inline int check_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer GMR_allocate(cyGraphMemoryResource self, size_t size, Stream stream): - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr + cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocAsync(&devptr, size, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + h_ptr = deviceptr_alloc_async(size, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory asynchronously") + return Buffer_from_deviceptr_handle(h_ptr, size, self, None) cdef inline void GMR_deallocate(intptr_t ptr, size_t size, Stream stream) noexcept: - cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUstream s = native(stream._h_stream) cdef cydriver.CUdeviceptr devptr = ptr with nogil: HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) diff --git a/cuda_core/cuda/core/_memory/_ipc.pxd b/cuda_core/cuda/core/_memory/_ipc.pxd index 0c7375efdb..5166aa8748 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/_memory/_ipc.pxd @@ -41,6 +41,8 @@ cdef class IPCBufferDescriptor: bytes _payload size_t _size + cdef const void* payload_ptr(self) noexcept + cdef class IPCAllocationHandle: cdef: diff --git a/cuda_core/cuda/core/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx index 793e4168d7..99608f55db 100644 --- a/cuda_core/cuda/core/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -3,11 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport uintptr_t -from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer +from cuda.core._memory._buffer cimport Buffer, 
Buffer_from_deviceptr_handle +from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._stream cimport Stream +from cuda.core._resource_handles cimport ( + DevicePtrHandle, + _init_handles_table, + create_mempool_handle_ipc, + deviceptr_import_ipc, + get_last_error, + native, +) + +_init_handles_table() from cuda.core._stream cimport default_stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._utils.cuda_utils import check_multiprocessing_start_method @@ -87,6 +97,10 @@ cdef class IPCBufferDescriptor: def size(self): return self._size + cdef const void* payload_ptr(self) noexcept: + """Return the payload as a const void* for C API calls.""" + return (self._payload) + cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" @@ -150,7 +164,7 @@ cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): cdef cydriver.CUmemPoolPtrExportData data with nogil: HANDLE_RETURN( - cydriver.cuMemPoolExportPointer(&data, (self._ptr)) + cydriver.cuMemPoolExportPointer(&data, native(self._h_ptr)) ) cdef bytes data_b = cpython.PyBytes_FromStringAndSize( (data.reserved), sizeof(data.reserved) @@ -166,16 +180,15 @@ cdef Buffer Buffer_from_ipc_descriptor( if stream is None: # Note: match this behavior to _MemPool.allocate() stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy( - data.reserved, - (ipc_descriptor._payload), - sizeof(data.reserved) + cdef Stream s = stream + cdef DevicePtrHandle h_ptr = deviceptr_import_ipc( + mr._h_pool, + ipc_descriptor.payload_ptr(), + s._h_stream ) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) - return Buffer._init(ptr, ipc_descriptor.size, mr, stream, ipc_descriptor) + if not h_ptr: + HANDLE_RETURN(get_last_error()) + return Buffer_from_deviceptr_handle(h_ptr, ipc_descriptor.size, mr, ipc_descriptor) # _MemPool IPC Implementation @@ -198,18 +211,15 @@ cdef _MemPool 
MP_from_allocation_handle(cls, alloc_handle): os.close(fd) raise - # Construct a new mempool + # Construct a new mempool. cdef _MemPool self = <_MemPool>(cls.__new__(cls)) self._mempool_owned = True + cdef int ipc_fd = int(alloc_handle) + self._h_pool = create_mempool_handle_ipc(ipc_fd, IPC_HANDLE_TYPE) + if not self._h_pool: + raise RuntimeError("Failed to import memory pool from IPC handle") self._ipc_data = IPCDataForMR(alloc_handle, True) - # Map the mempool into this process. - cdef int handle = int(alloc_handle) - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._handle), (handle), IPC_HANDLE_TYPE, 0) - ) - # Register it. if uuid is not None: registered = self.register(uuid) @@ -240,7 +250,7 @@ cdef IPCAllocationHandle MP_export_mempool(_MemPool self): cdef int fd with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &fd, self._handle, IPC_HANDLE_TYPE, 0) + &fd, native(self._h_pool), IPC_HANDLE_TYPE, 0) ) try: return IPCAllocationHandle._init(fd, uuid.uuid4()) diff --git a/cuda_core/cuda/core/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py index 317494ea9e..9250819610 100644 --- a/cuda_core/cuda/core/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -84,12 +84,12 @@ def device_id(self) -> int: class _SynchronousMemoryResource(MemoryResource): - __slots__ = ("_dev_id",) + __slots__ = ("_device_id",) def __init__(self, device_id): from .._device import Device - self._dev_id = Device(device_id).device_id + self._device_id = Device(device_id).device_id def allocate(self, size, stream=None) -> Buffer: if stream is None: @@ -116,4 +116,4 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: - return self._dev_id + return self._device_id diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd index 8d9961b68b..eaff8e4bab 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pxd +++ 
b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -5,12 +5,13 @@ from cuda.bindings cimport cydriver from cuda.core._memory._buffer cimport MemoryResource from cuda.core._memory._ipc cimport IPCDataForMR +from cuda.core._resource_handles cimport MemoryPoolHandle cdef class _MemPool(MemoryResource): cdef: int _dev_id - cydriver.CUmemoryPool _handle + MemoryPoolHandle _h_pool bint _mempool_owned IPCDataForMR _ipc_data object _attributes diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index f1b72d47b5..7a255ebb3d 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -10,9 +10,22 @@ from libc.string cimport memset from cpython.mem cimport PyMem_Malloc, PyMem_Free from cuda.bindings cimport cydriver -from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory._buffer cimport Buffer, Buffer_from_deviceptr_handle, MemoryResource from cuda.core._memory cimport _ipc from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._resource_handles cimport ( + MemoryPoolHandle, + DevicePtrHandle, + _init_handles_table, + create_mempool_handle, + create_mempool_handle_ref, + get_device_mempool, + deviceptr_alloc_from_pool, + native, + py, +) + +_init_handles_table() from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -61,7 +74,7 @@ cdef class _MemPoolAttributes: cdef _MemPool mr = <_MemPool>(self._mr_weakref()) if mr is None: raise RuntimeError("_MemPool is expired") - cdef cydriver.CUmemoryPool pool_handle = mr._handle + cdef cydriver.CUmemoryPool pool_handle = native(mr._h_pool) with nogil: HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value)) return 0 @@ -127,7 +140,6 @@ cdef class _MemPool(MemoryResource): def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL self._mempool_owned = False self._ipc_data = None self._attributes = None @@ -202,9 +214,9 
@@ cdef class _MemPool(MemoryResource): return self._dev_id @property - def handle(self) -> driver.CUmemoryPool: + def handle(self) -> object: """Handle to the underlying memory pool.""" - return driver.CUmemoryPool((self._handle)) + return py(self._h_pool) @property def is_handle_owned(self) -> bool: @@ -271,7 +283,7 @@ cdef class _MemPool(MemoryResource): i += 1 with nogil: - HANDLE_RETURN(cydriver.cuMemPoolSetAccess(self._handle, access_desc, count)) + HANDLE_RETURN(cydriver.cuMemPoolSetAccess(native(self._h_pool), access_desc, count)) finally: if access_desc != NULL: PyMem_Free(access_desc) @@ -308,64 +320,69 @@ cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) excep cdef cydriver.cuuint64_t current_threshold cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX cdef cydriver.CUmemLocation loc + cdef cydriver.CUmemoryPool pool self._dev_id = dev_id self._mempool_owned = False - with nogil: - if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: - assert dev_id >= 0 - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) - - # Set a higher release threshold to improve performance when there are - # no active allocations. By default, the release threshold is 0, which - # means memory is immediately released back to the OS when there are no - # active suballocations, causing performance issues. + if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ + and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE: + assert dev_id >= 0 + self._h_pool = get_device_mempool(dev_id) + + # Set a higher release threshold to improve performance when there are + # no active allocations. By default, the release threshold is 0, which + # means memory is immediately released back to the OS when there are no + # active suballocations, causing performance issues. 
+ with nogil: HANDLE_RETURN( cydriver.cuMemPoolGetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold ) ) - - # If threshold is 0 (default), set it to maximum to retain memory in the pool. if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._handle, + native(self._h_pool), cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &max_threshold )) - elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST: - IF CUDA_CORE_BUILD_MAJOR >= 13: - assert dev_id == -1 + elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ + and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST: + IF CUDA_CORE_BUILD_MAJOR >= 13: + assert dev_id == -1 + loc.id = dev_id + loc.type = opts._location + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) + self._h_pool = create_mempool_handle_ref(pool) + ELSE: + raise RuntimeError("not supported") + elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ + and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA: + IF CUDA_CORE_BUILD_MAJOR >= 13: + assert dev_id == 0 + loc.id = 0 + loc.type = opts._location + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) + self._h_pool = create_mempool_handle_ref(pool) + ELSE: + raise RuntimeError("not supported") + else: + IF CUDA_CORE_BUILD_MAJOR >= 13: + if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED: + # Managed memory pools loc.id = dev_id loc.type = opts._location - HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type)) - ELSE: - raise RuntimeError("not supported") - elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \ - and opts._location == 
cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA: - IF CUDA_CORE_BUILD_MAJOR >= 13: - assert dev_id == 0 - loc.id = 0 - loc.type = opts._location - HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type)) - ELSE: - raise RuntimeError("not supported") - else: - IF CUDA_CORE_BUILD_MAJOR >= 13: - if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED: - # Managed memory pools - loc.id = dev_id - loc.type = opts._location - HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type)) - else: - assert False - ELSE: + with nogil: + HANDLE_RETURN(cydriver.cuMemGetMemPool(&pool, &loc, opts._type)) + self._h_pool = create_mempool_handle_ref(pool) + else: assert False + ELSE: + assert False return 0 @@ -389,9 +406,7 @@ cdef int _MP_init_create(_MemPool self, int dev_id, _MemPoolOptions opts) except self._dev_id = dev_id self._mempool_owned = True - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties)) - # TODO: should we also set the threshold here? 
+ self._h_pool = create_mempool_handle(properties) if ipc_enabled: alloc_handle = _ipc.MP_export_mempool(self) @@ -411,24 +426,20 @@ cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil: cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream): - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr + cdef cydriver.CUstream s = native(stream._h_stream) + cdef DevicePtrHandle h_ptr with nogil: check_not_capturing(s) - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._memory_resource = self - buf._alloc_stream = stream - return buf + h_ptr = deviceptr_alloc_from_pool(size, self._h_pool, stream._h_stream) + if not h_ptr: + raise RuntimeError("Failed to allocate memory from pool") + return Buffer_from_deviceptr_handle(h_ptr, size, self, None) cdef inline void _MP_deallocate( _MemPool self, uintptr_t ptr, size_t size, Stream stream ) noexcept nogil: - cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUstream s = native(stream._h_stream) cdef cydriver.CUdeviceptr devptr = ptr cdef cydriver.CUresult r with nogil: @@ -438,7 +449,7 @@ cdef inline void _MP_deallocate( cdef inline _MP_close(_MemPool self): - if self._handle == NULL: + if not self._h_pool: return # This works around nvbug 5698116. When a memory pool handle is recycled @@ -446,14 +457,12 @@ cdef inline _MP_close(_MemPool self): if self._peer_accessible_by: self.peer_accessible_by = [] - try: - if self._mempool_owned: - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) - finally: - self._dev_id = cydriver.CU_DEVICE_INVALID - self._handle = NULL - self._attributes = None - self._mempool_owned = False - self._ipc_data = None - self._peer_accessible_by = () + # Reset members in declaration order. 
+ # The RAII deleter handles nvbug 5698116 workaround (clears peer access) + # and calls cuMemPoolDestroy if this is an owning handle. + self._h_pool.reset() + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_owned = False + self._ipc_data = None + self._attributes = None + self._peer_accessible_by = () diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index c12cbbaa8a..41321c8722 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -13,7 +13,17 @@ from typing import Optional import numpy +from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ( + EventHandle, + _init_handles_table, + create_event_handle_noctx, + native, +) + +_init_handles_table() from cuda.core._utils.cuda_utils import handle_return, driver +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from cuda.core._memory import Buffer @@ -591,6 +601,7 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): buf.device_id = handle_return(driver.cuCtxGetDevice()) cdef intptr_t producer_s, consumer_s + cdef EventHandle h_event stream_ptr = int(stream_ptr) if stream_ptr != -1: stream = cai_data.get("stream") @@ -600,11 +611,12 @@ cpdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None): assert producer_s > 0 # establish stream order if producer_s != consumer_s: - e = handle_return(driver.cuEventCreate( - driver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - handle_return(driver.cuEventRecord(e, producer_s)) - handle_return(driver.cuStreamWaitEvent(consumer_s, e, 0)) - handle_return(driver.cuEventDestroy(e)) + with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + HANDLE_RETURN(cydriver.cuEventRecord( + native(h_event), producer_s)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent( + consumer_s, native(h_event), 0)) return buf diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd new file mode 100644 
index 0000000000..801d354958 --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -0,0 +1,246 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stddef cimport size_t +from libc.stdint cimport intptr_t, uint32_t +from libcpp.memory cimport shared_ptr + +from cpython.pycapsule cimport PyCapsule_Import + +from cuda.bindings cimport cydriver + +# Declare the C++ namespace and types (inline helpers live in the header). +cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": + ctypedef shared_ptr[const cydriver.CUcontext] ContextHandle + ctypedef shared_ptr[const cydriver.CUstream] StreamHandle + ctypedef shared_ptr[const cydriver.CUevent] EventHandle + ctypedef shared_ptr[const cydriver.CUmemoryPool] MemoryPoolHandle + ctypedef shared_ptr[const cydriver.CUdeviceptr] DevicePtrHandle + + # native() - extract the raw CUDA handle (inline C++) + cydriver.CUcontext native(ContextHandle h) nogil + cydriver.CUstream native(StreamHandle h) nogil + cydriver.CUevent native(EventHandle h) nogil + cydriver.CUmemoryPool native(MemoryPoolHandle h) nogil + cydriver.CUdeviceptr native(DevicePtrHandle h) nogil + + # intptr() - extract handle as intptr_t for Python interop (inline C++) + intptr_t intptr(ContextHandle h) nogil + intptr_t intptr(StreamHandle h) nogil + intptr_t intptr(EventHandle h) nogil + intptr_t intptr(MemoryPoolHandle h) nogil + intptr_t intptr(DevicePtrHandle h) nogil + + # py() - convert handle to Python driver wrapper object (inline C++; requires GIL) + object py(ContextHandle h) + object py(StreamHandle h) + object py(EventHandle h) + object py(MemoryPoolHandle h) + object py(DevicePtrHandle h) + + +# The resource handles API table is exported from `cuda.core._resource_handles` +# as a PyCapsule named: +# +# "cuda.core._resource_handles._CXX_API" +# +# Consumers dispatch through this table to avoid relying on RTLD_GLOBAL and to 
+# ensure a single owner of correctness-critical static/thread_local state. +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle 
(*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil + + +cdef const ResourceHandlesCxxApiV1* _handles_table = NULL + + +cdef inline const ResourceHandlesCxxApiV1* _get_handles_table() except NULL nogil: + global _handles_table + if _handles_table == NULL: + with gil: + if _handles_table == NULL: + _handles_table = PyCapsule_Import( + b"cuda.core._resource_handles._CXX_API", 0 + ) + if _handles_table == NULL: + raise ImportError("Failed to import cuda.core._resource_handles._CXX_API capsule") + if _handles_table.abi_version != 1: + raise ImportError("Unsupported resource handles C++ API version") + if _handles_table.struct_size < sizeof(ResourceHandlesCxxApiV1): + raise ImportError("Resource handles C++ API table is too small") + return _handles_table + + +# ----------------------------------------------------------------------------- +# Dispatch wrappers +# +# These wrappers assume _handles_table has been initialized. Consumers must call +# _init_handles_table() at module level before using these functions in nogil blocks. +# ----------------------------------------------------------------------------- + +cdef inline void _init_handles_table() except *: + """Initialize the handles table. 
Call at module level before using wrappers.""" + _get_handles_table() + + +cdef inline cydriver.CUresult get_last_error() noexcept nogil: + return _handles_table.get_last_error() + + +cdef inline cydriver.CUresult peek_last_error() noexcept nogil: + return _handles_table.peek_last_error() + + +cdef inline void clear_last_error() noexcept nogil: + _handles_table.clear_last_error() + + +cdef inline ContextHandle create_context_handle_ref(cydriver.CUcontext ctx) noexcept nogil: + return _handles_table.create_context_handle_ref(ctx) + + +cdef inline ContextHandle get_primary_context(int device_id) noexcept nogil: + return _handles_table.get_primary_context(device_id) + + +cdef inline ContextHandle get_current_context() noexcept nogil: + return _handles_table.get_current_context() + + +cdef inline StreamHandle create_stream_handle(ContextHandle h_ctx, unsigned int flags, int priority) noexcept nogil: + return _handles_table.create_stream_handle(h_ctx, flags, priority) + + +cdef inline StreamHandle create_stream_handle_ref(cydriver.CUstream stream) noexcept nogil: + return _handles_table.create_stream_handle_ref(stream) + + +cdef inline StreamHandle create_stream_handle_with_owner(cydriver.CUstream stream, object owner): + return _handles_table.create_stream_handle_with_owner(stream, owner) + + +cdef inline StreamHandle get_legacy_stream() noexcept nogil: + return _handles_table.get_legacy_stream() + + +cdef inline StreamHandle get_per_thread_stream() noexcept nogil: + return _handles_table.get_per_thread_stream() + + +cdef inline EventHandle create_event_handle(ContextHandle h_ctx, unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle(h_ctx, flags) + + +cdef inline EventHandle create_event_handle_noctx(unsigned int flags) noexcept nogil: + return _handles_table.create_event_handle_noctx(flags) + + +cdef inline EventHandle create_event_handle_ipc(const cydriver.CUipcEventHandle& ipc_handle) noexcept nogil: + return 
_handles_table.create_event_handle_ipc(ipc_handle) + + +cdef inline MemoryPoolHandle create_mempool_handle(const cydriver.CUmemPoolProps& props) noexcept nogil: + return _handles_table.create_mempool_handle(props) + + +cdef inline MemoryPoolHandle create_mempool_handle_ref(cydriver.CUmemoryPool pool) noexcept nogil: + return _handles_table.create_mempool_handle_ref(pool) + + +cdef inline MemoryPoolHandle get_device_mempool(int device_id) noexcept nogil: + return _handles_table.get_device_mempool(device_id) + + +cdef inline MemoryPoolHandle create_mempool_handle_ipc(int fd, cydriver.CUmemAllocationHandleType handle_type) noexcept nogil: + return _handles_table.create_mempool_handle_ipc(fd, handle_type) + + +cdef inline DevicePtrHandle deviceptr_alloc_from_pool( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_alloc_from_pool(size, h_pool, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc_async(size_t size, StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_alloc_async(size, h_stream) + + +cdef inline DevicePtrHandle deviceptr_alloc(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc(size) + + +cdef inline DevicePtrHandle deviceptr_alloc_host(size_t size) noexcept nogil: + return _handles_table.deviceptr_alloc_host(size) + + +cdef inline DevicePtrHandle deviceptr_create_ref(cydriver.CUdeviceptr ptr) noexcept nogil: + return _handles_table.deviceptr_create_ref(ptr) + + +cdef inline DevicePtrHandle deviceptr_create_with_owner(cydriver.CUdeviceptr ptr, object owner): + return _handles_table.deviceptr_create_with_owner(ptr, owner) + + +cdef inline DevicePtrHandle deviceptr_import_ipc( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) noexcept nogil: + return _handles_table.deviceptr_import_ipc(h_pool, export_data, h_stream) + + +cdef inline StreamHandle deallocation_stream(const DevicePtrHandle& h) noexcept nogil: + return 
_handles_table.deallocation_stream(h) + + +cdef inline void set_deallocation_stream(const DevicePtrHandle& h, StreamHandle h_stream) noexcept nogil: + _handles_table.set_deallocation_stream(h, h_stream) diff --git a/cuda_core/cuda/core/_resource_handles.pyx b/cuda_core/cuda/core/_resource_handles.pyx new file mode 100644 index 0000000000..48f790581e --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# This module exists to compile _cpp/resource_handles.cpp into a shared library. +# The helper functions (native, intptr, py) are implemented as inline C++ functions +# in _cpp/resource_handles.hpp and declared as extern in _resource_handles.pxd. + +from cpython.pycapsule cimport PyCapsule_New +from libc.stdint cimport uint32_t, uint64_t, uintptr_t + +from ._resource_handles_cxx_api cimport ( + ResourceHandlesCxxApiV1, + get_resource_handles_cxx_api_v1, +) + +import cython + + +cdef const char* _CXX_API_NAME = b"cuda.core._resource_handles._CXX_API" +cdef const char* _CUDA_DRIVER_API_V1_NAME = b"cuda.core._resource_handles._CUDA_DRIVER_API_V1" + +# Export the C++ handles dispatch table as a PyCapsule. +# Consumers use PyCapsule_Import(_CXX_API_NAME, 0) to retrieve it. 
+cdef const ResourceHandlesCxxApiV1* _handles_table = get_resource_handles_cxx_api_v1() +if _handles_table == NULL: + raise RuntimeError("Failed to initialize resource handles C++ API table") + +_CXX_API = PyCapsule_New(_handles_table, _CXX_API_NAME, NULL) +if _CXX_API is None: + raise RuntimeError("Failed to create _CXX_API capsule") + + +cdef struct CudaDriverApiV1: + uint32_t abi_version + uint32_t struct_size + + uintptr_t cuDevicePrimaryCtxRetain + uintptr_t cuDevicePrimaryCtxRelease + uintptr_t cuCtxGetCurrent + + uintptr_t cuStreamCreateWithPriority + uintptr_t cuStreamDestroy + + uintptr_t cuEventCreate + uintptr_t cuEventDestroy + uintptr_t cuIpcOpenEventHandle + + uintptr_t cuDeviceGetCount + + uintptr_t cuMemPoolSetAccess + uintptr_t cuMemPoolDestroy + uintptr_t cuMemPoolCreate + uintptr_t cuDeviceGetMemPool + uintptr_t cuMemPoolImportFromShareableHandle + + uintptr_t cuMemAllocFromPoolAsync + uintptr_t cuMemAllocAsync + uintptr_t cuMemAlloc + uintptr_t cuMemAllocHost + + uintptr_t cuMemFreeAsync + uintptr_t cuMemFree + uintptr_t cuMemFreeHost + + uintptr_t cuMemPoolImportPointer + + +cdef CudaDriverApiV1 _cuda_driver_api_v1 +cdef bint _cuda_driver_api_v1_inited = False + + +cdef inline uintptr_t _as_addr(object pfn) except 0: + return int(pfn) + + +cdef inline uintptr_t _resolve(object d, int driver_ver, uint64_t flags, bytes sym) except 0: + err, pfn, status = d.cuGetProcAddress(sym, driver_ver, flags) + if int(err) != 0 or pfn is None: + raise RuntimeError(f"cuGetProcAddress failed for {sym!r}, err={err}, status={status}") + return _as_addr(pfn) + + +def _get_cuda_driver_api_v1_capsule(): + """Return a PyCapsule containing cached CUDA driver entrypoints. + + This is evaluated lazily on first use so cuda-core remains importable on + CPU-only machines. 
+ """ + global _cuda_driver_api_v1_inited, _cuda_driver_api_v1 + if not _cuda_driver_api_v1_inited: + import cuda.bindings.driver as d + + err, ver = d.cuDriverGetVersion() + if int(err) != 0: + raise RuntimeError(f"cuDriverGetVersion failed: {err}") + driver_ver = int(ver) + + flags = 0 # CU_GET_PROC_ADDRESS_DEFAULT + + _cuda_driver_api_v1.cuDevicePrimaryCtxRetain = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRetain") + _cuda_driver_api_v1.cuDevicePrimaryCtxRelease = _resolve(d, driver_ver, flags, b"cuDevicePrimaryCtxRelease") + _cuda_driver_api_v1.cuCtxGetCurrent = _resolve(d, driver_ver, flags, b"cuCtxGetCurrent") + + _cuda_driver_api_v1.cuStreamCreateWithPriority = _resolve(d, driver_ver, flags, b"cuStreamCreateWithPriority") + _cuda_driver_api_v1.cuStreamDestroy = _resolve(d, driver_ver, flags, b"cuStreamDestroy") + + _cuda_driver_api_v1.cuEventCreate = _resolve(d, driver_ver, flags, b"cuEventCreate") + _cuda_driver_api_v1.cuEventDestroy = _resolve(d, driver_ver, flags, b"cuEventDestroy") + _cuda_driver_api_v1.cuIpcOpenEventHandle = _resolve(d, driver_ver, flags, b"cuIpcOpenEventHandle") + + _cuda_driver_api_v1.cuDeviceGetCount = _resolve(d, driver_ver, flags, b"cuDeviceGetCount") + + _cuda_driver_api_v1.cuMemPoolSetAccess = _resolve(d, driver_ver, flags, b"cuMemPoolSetAccess") + _cuda_driver_api_v1.cuMemPoolDestroy = _resolve(d, driver_ver, flags, b"cuMemPoolDestroy") + _cuda_driver_api_v1.cuMemPoolCreate = _resolve(d, driver_ver, flags, b"cuMemPoolCreate") + _cuda_driver_api_v1.cuDeviceGetMemPool = _resolve(d, driver_ver, flags, b"cuDeviceGetMemPool") + _cuda_driver_api_v1.cuMemPoolImportFromShareableHandle = _resolve( + d, driver_ver, flags, b"cuMemPoolImportFromShareableHandle" + ) + + _cuda_driver_api_v1.cuMemAllocFromPoolAsync = _resolve(d, driver_ver, flags, b"cuMemAllocFromPoolAsync") + _cuda_driver_api_v1.cuMemAllocAsync = _resolve(d, driver_ver, flags, b"cuMemAllocAsync") + _cuda_driver_api_v1.cuMemAlloc = _resolve(d, driver_ver, flags, 
b"cuMemAlloc") + _cuda_driver_api_v1.cuMemAllocHost = _resolve(d, driver_ver, flags, b"cuMemAllocHost") + + _cuda_driver_api_v1.cuMemFreeAsync = _resolve(d, driver_ver, flags, b"cuMemFreeAsync") + _cuda_driver_api_v1.cuMemFree = _resolve(d, driver_ver, flags, b"cuMemFree") + _cuda_driver_api_v1.cuMemFreeHost = _resolve(d, driver_ver, flags, b"cuMemFreeHost") + + _cuda_driver_api_v1.cuMemPoolImportPointer = _resolve(d, driver_ver, flags, b"cuMemPoolImportPointer") + + _cuda_driver_api_v1.abi_version = 1 + _cuda_driver_api_v1.struct_size = cython.sizeof(CudaDriverApiV1) + _cuda_driver_api_v1_inited = True + + return PyCapsule_New(&_cuda_driver_api_v1, _CUDA_DRIVER_API_V1_NAME, NULL) diff --git a/cuda_core/cuda/core/_resource_handles_cxx_api.pxd b/cuda_core/cuda/core/_resource_handles_cxx_api.pxd new file mode 100644 index 0000000000..da3d8d4fd3 --- /dev/null +++ b/cuda_core/cuda/core/_resource_handles_cxx_api.pxd @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uint32_t +from libc.stddef cimport size_t + +from cuda.bindings cimport cydriver +from ._resource_handles cimport ( + ContextHandle, + DevicePtrHandle, + EventHandle, + MemoryPoolHandle, + StreamHandle, +) + + +cdef extern from "_cpp/resource_handles_cxx_api.hpp" namespace "cuda_core": + cdef struct ResourceHandlesCxxApiV1: + uint32_t abi_version + uint32_t struct_size + + # Thread-local error handling + cydriver.CUresult (*get_last_error)() nogil + cydriver.CUresult (*peek_last_error)() nogil + void (*clear_last_error)() nogil + + # Context handles + ContextHandle (*create_context_handle_ref)(cydriver.CUcontext ctx) nogil + ContextHandle (*get_primary_context)(int device_id) nogil + ContextHandle (*get_current_context)() nogil + + # Stream handles + StreamHandle (*create_stream_handle)(ContextHandle h_ctx, unsigned int flags, int priority) nogil + StreamHandle (*create_stream_handle_ref)(cydriver.CUstream stream) nogil + StreamHandle (*create_stream_handle_with_owner)(cydriver.CUstream stream, object owner) + StreamHandle (*get_legacy_stream)() nogil + StreamHandle (*get_per_thread_stream)() nogil + + # Event handles + EventHandle (*create_event_handle)(ContextHandle h_ctx, unsigned int flags) nogil + EventHandle (*create_event_handle_noctx)(unsigned int flags) nogil + EventHandle (*create_event_handle_ipc)(const cydriver.CUipcEventHandle& ipc_handle) nogil + + # Memory pool handles + MemoryPoolHandle (*create_mempool_handle)(const cydriver.CUmemPoolProps& props) nogil + MemoryPoolHandle (*create_mempool_handle_ref)(cydriver.CUmemoryPool pool) nogil + MemoryPoolHandle (*get_device_mempool)(int device_id) nogil + MemoryPoolHandle (*create_mempool_handle_ipc)(int fd, cydriver.CUmemAllocationHandleType handle_type) nogil + + # Device pointer handles + DevicePtrHandle (*deviceptr_alloc_from_pool)( + size_t size, + MemoryPoolHandle h_pool, + StreamHandle h_stream) nogil + DevicePtrHandle 
(*deviceptr_alloc_async)(size_t size, StreamHandle h_stream) nogil + DevicePtrHandle (*deviceptr_alloc)(size_t size) nogil + DevicePtrHandle (*deviceptr_alloc_host)(size_t size) nogil + DevicePtrHandle (*deviceptr_create_ref)(cydriver.CUdeviceptr ptr) nogil + DevicePtrHandle (*deviceptr_create_with_owner)(cydriver.CUdeviceptr ptr, object owner) + DevicePtrHandle (*deviceptr_import_ipc)( + MemoryPoolHandle h_pool, + const void* export_data, + StreamHandle h_stream) nogil + StreamHandle (*deallocation_stream)(const DevicePtrHandle& h) nogil + void (*set_deallocation_stream)(const DevicePtrHandle& h, StreamHandle h_stream) nogil + + const ResourceHandlesCxxApiV1* get_resource_handles_cxx_api_v1() nogil diff --git a/cuda_core/cuda/core/_stream.pxd b/cuda_core/cuda/core/_stream.pxd index edc25e2ba7..69bd5821ad 100644 --- a/cuda_core/cuda/core/_stream.pxd +++ b/cuda_core/cuda/core/_stream.pxd @@ -2,23 +2,22 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.bindings cimport cydriver +from cuda.core._resource_handles cimport ContextHandle, StreamHandle cdef class Stream: cdef: - cydriver.CUstream _handle - object _owner - bint _builtin + StreamHandle _h_stream + ContextHandle _h_context + int _device_id int _nonblocking int _priority - cydriver.CUdevice _device_id - cydriver.CUcontext _ctx_handle + + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream) cpdef close(self) - cdef int _get_context(self) except?-1 nogil - cdef int _get_device_and_context(self) except?-1 cpdef Stream default_stream() diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index b724f9aee3..aecf24b06e 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -12,8 +12,6 @@ from cuda.bindings cimport cydriver from cuda.core._event cimport Event as cyEvent from cuda.core._utils.cuda_utils cimport ( check_or_create_options, - CU_CONTEXT_INVALID, - get_device_from_ctx, HANDLE_RETURN, ) @@ -25,13 +23,28 @@ from typing import 
TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings from cuda.core._device import Device -from cuda.core._context import Context +from cuda.core._context cimport Context from cuda.core._event import Event, EventOptions -from cuda.core._graph import GraphBuilder -from cuda.core._utils.cuda_utils import ( - driver, +from cuda.core._resource_handles cimport ( + ContextHandle, + EventHandle, + StreamHandle, + _init_handles_table, + create_context_handle_ref, + create_event_handle_noctx, + create_stream_handle, + create_stream_handle_with_owner, + get_current_context, + get_legacy_stream, + get_per_thread_stream, + intptr, + native, + py, ) +_init_handles_table() +from cuda.core._graph import GraphBuilder + @dataclass cdef class StreamOptions: @@ -78,52 +91,61 @@ cdef class Stream: object, or created directly through using an existing handle using Stream.from_handle(). """ - def __cinit__(self): - self._handle = (NULL) - self._owner = None - self._builtin = False - self._nonblocking = -1 # lazy init'd - self._priority = INT32_MIN # lazy init'd - self._device_id = cydriver.CU_DEVICE_INVALID # lazy init'd - self._ctx_handle = CU_CONTEXT_INVALID # lazy init'd - def __init__(self, *args, **kwargs): raise RuntimeError( "Stream objects cannot be instantiated directly. " "Please use Device APIs (create_stream) or other Stream APIs (from_handle)." 
) + @staticmethod + cdef Stream _from_handle(type cls, StreamHandle h_stream): + """Create a Stream from an existing StreamHandle (cdef-only factory).""" + cdef Stream s = cls.__new__(cls) + s._h_stream = h_stream + # _h_context is default-initialized to empty ContextHandle by C++ + s._device_id = -1 # lazy init'd (invalid sentinel) + s._nonblocking = -1 # lazy init'd + s._priority = INT32_MIN # lazy init'd + return s + @classmethod def _legacy_default(cls): - cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_LEGACY) - self._builtin = True - return self + """Return the legacy default stream (supports subclassing).""" + return Stream._from_handle(cls, get_legacy_stream()) @classmethod def _per_thread_default(cls): - cdef Stream self = Stream.__new__(cls) - self._handle = (cydriver.CU_STREAM_PER_THREAD) - self._builtin = True - return self + """Return the per-thread default stream (supports subclassing).""" + return Stream._from_handle(cls, get_per_thread_stream()) @classmethod - def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None): - cdef Stream self = Stream.__new__(cls) + def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None, + ctx: Context = None): + cdef StreamHandle h_stream + cdef cydriver.CUstream borrowed + cdef ContextHandle h_context + cdef Stream self + + # Extract context handle if provided + if ctx is not None: + h_context = (ctx)._h_context if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - self._handle = _handle_from_stream_protocol(obj) - # TODO: check if obj is created under the current context/device - self._owner = obj - return self + # Borrowed stream from foreign object + # C++ handle prevents owner from being GC'd until handle is released + # Owner is responsible for keeping the stream's context alive + borrowed = _handle_from_stream_protocol(obj) + h_stream = 
create_stream_handle_with_owner(borrowed, obj) + return Stream._from_handle(cls, h_stream) cdef StreamOptions opts = check_or_create_options(StreamOptions, options, "Stream options") nonblocking = opts.nonblocking priority = opts.priority - flags = cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking else cydriver.CUstream_flags.CU_STREAM_DEFAULT + cdef unsigned int flags = (cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING if nonblocking + else cydriver.CUstream_flags.CU_STREAM_DEFAULT) # TODO: we might want to consider memoizing high/low per CUDA context and avoid this call cdef int high, low with nogil: @@ -136,57 +158,47 @@ cdef class Stream: else: prio = high - cdef cydriver.CUstream s - with nogil: - HANDLE_RETURN(cydriver.cuStreamCreateWithPriority(&s, flags, prio)) - self._handle = s + # C++ creates the stream and returns owning handle with context dependency + h_stream = create_stream_handle(h_context, flags, prio) + if not h_stream: + raise RuntimeError("Failed to create CUDA stream") + self = Stream._from_handle(cls, h_stream) self._nonblocking = int(nonblocking) self._priority = prio - self._device_id = device_id if device_id is not None else self._device_id + if device_id is not None: + self._device_id = device_id return self - def __dealloc__(self): - self.close() - cpdef close(self): """Destroy the stream. - Destroy the stream if we own it. Borrowed foreign stream - object will instead have their references released. - + Releases the stream handle. For owned streams, this destroys the + underlying CUDA stream. For borrowed streams, this releases the + reference and allows the Python owner to be GC'd. 
""" - if self._owner is None: - if self._handle and not self._builtin: - with nogil: - HANDLE_RETURN(cydriver.cuStreamDestroy(self._handle)) - else: - self._owner = None - self._handle = (NULL) + self._h_stream.reset() def __cuda_stream__(self) -> tuple[int, int]: """Return an instance of a __cuda_stream__ protocol.""" - return (0, (self._handle)) + return (0, intptr(self._h_stream)) def __hash__(self) -> int: # Ensure context is initialized for hash consistency - if self._ctx_handle == CU_CONTEXT_INVALID: - self._get_context() - return hash(((self._ctx_handle), (self._handle))) + Stream_ensure_ctx(self) + return hash((intptr(self._h_context), intptr(self._h_stream))) def __eq__(self, other) -> bool: if not isinstance(other, Stream): return NotImplemented cdef Stream _other = other # Fast path: compare handles first - if (self._handle) != ((_other)._handle): + if intptr(self._h_stream) != intptr(_other._h_stream): return False # Ensure contexts are initialized for both streams - if self._ctx_handle == CU_CONTEXT_INVALID: - self._get_context() - if _other._ctx_handle == CU_CONTEXT_INVALID: - _other._get_context() + Stream_ensure_ctx(self) + Stream_ensure_ctx(_other) # Compare contexts as well - return (self._ctx_handle) == ((_other)._ctx_handle) + return intptr(self._h_context) == intptr(_other._h_context) @property def handle(self) -> cuda.bindings.driver.CUstream: @@ -197,7 +209,7 @@ cdef class Stream: This handle is a Python object. To get the memory address of the underlying C handle, call ``int(Stream.handle)``. 
""" - return driver.CUstream((self._handle)) + return py(self._h_stream) @property def is_nonblocking(self) -> bool: @@ -205,11 +217,8 @@ cdef class Stream: cdef unsigned int flags if self._nonblocking == -1: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetFlags(self._handle, &flags)) - if flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING: - self._nonblocking = True - else: - self._nonblocking = False + HANDLE_RETURN(cydriver.cuStreamGetFlags(native(self._h_stream), &flags)) + self._nonblocking = flags & cydriver.CUstream_flags.CU_STREAM_NON_BLOCKING return bool(self._nonblocking) @property @@ -218,14 +227,14 @@ cdef class Stream: cdef int prio if self._priority == INT32_MIN: with nogil: - HANDLE_RETURN(cydriver.cuStreamGetPriority(self._handle, &prio)) + HANDLE_RETURN(cydriver.cuStreamGetPriority(native(self._h_stream), &prio)) self._priority = prio return self._priority def sync(self): """Synchronize the stream.""" with nogil: - HANDLE_RETURN(cydriver.cuStreamSynchronize(self._handle)) + HANDLE_RETURN(cydriver.cuStreamSynchronize(native(self._h_stream))) def record(self, event: Event = None, options: EventOptions = None) -> Event: """Record an event onto the stream. @@ -250,17 +259,17 @@ cdef class Stream: # on the stream. Event flags such as disabling timing, nonblocking, # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. if event is None: - self._get_device_and_context() - event = Event._init((self._device_id), (self._ctx_handle), options, False) + Stream_ensure_ctx_device(self) + event = cyEvent._init(cyEvent, self._device_id, self._h_context, options, False) elif event.is_ipc_enabled: raise TypeError( "IPC-enabled events should not be re-recorded, instead create a " "new event by supplying options." 
) - cdef cydriver.CUevent e = ((event))._handle + cdef cydriver.CUevent e = native(((event))._h_event) with nogil: - HANDLE_RETURN(cydriver.cuEventRecord(e, self._handle)) + HANDLE_RETURN(cydriver.cuEventRecord(e, native(self._h_stream))) return event def wait(self, event_or_stream: Union[Event, Stream]): @@ -273,32 +282,35 @@ cdef class Stream: on the stream and then waiting on it. """ - cdef cydriver.CUevent event - cdef cydriver.CUstream stream + cdef Stream stream + cdef EventHandle h_event + # Handle Event directly if isinstance(event_or_stream, Event): - event = (event_or_stream.handle) with nogil: # TODO: support flags other than 0? - HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) + HANDLE_RETURN(cydriver.cuStreamWaitEvent( + native(self._h_stream), native((event_or_stream)._h_event), 0)) + return + + # Convert to Stream if needed + if isinstance(event_or_stream, Stream): + stream = event_or_stream else: - if isinstance(event_or_stream, Stream): - stream = (event_or_stream.handle) - else: - try: - s = Stream._init(obj=event_or_stream) - except Exception as e: - raise ValueError( - "only an Event, Stream, or object supporting __cuda_stream__ can be waited," - f" got {type(event_or_stream)}" - ) from e - stream = (s.handle) - with nogil: - HANDLE_RETURN(cydriver.cuEventCreate(&event, cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING)) - HANDLE_RETURN(cydriver.cuEventRecord(event, stream)) - # TODO: support flags other than 0? 
- HANDLE_RETURN(cydriver.cuStreamWaitEvent(self._handle, event, 0)) - HANDLE_RETURN(cydriver.cuEventDestroy(event)) + try: + stream = Stream._init(obj=event_or_stream) + except Exception as e: + raise ValueError( + "only an Event, Stream, or object supporting __cuda_stream__ can be waited," + f" got {type(event_or_stream)}" + ) from e + + # Wait on stream via temporary event + with nogil: + h_event = create_event_handle_noctx(cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING) + HANDLE_RETURN(cydriver.cuEventRecord(native(h_event), native(stream._h_stream))) + # TODO: support flags other than 0? + HANDLE_RETURN(cydriver.cuStreamWaitEvent(native(self._h_stream), native(h_event), 0)) @property def device(self) -> Device: @@ -312,32 +324,15 @@ cdef class Stream: """ from cuda.core._device import Device # avoid circular import - self._get_device_and_context() - return Device((self._device_id)) - - cdef int _get_context(self) except?-1 nogil: - if self._ctx_handle == CU_CONTEXT_INVALID: - HANDLE_RETURN(cydriver.cuStreamGetCtx(self._handle, &(self._ctx_handle))) - return 0 - - cdef int _get_device_and_context(self) except?-1: - cdef cydriver.CUcontext curr_ctx - if self._device_id == cydriver.CU_DEVICE_INVALID: - with nogil: - # Get the current context - HANDLE_RETURN(cydriver.cuCtxGetCurrent(&curr_ctx)) - # Get the stream's context (self.ctx_handle is populated) - self._get_context() - # Get the stream's device (may require a context-switching dance) - self._device_id = get_device_from_ctx(self._ctx_handle, curr_ctx) - return 0 + Stream_ensure_ctx_device(self) + return Device(self._device_id) @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this stream.""" - self._get_context() - self._get_device_and_context() - return Context._from_ctx((self._ctx_handle), (self._device_id)) + Stream_ensure_ctx(self) + Stream_ensure_ctx_device(self) + return Context._from_handle(Context, self._h_context, self._device_id) @staticmethod def 
from_handle(handle: int) -> Stream: @@ -417,6 +412,36 @@ cpdef Stream default_stream(): return C_LEGACY_DEFAULT_STREAM +cdef inline int Stream_ensure_ctx(Stream self) except?-1 nogil: + """Ensure the stream's context handle is populated.""" + cdef cydriver.CUcontext ctx + if not self._h_context: + HANDLE_RETURN(cydriver.cuStreamGetCtx(native(self._h_stream), &ctx)) + with gil: + self._h_context = create_context_handle_ref(ctx) + return 0 + + +cdef inline int Stream_ensure_ctx_device(Stream self) except?-1: + """Ensure the stream's context and device_id are populated.""" + cdef cydriver.CUcontext ctx + cdef cydriver.CUdevice target_dev + cdef bint switch_context + + if self._device_id < 0: + with nogil: + # Get device ID from context, switching context temporarily if needed + Stream_ensure_ctx(self) + switch_context = (get_current_context() != self._h_context) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPushCurrent(native(self._h_context))) + HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) + if switch_context: + HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) + self._device_id = target_dev + return 0 + + cdef cydriver.CUstream _handle_from_stream_protocol(obj) except*: if isinstance(obj, Stream): return (obj.handle) diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd index ce30285aa5..9b5044beda 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/_utils/cuda_utils.pxd @@ -22,10 +22,6 @@ ctypedef fused integer_t: cdef const cydriver.CUcontext CU_CONTEXT_INVALID = (-2) -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil - - cdef int HANDLE_RETURN(supported_error_type err) except?-1 nogil diff --git a/cuda_core/cuda/core/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx index 0c3f6521a4..c7f867a0d5 100644 --- a/cuda_core/cuda/core/_utils/cuda_utils.pyx +++ 
b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -197,25 +197,6 @@ def precondition(checker: Callable[..., None], str what="") -> Callable: return outer -cdef cydriver.CUdevice get_device_from_ctx( - cydriver.CUcontext target_ctx, cydriver.CUcontext curr_ctx) except?cydriver.CU_DEVICE_INVALID nogil: - """Get device ID from the given ctx.""" - cdef bint switch_context = (curr_ctx != target_ctx) - cdef cydriver.CUcontext ctx - cdef cydriver.CUdevice target_dev - with nogil: - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert curr_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(target_ctx)) - HANDLE_RETURN(cydriver.cuCtxGetDevice(&target_dev)) - if switch_context: - HANDLE_RETURN(cydriver.cuCtxPopCurrent(&ctx)) - assert target_ctx == ctx - HANDLE_RETURN(cydriver.cuCtxPushCurrent(curr_ctx)) - return target_dev - - def is_sequence(obj): """ Check if the given object is a sequence (list or tuple). diff --git a/cuda_core/cuda/core/experimental/_context.pxd b/cuda_core/cuda/core/experimental/_context.pxd new file mode 100644 index 0000000000..58ca887908 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_context.pxd @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# Backward compatibility stub - use cuda.core._context instead +from cuda.core._context cimport Context diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 94a9e931cc..bf509dc1c3 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -70,6 +70,7 @@ include = ["cuda.core*"] [tool.setuptools.package-data] "cuda.core._include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core._cpp" = ["*.cpp", "*.hpp"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py new file mode 100644 index 0000000000..ca4ecc0749 --- /dev/null +++ b/cuda_core/tests/memory_ipc/test_ipc_duplicate_import.py @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Test for duplicate IPC buffer imports. + +Verifies that importing the same buffer descriptor multiple times returns the +same underlying handle, and that closing all imports works correctly without +crashing. This tests the workaround for nvbug 5570902 where IPC-imported +pointers are not correctly reference counted by the driver. 
+""" + +import contextlib +import multiprocessing as mp + +import pytest +from cuda.core import Buffer, Device +from helpers.logging import TimestampedLogger + +CHILD_TIMEOUT_SEC = 20 +NBYTES = 64 +POOL_SIZE = 2097152 + +ENABLE_LOGGING = False # Set True for test debugging and development + + +def child_main(log, queue): + log.prefix = " child: " + log("ready") + device = Device() + device.set_current() + mr = queue.get() + buffer_desc1 = queue.get() + buffer_desc2 = queue.get() + + # Import the same buffer twice - should return same handle due to cache + buffer1 = Buffer.from_ipc_descriptor(mr, buffer_desc1) + buffer2 = Buffer.from_ipc_descriptor(mr, buffer_desc2) + + log(f"buffer1.handle = {buffer1.handle}") + log(f"buffer2.handle = {buffer2.handle}") + log(f"same handle: {buffer1.handle == buffer2.handle}") + + # Close both - should not crash + buffer1.close() + log("buffer1 closed") + + buffer2.close() + log("buffer2 closed") + + device.sync() + log("done") + + +class TestIpcDuplicateImport: + """Test that duplicate IPC imports return the same handle and close safely.""" + + @pytest.fixture(autouse=True) + def _set_start_method(self): + # Ensure spawn is used for multiprocessing + with contextlib.suppress(RuntimeError): + mp.set_start_method("spawn", force=True) + + def test_main(self, ipc_device, ipc_memory_resource): + log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) + ipc_device.set_current() + mr = ipc_memory_resource + + log("allocating buffer") + buffer = mr.allocate(NBYTES) + + # Start the child process. + log("starting child") + queue = mp.Queue() + process = mp.Process(target=child_main, args=(log, queue)) + process.start() + + # Send the memory resource and buffer descriptor twice. 
+ log("sending mr and buffer descriptors") + queue.put(mr) + queue.put(buffer.get_ipc_descriptor()) + queue.put(buffer.get_ipc_descriptor()) + + log("waiting for child") + process.join(timeout=CHILD_TIMEOUT_SEC) + log(f"child exit code: {process.exitcode}") + assert process.exitcode == 0, f"Child process failed with exit code {process.exitcode}" + log("done") diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index a93e49e4e8..281ed4ab1c 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -9,8 +9,6 @@ """ from cuda.core import Device, Stream -from cuda.core._context import Context -from cuda.core._event import Event, EventOptions from cuda.core._stream import StreamOptions # ============================================================================ @@ -105,50 +103,34 @@ def test_event_subclass_equality(init_cuda): Event uses isinstance() for equality checking, similar to Stream. """ - - class MyEvent(Event): - pass - device = Device(0) device.set_current() - # Create two different events - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() + event3 = device.create_event() # Different events should not be equal (different handles) - assert event != my_event, "Different Event instances are not equal" + assert event1 != event2, "Different Event instances are not equal" + assert event2 != event3, "Different Event instances are not equal" - # Same subclass type with different handles - my_event2 = MyEvent._init(device.device_id, device.context, options=EventOptions()) - assert my_event != my_event2, "Different MyEvent instances are not equal" - - -def test_context_subclass_equality(init_cuda): - """Test Context subclass equality behavior.""" - - class MyContext(Context): - pass +def 
test_context_equality(init_cuda): + """Test Context equality behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - - # MyContext._from_ctx() returns a Context instance, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context, not subclass" - assert type(my_context) is not MyContext - - # Since both are Context instances with same handle, they're equal - assert context == my_context, "Context instances with same handle are equal" - # Create another context from different stream + # Get context from different sources + stream1 = device.create_stream() stream2 = device.create_stream() + context1 = stream1.context context2 = stream2.context + device_context = device.context # Same device, same primary context, should be equal - assert context == context2, "Contexts from same device are equal" + assert context1 == context2, "Contexts from same device are equal" + assert context1 == device_context, "Stream context equals device context" def test_subclass_type_safety(init_cuda): diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index 0d8f3a3c2d..ef075d8580 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -148,14 +148,12 @@ def test_event_context(init_cuda): assert context is not None -def test_event_subclassing(): - class MyEvent(Event): - pass - +def test_event_creation(): + """Test Event creation via public API.""" dev = Device() dev.set_current() - event = MyEvent._init(dev.device_id, dev.context) - assert isinstance(event, MyEvent) + event = dev.create_event() + assert isinstance(event, Event) # ============================================================================ diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 9bc89969a2..feeae9b07b 100644 --- a/cuda_core/tests/test_hashable.py +++ 
b/cuda_core/tests/test_hashable.py @@ -13,8 +13,6 @@ """ from cuda.core import Device -from cuda.core._context import Context -from cuda.core._event import Event, EventOptions from cuda.core._stream import Stream, StreamOptions # ============================================================================ @@ -128,65 +126,51 @@ class MyStream(Stream): assert hash(my_stream) != hash(my_stream2), "Different streams have different hashes" -def test_event_subclass_hash(init_cuda): - """Test Event subclass hash behavior.""" - - class MyEvent(Event): - pass - +def test_event_hash(init_cuda): + """Test Event hash behavior.""" device = Device(0) device.set_current() - # Create events with different handles - event = Event._init(device.device_id, device.context, options=EventOptions()) - my_event = MyEvent._init(device.device_id, device.context, options=EventOptions()) + # Create events using public API + event1 = device.create_event() + event2 = device.create_event() # Different events (different handles) -> different hashes - assert hash(event) != hash(my_event), "Different events have different hashes" - assert event != my_event, "Different handles means not equal" + assert hash(event1) != hash(event2), "Different events have different hashes" + assert event1 != event2, "Different handles means not equal" # Verify hash consistency - hash1 = hash(event) - hash2 = hash(event) + hash1 = hash(event1) + hash2 = hash(event1) assert hash1 == hash2, "Hash is consistent across multiple calls" # Both should be usable as dict keys - cache = {event: "base", my_event: "subclass"} + cache = {event1: "first", event2: "second"} assert len(cache) == 2, "Different events are distinct dict keys" - assert cache[event] == "base" - assert cache[my_event] == "subclass" - - -def test_context_subclass_hash(init_cuda): - """Test Context subclass hash behavior. 
+ assert cache[event1] == "first" + assert cache[event2] == "second" - Context._from_ctx() always returns Context instances, even when called - as MyContext._from_ctx(). This means we can't create actual MyContext - instances in practice. - """ - - class MyContext(Context): - pass +def test_context_hash(init_cuda): + """Test Context hash behavior.""" device = Device(0) device.set_current() - stream = device.create_stream() - context = stream.context - # MyContext._from_ctx() returns Context, not MyContext - my_context = MyContext._from_ctx(context._handle, device.device_id) - assert type(my_context) is Context, "_from_ctx returns Context type" + # Get context from different sources + stream1 = device.create_stream() + stream2 = device.create_stream() + context1 = stream1.context + context2 = stream2.context - # Same handle -> same hash - assert hash(context) == hash(my_context), "Contexts with same handle have same hash" + # Same underlying context -> same hash + assert hash(context1) == hash(context2), "Contexts with same handle have same hash" # Verify equality matches hash - assert context == my_context, "Contexts with same handle are equal" - assert hash(context) == hash(my_context), "Equal contexts have equal hashes" + assert context1 == context2, "Contexts with same handle are equal" # Verify hash consistency - hash1 = hash(context) - hash2 = hash(context) + hash1 = hash(context1) + hash2 = hash(context1) assert hash1 == hash2, "Hash is consistent across multiple calls" @@ -200,33 +184,24 @@ def test_hash_equality_contract_maintained(init_cuda): allowing cross-type equality with consistent hashing. 
""" - class MyStream(Stream): - pass - - class MyEvent(Event): - pass - - class MyContext(Context): - pass - device = Device(0) device.set_current() - # Test Stream: base and subclass with same handle - my_stream = MyStream._init(options=StreamOptions(), device_id=device.device_id) - stream = Stream.from_handle(int(my_stream.handle)) + # Test Stream: two references to same handle + stream1 = device.create_stream() + stream2 = Stream.from_handle(int(stream1.handle)) - assert my_stream == stream, "Equal due to isinstance() check and same handle" - assert hash(my_stream) == hash(stream), "Equal objects have equal hashes" + assert stream1 == stream2, "Equal due to same handle" + assert hash(stream1) == hash(stream2), "Equal objects have equal hashes" - # Test Context: always returns base type from _from_ctx - ctx = device.context - my_ctx = MyContext._from_ctx(ctx._handle, device.device_id) + # Test Context: contexts from same device share same underlying context + ctx1 = device.context + ctx2 = device.create_stream().context - assert ctx == my_ctx, "Equal contexts with same handle" - assert hash(ctx) == hash(my_ctx), "Equal objects have equal hashes" + assert ctx1 == ctx2, "Equal contexts with same handle" + assert hash(ctx1) == hash(ctx2), "Equal objects have equal hashes" # Test that different handles still produce different hashes - my_stream2 = MyStream._init(options=StreamOptions(), device_id=device.device_id) - assert my_stream != my_stream2, "Different handles means not equal" - assert hash(my_stream) != hash(my_stream2), "Different objects have different hashes" + stream3 = device.create_stream() + assert stream1 != stream3, "Different handles means not equal" + assert hash(stream1) != hash(stream3), "Different objects have different hashes" diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 01b0b861af..925daa7cd5 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -74,7 +74,7 @@ def 
test_stream_context(init_cuda): stream = Device().create_stream(options=StreamOptions()) context = stream.context assert context is not None - assert context._handle is not None + assert context.handle is not None def test_stream_from_foreign_stream(init_cuda):