diff --git a/.bumpversion.toml b/.bumpversion.toml
index 8a8ac17855..ec13b7baef 100644
--- a/.bumpversion.toml
+++ b/.bumpversion.toml
@@ -6,6 +6,11 @@ filename = "INSTALLING.rst"
 search = "hoomd={current_version}"
 replace = "hoomd={new_version}"
 
+[[tool.bumpversion.files]]
+filename = "BUILDING.rst"
+search = "tree/v{current_version}"
+replace = "tree/v{new_version}"
+
 [[tool.bumpversion.files]]
 filename = "CMakeLists.txt"
 search = 'HOOMD_VERSION_RAW "{current_version}"'
diff --git a/.github/workflows/build_and_test.yaml b/.github/workflows/build_and_test.yaml
index e5935bbb25..843cec3b74 100644
--- a/.github/workflows/build_and_test.yaml
+++ b/.github/workflows/build_and_test.yaml
@@ -87,12 +87,18 @@ jobs:
           echo 'test_docker_options=--gpus=all' >> "$GITHUB_OUTPUT"
 
           case "${{ inputs.compiler_version }}" in
+            130)
+              echo "image=nvidia/cuda:13.0.2-devel-ubuntu24.04" >> "$GITHUB_OUTPUT";;
+            129)
+              echo "image=nvidia/cuda:12.9.1-devel-ubuntu24.04" >> "$GITHUB_OUTPUT";;
+            128)
+              echo "image=nvidia/cuda:12.8.1-devel-ubuntu24.04" >> "$GITHUB_OUTPUT";;
+            126)
+              echo "image=nvidia/cuda:12.6.3-devel-ubuntu24.04" >> "$GITHUB_OUTPUT";;
             125)
-              echo "image=nvidia/cuda:12.5.0-devel-ubuntu22.04" >> "$GITHUB_OUTPUT";;
+              echo "image=nvidia/cuda:12.5.1-devel-ubuntu24.04" >> "$GITHUB_OUTPUT";;
             124)
               echo "image=nvidia/cuda:12.4.1-devel-ubuntu22.04" >> "$GITHUB_OUTPUT";;
-            122)
-              echo "image=nvidia/cuda:12.2.2-devel-ubuntu22.04" >> "$GITHUB_OUTPUT";;
             *)
               echo "Unknown compiler" && exit 1;;
             esac
@@ -134,6 +140,56 @@ jobs:
         activate-environment: true
         manifest-path: code/pixi.toml
 
+    - name: Clone hip
+      if: ${{ inputs.compiler_family == 'cuda' }}
+      uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
+      with:
+        repository: ROCm/rocm-systems
+        ref: 75ad45d5f131738a4d888499026edfeaa3a11fe3 # hip-version_7.2.53220
+        path: rocm-systems
+
+    - name: Clone hipCUB
+      if: ${{ inputs.compiler_family == 'cuda' }}
+      uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1
+      with:
+        repository: ROCm/rocm-libraries
+        ref: 1a01b92fd0971c98258c0ea7811aea2e55ec8698 # rocm-7.1.0
+        sparse-checkout: projects/hipcub
+        path: rocm-libraries
+
+    - name: Patch hip
+      if: ${{ inputs.compiler_family == 'cuda' }}
+      run: |
+        git apply ${GITHUB_WORKSPACE}/code/.github/workflows/fix-cuda-12-rocm-systems.patch
+      working-directory: rocm-systems
+
+    - name: Patch hipcub
+      if: ${{ inputs.compiler_family == 'cuda' }}
+      run: |
+        git apply ${GITHUB_WORKSPACE}/code/.github/workflows/fix-cuda-12-rocm-libraries.patch
+      working-directory: rocm-libraries
+
+    - name: Install hip
+      if: ${{ inputs.compiler_family == 'cuda' }}
+      run: |
+        export CLR_DIR="$(readlink -f rocm-systems/projects/clr)"
+        export HIP_DIR="$(readlink -f rocm-systems/projects/hip)"
+        export HIP_OTHER="$(readlink -f rocm-systems/projects/hipother)"
+        cd "$CLR_DIR"
+        mkdir build
+        cd build
+        cmake -DHIP_COMMON_DIR=$HIP_DIR -DHIP_PLATFORM=nvidia -DCMAKE_INSTALL_PREFIX=${GITHUB_WORKSPACE}/local -DCLR_BUILD_HIP=ON -DCLR_BUILD_OCL=OFF -DHIPNV_DIR=$HIP_OTHER/hipnv ..
+        make install
+
+    - name: Install hipcub
+      if: ${{ inputs.compiler_family == 'cuda' }}
+      run: |
+        mkdir build
+        cd build
+        cmake ../ -Dhip_ROOT=${GITHUB_WORKSPACE}/local -DHIP_PLATFORM=nvidia -DCMAKE_INSTALL_PREFIX=${GITHUB_WORKSPACE}/local
+        make install
+      working-directory: rocm-libraries/projects/hipcub
+
     - name: Set compiler
       if: ${{ inputs.compiler_family != 'cuda' }}
       run: |
@@ -157,8 +213,9 @@ jobs:
                                -DBUILD_MPCD=${BUILD_MD:-"ON"} \
                                -DBUILD_METAL=${BUILD_MD:-"ON"} \
                                -DBUILD_HPMC=${BUILD_HPMC:-"ON"} \
-                               -DCUDA_ARCH_LIST="60;70" \
+                               -DCUDA_ARCH_LIST="80" \
                                -DCMAKE_INSTALL_PREFIX=${GITHUB_WORKSPACE}/install \
+                               -Dhip_ROOT=${GITHUB_WORKSPACE}/local \
                                -DPLUGINS=""
       env:
         ENABLE_GPU: ${{ contains(inputs.config, 'cuda') }}
diff --git a/.github/workflows/fix-cuda-12-rocm-libraries.patch b/.github/workflows/fix-cuda-12-rocm-libraries.patch
new file mode 100644
index 0000000000..c18d74b6ce
--- /dev/null
+++ b/.github/workflows/fix-cuda-12-rocm-libraries.patch
@@ -0,0 +1,129 @@
+diff --git a/projects/hipcub/CMakeLists.txt b/projects/hipcub/CMakeLists.txt
+index fbcecf0fc4..0fbae70749 100644
+--- a/projects/hipcub/CMakeLists.txt
++++ b/projects/hipcub/CMakeLists.txt
+@@ -174,7 +174,7 @@ math(EXPR hipcub_VERSION_NUMBER "${hipcub_VERSION_MAJOR} * 100000 + ${hipcub_VER
+ include(VerifyCompiler)
+ 
+ # Get dependencies (except rocm-cmake, included earlier)
+-include(Dependencies)
++# include(Dependencies)
+ 
+ if(BUILD_ADDRESS_SANITIZER)
+   add_compile_options(-fsanitize=address -shared-libasan)
+diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp
+index 0f22c40517..31fab88dd6 100644
+--- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp
++++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_for.hpp
+@@ -30,6 +30,7 @@
+ #define HIPCUB_CUB_DEVICE_DEVICE_FOR_HPP_
+ 
+ #include "../../../config.hpp"
++#if CUDA_VERSION >= 12060
+ 
+ #include <cub/device/device_for.cuh> // IWYU pragma: export
+ 
+@@ -194,4 +195,5 @@ HIPCUB_RUNTIME_FUNCTION
+ 
+ END_HIPCUB_NAMESPACE
+ 
++#endif
+ #endif // HIPCUB_CUB_DEVICE_DEVICE_FOR_HPP_
+diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp
+index f314f5a128..e22f08fba9 100644
+--- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp
++++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_merge.hpp
+@@ -30,6 +30,7 @@
+ #define HIPCUB_CUB_DEVICE_DEVICE_MERGE_HPP_
+ 
+ #include "../../../config.hpp"
++#if CUDA_VERSION >= 12080
+ 
+ #include <cub/device/device_merge.cuh> // IWYU pragma: export
+ 
+@@ -104,4 +105,5 @@ struct DeviceMerge
+ 
+ END_HIPCUB_NAMESPACE
+ 
++#endif
+ #endif // HIPCUB_CUB_DEVICE_DEVICE_MERGE_HPP_
+diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp
+index 72ad11f7bc..038fddccdd 100644
+--- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp
++++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_scan.hpp
+@@ -182,6 +182,7 @@ public:
+                                                  stream);
+     }
+ 
++#if CUDA_VERSION >= 12080
+     template<typename InputIteratorT,
+              typename OutputIteratorT,
+              typename ScanOpT,
+@@ -206,6 +207,7 @@ public:
+                                                                            num_items,
+                                                                            stream));
+     }
++#endif
+ 
+     template<typename InputIteratorT, typename OutputIteratorT, typename NumItemsT>
+     HIPCUB_RUNTIME_FUNCTION
+diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp
+index 6812c5cfeb..33bce4aa23 100644
+--- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp
++++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_select.hpp
+@@ -223,6 +223,7 @@ public:
+                   stream);
+     }
+ 
++#if CUDA_VERSION >= 12060
+     template<typename InputIteratorT,
+              typename FlagIterator,
+              typename OutputIteratorT,
+@@ -329,6 +330,7 @@ public:
+                          select_op,
+                          stream);
+     }
++#endif
+ 
+     template<typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT>
+     HIPCUB_RUNTIME_FUNCTION
+diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_transform.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_transform.hpp
+index 681a0bbf98..d071c7d859 100644
+--- a/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_transform.hpp
++++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/device/device_transform.hpp
+@@ -30,6 +30,7 @@
+ #define HIBCUB_ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
+ 
+ #include "../../../config.hpp"
++#if CUDA_VERSION >= 12090
+ 
+ #include <hip/hip_runtime.h>
+ 
+@@ -203,4 +204,5 @@ struct DeviceTransform
+ 
+ END_HIPCUB_NAMESPACE
+ 
++#endif
+ #endif // HIBCUB_ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
+diff --git a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp
+index fc67d645b1..63540b717f 100644
+--- a/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp
++++ b/projects/hipcub/hipcub/include/hipcub/backend/cub/util_temporary_storage.hpp
+@@ -50,10 +50,17 @@ HIPCUB_HOST_DEVICE HIPCUB_FORCEINLINE hipError_t
+                      void* (&allocations)[ALLOCATIONS],
+                      const size_t (&allocation_sizes)[ALLOCATIONS])
+ {
++    #if CUDA_VERSION >= 12090
+     cudaError_t error = ::cub::detail::AliasTemporaries(d_temp_storage,
+                                                         temp_storage_bytes,
+                                                         allocations,
+                                                         allocation_sizes);
++    #else
++    cudaError_t error = ::cub::AliasTemporaries(d_temp_storage,
++                                                        temp_storage_bytes,
++                                                        allocations,
++                                                        allocation_sizes);
++    #endif
+ 
+     if(cudaSuccess == error)
+     {
diff --git a/.github/workflows/fix-cuda-12-rocm-systems.patch b/.github/workflows/fix-cuda-12-rocm-systems.patch
new file mode 100644
index 0000000000..3160dd2760
--- /dev/null
+++ b/.github/workflows/fix-cuda-12-rocm-systems.patch
@@ -0,0 +1,41 @@
+diff --git a/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+index 8f6c295aab..05036a57fe 100644
+--- a/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
++++ b/projects/hipother/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+@@ -869,7 +869,7 @@ static inline void hipMemcpy2DTocudaMemcpy2D(CUDA_MEMCPY2D* a, const hip_Memcpy2
+   a->Height = (size_t)p->Height;
+ }
+ 
+-#if CUDA_VERSION >= CUDA_12020
++#if CUDA_VERSION >= 12080
+ typedef enum cudaMemcpyFlags hipMemcpyFlags;
+ #define hipMemcpyFlagDefault cudaMemcpyFlagDefault
+ #define hipMemcpyFlagPreferOverlapWithCompute cudaMemcpyFlagPreferOverlapWithCompute
+@@ -2430,6 +2430,7 @@ inline static hipError_t hipMemcpy2DToArrayAsync(hipArray_t dst, size_t wOffset,
+       cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch, width, height, kind, stream));
+ }
+ 
++#if CUDA_VERSION >= 12080
+ inline static hipError_t hipMemcpyBatchAsync(void** dsts, void** srcs, size_t* sizes, size_t count,
+                                              hipMemcpyAttributes* attrs, size_t* attrsIdxs,
+                                              size_t numAttrs, size_t* failIdx, hipStream_t stream) {
+@@ -2467,6 +2468,7 @@ inline static hipError_t hipMemcpy3DPeer(hipMemcpy3DPeerParms* p) {
+ inline static hipError_t hipMemcpy3DPeerAsync(hipMemcpy3DPeerParms* p, hipStream_t stream) {
+   return hipCUDAErrorTohipError(cudaMemcpy3DPeerAsync(p, stream));
+ }
++#endif
+ 
+ __HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray_t dst, size_t wOffset,
+                                                            size_t hOffset, const void* src,
+@@ -3756,9 +3758,11 @@ inline static hipError_t hipLibraryEnumerateKernels(hipKernel_t* kernels, unsign
+   return hipCUResultTohipError(cuLibraryEnumerateKernels(kernels, numKernels, library));
+ }
+ 
++#if CUDA_VERSION >= 12060
+ inline static hipError_t hipKernelGetLibrary(hipLibrary_t* library, hipKernel_t kernel) {
+   return hipCUResultTohipError(cuKernelGetLibrary(library, kernel));
+ }
++#endif
+ 
+ inline static hipError_t hipKernelGetName(const char** name, hipKernel_t kernel) {
+   return hipCUResultTohipError(cuKernelGetName(name, kernel));
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 0a5bcb46bb..0d55f64ae0 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -53,13 +53,13 @@ jobs:
         - config: [gcc, 14, -py, 313, -nomd, -nohpmc]
         - config: [gcc, 10, -py, 310, -mpi]
 
-        - config: [cuda, 124, -py, 313, -mpi]
+        - config: [cuda, 125, -py, 313, -mpi]
           validate: true
 
-        - config: [cuda, 124, -py, 313]
+        - config: [cuda, 125, -py, 313]
           validate: true
 
-        - config: [cuda, 124, -py, 313, -mpi, -debug]
+        - config: [cuda, 125, -py, 313, -mpi, -debug]
 
 
   release:
@@ -95,6 +95,9 @@ jobs:
         - config: [gcc, 13, -py, 312, -mpi]
         - config: [gcc, 12, -py, 311, -mpi]
         - config: [gcc, 11, -py, 310, -mpi]
+        - config: [cuda, 126, -py, 313, -mpi]
+        - config: [cuda, 128, -py, 313, -mpi]
+        - config: [cuda, 129, -py, 313, -mpi]
 
   tests_complete:
     name: Unit test
diff --git a/.gitmodules b/.gitmodules
index 22dff3fe03..9ba5dad040 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,13 +7,6 @@
 [submodule "hoomd/extern/quickhull"]
 	path = hoomd/extern/quickhull
 	url = https://github.com/glotzerlab/quickhull
-[submodule "hoomd/extern/HIP"]
-	path = hoomd/extern/HIP
-	url = https://github.com/glotzerlab/HIP.git
-[submodule "hoomd/extern/hipCUB"]
-	path = hoomd/extern/hipCUB
-	url = https://github.com/glotzerlab/hipCUB
-	branch = header_only
 [submodule "hoomd/extern/neighbor"]
 	path = hoomd/extern/neighbor
 	url = https://github.com/mphowardlab/neighbor.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b0abc2d07d..932b02cb70 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -35,7 +35,7 @@ repos:
   - id: end-of-file-fixer
     exclude_types: [svg]
   - id: trailing-whitespace
-    exclude_types: [svg]
+    exclude_types: [svg, diff]
   - id: check-json
   - id: check-yaml
     exclude: "\\.clang-format"
diff --git a/BUILDING.rst b/BUILDING.rst
index eaad8681bf..193d9cbc51 100644
--- a/BUILDING.rst
+++ b/BUILDING.rst
@@ -124,22 +124,30 @@ Install additional packages needed to build the documentation:
 
 **For GPU execution** (required when ``ENABLE_GPU=on``):
 
-- **NVIDIA CUDA Toolkit**
+.. tab:: NVIDIA (CUDA)
 
-  *OR*
+    - NVIDIA CUDA Toolkit
+    - hip (`hip installation instructions`_)
+    - hipcub (`hipcub installation instructions`_)
 
-- AMD ROCm
-- HIP [with ``hipcc`` and ``hcc`` as backend]
-- rocFFT
-- rocPRIM
-- rocThrust
-- hipCUB
-- roctracer-dev
+    .. note::
 
-.. note::
+        hip ``hip-version_7.2.53220`` and hipcub ``rocm-7.1.0`` work with CUDA 12.9.
+        Apply the patches in `.github/workflows`_ to add support for CUDA 12.5–12.8.
+
+.. tab:: AMD (HIP)
+
+    - AMD ROCm
+    - HIP [with ``hipcc`` and ``hcc`` as backend]
+    - rocFFT
+    - rocPRIM
+    - rocThrust
+    - hipCUB
+    - roctracer-dev
 
-    When ``ENABLE_GPU=on``, HOOMD-blue will default to CUDA. Set ``HOOMD_GPU_PLATFORM=HIP`` to
-    choose HIP.
+.. _`hip installation instructions`: https://rocm.docs.amd.com/projects/HIP/en/latest/install/build.html
+.. _`hipcub installation instructions`: https://rocm.docs.amd.com/projects/hipCUB/en/latest/install/hipCUB-install-overview.html
+.. _`.github/workflows`: https://github.com/glotzerlab/hoomd-blue/tree/v6.0.0/.github/workflows
 
 **To build the documentation:**
 
@@ -147,6 +155,7 @@ Install additional packages needed to build the documentation:
 - **furo**
 - **nbsphinx**
 - **ipython**
+- **sphinx-inline-tabs**
 
 .. _Obtain the source:
 
diff --git a/CMake/hoomd/FindCUDALibs.cmake b/CMake/hoomd/FindCUDALibs.cmake
index 37e275254f..792c444913 100644
--- a/CMake/hoomd/FindCUDALibs.cmake
+++ b/CMake/hoomd/FindCUDALibs.cmake
@@ -1,7 +1,7 @@
 # Find CUDA libraries and binaries used by HOOMD
 
 set(REQUIRED_CUDA_LIB_VARS "")
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     # find CUDA library path
     get_filename_component(CUDA_BIN_PATH ${CMAKE_CUDA_COMPILER} DIRECTORY)
     get_filename_component(CUDA_LIB_PATH "${CUDA_BIN_PATH}/../lib64/" ABSOLUTE)
@@ -23,7 +23,7 @@ else()
     add_library(CUDA::cudart UNKNOWN IMPORTED)
 endif()
 
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     find_library(CUDA_cudadevrt_LIBRARY cudadevrt HINTS ${CUDA_LIB_PATH})
     mark_as_advanced(CUDA_cudadevrt_LIBRARY)
     if(CUDA_cudadevrt_LIBRARY AND NOT TARGET CUDA::cudadevrt)
@@ -38,7 +38,7 @@ else()
     add_library(CUDA::cudadevrt UNKNOWN IMPORTED)
 endif()
 
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     find_library(CUDA_cufft_LIBRARY cufft HINTS ${CUDA_LIB_PATH})
     mark_as_advanced(CUDA_cufft_LIBRARY)
     if(CUDA_cufft_LIBRARY AND NOT TARGET CUDA::cufft)
@@ -55,7 +55,7 @@ else()
     add_library(CUDA::cufft UNKNOWN IMPORTED)
 endif()
 
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     find_library(CUDA_cusolver_LIBRARY cusolver HINTS ${CUDA_LIB_PATH})
     mark_as_advanced(CUDA_cusolver_LIBRARY)
     if(CUDA_cusolver_LIBRARY AND NOT TARGET CUDA::cusolver)
@@ -72,7 +72,7 @@ else()
     add_library(CUDA::cusolver UNKNOWN IMPORTED)
 endif()
 
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     find_library(CUDA_cusparse_LIBRARY cusparse HINTS ${CUDA_LIB_PATH})
     mark_as_advanced(CUDA_cusparse_LIBRARY)
     if(CUDA_cusparse_LIBRARY AND NOT TARGET CUDA::cusparse)
@@ -96,7 +96,7 @@ if (HIP_PLATFORM STREQUAL "amd")
     message("Found hipfft includes: ${hipfft_INCLUDE_DIR}")
 endif()
 
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     # find compute-sanitizer / cuda-memcheck
     find_program(CUDA_MEMCHECK_EXECUTABLE
       NAMES compute-sanitizer
@@ -115,7 +115,7 @@ if (HIP_PLATFORM STREQUAL "nvcc")
     mark_as_advanced(CUDA_MEMCHECK_EXECUTABLE)
 endif()
 
-if (HIP_PLATFORM STREQUAL "nvcc")
+if (HIP_PLATFORM STREQUAL "nvidia")
     include(FindPackageHandleStandardArgs)
     find_package_handle_standard_args(CUDALibs
       REQUIRED_VARS
diff --git a/CMake/hoomd/HOOMDCUDASetup.cmake b/CMake/hoomd/HOOMDCUDASetup.cmake
index 4da0263536..63958e07ad 100644
--- a/CMake/hoomd/HOOMDCUDASetup.cmake
+++ b/CMake/hoomd/HOOMDCUDASetup.cmake
@@ -1,6 +1,6 @@
 # setup CUDA compile options
 if (ENABLE_HIP)
-    if (HIP_PLATFORM STREQUAL "nvcc")
+    if (HIP_PLATFORM STREQUAL "nvidia")
         # setup nvcc to build for all CUDA architectures. Allow user to modify the list if desired
         if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
             set(CUDA_ARCH_LIST 80 CACHE STRING "List of target sm_ architectures to compile CUDA code for. Separate with semicolons.")
@@ -9,7 +9,7 @@ if (ENABLE_HIP)
         endif()
 
         # ignore warnings about unused results
-        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-unused-result -diag-suppress 2810")
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-unused-result -Wno-deprecated-declarations -diag-suppress 2810")
 
         if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.2)
           set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUSPARSE_NEW_API")
@@ -55,7 +55,7 @@ if (ENABLE_HIP)
 endif (ENABLE_HIP)
 
 # set CUSOLVER_AVAILABLE depending on CUDA Toolkit version
-if (ENABLE_HIP AND HIP_PLATFORM STREQUAL "nvcc")
+if (ENABLE_HIP AND HIP_PLATFORM STREQUAL "nvidia")
     # CUDA 8.0 requires that libgomp be linked in - see if we can link it
     try_compile(_can_link_gomp
                 ${CMAKE_CURRENT_BINARY_DIR}/tmp
diff --git a/CMake/hoomd/HOOMDHIPSetup.cmake b/CMake/hoomd/HOOMDHIPSetup.cmake
index 8a15aa72d1..502ee7278a 100644
--- a/CMake/hoomd/HOOMDHIPSetup.cmake
+++ b/CMake/hoomd/HOOMDHIPSetup.cmake
@@ -1,7 +1,7 @@
 if(ENABLE_HIP)
 
     if (HOOMD_GPU_PLATFORM STREQUAL "HIP")
-        find_package(HIP REQUIRED)
+        find_package(hip REQUIRED)
         CMAKE_MINIMUM_REQUIRED(VERSION 3.21 FATAL_ERROR)
         ENABLE_LANGUAGE(HIP)
         SET(HOOMD_DEVICE_LANGUAGE HIP)
@@ -14,26 +14,21 @@ if(ENABLE_HIP)
         ENABLE_LANGUAGE(CUDA)
         SET(HOOMD_DEVICE_LANGUAGE CUDA)
 
-        set(HIP_INCLUDE_DIR "$<IF:$<STREQUAL:${CMAKE_PROJECT_NAME},HOOMD>,${CMAKE_CURRENT_SOURCE_DIR},${HOOMD_INSTALL_PREFIX}/${PYTHON_SITE_INSTALL_DIR}/include>/hoomd/extern/HIP/include/")
-
         # use CUDA runtime version
         string(REGEX MATCH "([0-9]*).([0-9]*).([0-9]*).*" _hip_version_match "${CMAKE_CUDA_COMPILER_VERSION}")
         set(HIP_VERSION_MAJOR "${CMAKE_MATCH_1}")
         set(HIP_VERSION_MINOR "${CMAKE_MATCH_2}")
         set(HIP_VERSION_PATCH "${CMAKE_MATCH_3}")
-        set(HIP_PLATFORM "nvcc")
-
-        # hipCUB
-        # Use system provided CUB for CUDA 11 and newer
-        set(HIPCUB_INCLUDE_DIR "$<IF:$<STREQUAL:${CMAKE_PROJECT_NAME},HOOMD>,${CMAKE_CURRENT_SOURCE_DIR},${HOOMD_INSTALL_PREFIX}/${PYTHON_SITE_INSTALL_DIR}/include>/hoomd/extern/hipCUB/hipcub/include/")
+        set(HIP_PLATFORM "nvidia")
+        find_package(hip REQUIRED)
     else()
         message(FATAL_ERROR "HOOMD_GPU_PLATFORM must be either CUDA or HIP")
     endif()
 
-    if(NOT TARGET hip::host)
-        add_library(hip::host INTERFACE IMPORTED)
+    if(${HIP_PLATFORM} STREQUAL "nvidia")
+        # HIP does not configure hip::host properly for the nvidia platform
         set_target_properties(hip::host PROPERTIES
-            INTERFACE_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR};${HIPCUB_INCLUDE_DIR}")
+            INTERFACE_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR}")
 
         # set HIP_VERSION_* on non-CUDA targets (the version is already defined on AMD targets through hipcc)
         set_property(TARGET hip::host APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS
@@ -46,10 +41,10 @@ if(ENABLE_HIP)
     endif()
 
     # branch upon HCC or NVCC target
-    if(${HIP_PLATFORM} STREQUAL "nvcc")
-        set_property(TARGET hip::host APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS __HIP_PLATFORM_NVCC__)
+    if(${HIP_PLATFORM} STREQUAL "nvidia")
+        set_property(TARGET hip::host APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS __HIP_PLATFORM_NVIDIA__ HIPPER_CUDA)
     elseif(${HIP_PLATFORM} STREQUAL "amd")
-        set_property(TARGET hip::host APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS __HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__)
+        set_property(TARGET hip::host APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS __HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__ HIPPER_HIP)
     endif()
 
     find_package(CUDALibs REQUIRED)
diff --git a/CMake/hoomd/HOOMDMPISetup.cmake b/CMake/hoomd/HOOMDMPISetup.cmake
index b4194420b5..6f432503ee 100644
--- a/CMake/hoomd/HOOMDMPISetup.cmake
+++ b/CMake/hoomd/HOOMDMPISetup.cmake
@@ -36,12 +36,12 @@ if (ENABLE_MPI)
     mark_as_advanced(OMPI_INFO)
 
 if (ENABLE_HIP)
-    string(REPLACE "-pthread" "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<STREQUAL:${HIP_PLATFORM},nvcc>>:-Xcompiler>;-pthread"
+    string(REPLACE "-pthread" "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<STREQUAL:${HIP_PLATFORM},nvidia>>:-Xcompiler>;-pthread"
       _MPI_C_COMPILE_OPTIONS "${MPI_C_COMPILE_OPTIONS}")
     set_property(TARGET MPI::MPI_C PROPERTY INTERFACE_COMPILE_OPTIONS "${_MPI_C_COMPILE_OPTIONS}")
     unset(_MPI_C_COMPILE_OPTIONS)
 
-    string(REPLACE "-pthread" "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<STREQUAL:${HIP_PLATFORM},nvcc>>:-Xcompiler>;-pthread"
+    string(REPLACE "-pthread" "$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<STREQUAL:${HIP_PLATFORM},nvidia>>:-Xcompiler>;-pthread"
       _MPI_CXX_COMPILE_OPTIONS "${MPI_CXX_COMPILE_OPTIONS}")
     set_property(TARGET MPI::MPI_CXX PROPERTY INTERFACE_COMPILE_OPTIONS "${_MPI_CXX_COMPILE_OPTIONS}")
     unset(_MPI_CXX_COMPILE_OPTIONS)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d47352292a..523d9974b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,7 +75,7 @@ if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-sign-conversion -Wno-unknown-pragmas -Wno-deprecated-declarations -Wno-unused-result")
 
     # suppress warnings regarding HIP's overly complex vector structs
-    if (CMAKE_COMPILER_IS_GNUCXXH AND OOMD_GPU_PLATFORM STREQUAL "HIP")
+    if (CMAKE_COMPILER_IS_GNUCXX AND HOOMD_GPU_PLATFORM STREQUAL "HIP")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-class-memaccess")
     endif()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
@@ -144,7 +144,6 @@ endif()
 if (
     NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/hoomd/extern/nano-signal-slot/nano_signal_slot.hpp OR
     NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/hoomd/extern/upp11/upp11.h OR
-    NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/hoomd/extern/HIP/include/hip/hip_runtime.h OR
     NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/hoomd/extern/quickhull/ConvexHull.hpp OR
     NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/hoomd/extern/OpenRAND/include/openrand/philox.h OR
     NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/hoomd/extern/neighbor/include/neighbor/neighbor.h OR
diff --git a/INSTALLING.rst b/INSTALLING.rst
index 32532d62af..7db4aa524d 100644
--- a/INSTALLING.rst
+++ b/INSTALLING.rst
@@ -20,32 +20,81 @@ Serial CPU and single GPU builds
 *linux-64*, *osx-64*, and *osx-arm64* platforms. Install the ``hoomd`` package from the conda-forge_
 channel:
 
-.. code-block:: bash
+.. tab:: Pixi
 
-    micromamba install hoomd=6.0.0
+    .. code-block:: bash
+
+        pixi add hoomd=6.0.0
+
+.. tab:: Micromamba
+
+    .. code-block:: bash
+
+        micromamba install hoomd=6.0.0
+
+.. tab:: Mamba
+
+    .. code-block:: bash
+
+        mamba install hoomd=6.0.0
 
 .. _conda-forge: https://conda-forge.org/docs/user/introduction.html
 
 By default, micromamba auto-detects whether your system has a GPU and attempts to install the
 appropriate package. Override this and force the GPU enabled package installation with:
 
-.. code-block:: bash
+.. tab:: Pixi
+
+    First add the following to your ``pixi.toml`` file:
+
+    .. code-block:: toml
+
+        [system-requirements]
+        cuda = "12.9"
+
+    Then, add hoomd with:
+
+    .. code-block:: bash
+
+        pixi add "hoomd=6.0.0=*gpu*"
 
-    export CONDA_OVERRIDE_CUDA="12.6"
-    micromamba install "hoomd=6.0.0=*gpu*" "cuda-version=12.6"
+
+.. tab:: Micromamba
+
+    .. code-block:: bash
+
+        export CONDA_OVERRIDE_CUDA="12.9"
+        micromamba install "hoomd=6.0.0=*gpu*" "cuda-version=12.9"
+
+.. tab:: Mamba
+
+    .. code-block:: bash
+
+        export CONDA_OVERRIDE_CUDA="12.9"
+        mamba install "hoomd=6.0.0=*gpu*" "cuda-version=12.9"
 
 .. note::
 
-    conda-forge_ may update to a new version of CUDA. If the above command results in an error,
-    replace ``12.6`` with the version noted in micromamba's error message.
+    conda-forge_ may update to a new version of CUDA after these instructions are published.
+    If the above command results in an error, replace ``12.9`` with the version noted in
+    your package manager's error message.
 
 Similarly, you can force CPU-only package installation with:
 
-.. code-block:: bash
+.. tab:: Pixi
 
-    micromamba install "hoomd=6.0.0=*cpu*"
+    .. code-block:: bash
 
-.. note::
+        pixi add "hoomd=6.0.0=*cpu*"
+
+.. tab:: Micromamba
+
+    .. code-block:: bash
+
+        micromamba install "hoomd=6.0.0=*cpu*"
+
+.. tab:: Mamba
+
+    .. code-block:: bash
 
-    CUDA 11.8 compatible packages are also available. Replace "12.0" with "11.8" above when
-    installing HOOMD-blue on systems with CUDA 11 compatible drivers.
+        mamba install "hoomd=6.0.0=*cpu*"
diff --git a/hoomd/CMakeLists.txt b/hoomd/CMakeLists.txt
index 4abb33a2d0..713e6f639a 100644
--- a/hoomd/CMakeLists.txt
+++ b/hoomd/CMakeLists.txt
@@ -223,7 +223,7 @@ add_library(HOOMD::_hoomd ALIAS _hoomd)
 # Work around support for the delete operator with pybind11 and older versions of clang
 # https://github.com/pybind/pybind11/issues/1604
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    target_compile_options(_hoomd PUBLIC $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<STREQUAL:${HIP_PLATFORM},nvcc>>:-Xcompiler=>;-fsized-deallocation)
+    target_compile_options(_hoomd PUBLIC $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<STREQUAL:${HIP_PLATFORM},nvidia>>:-Xcompiler=>;-fsized-deallocation)
 endif()
 
 # add quick hull as its own library so that it's symbols can be public
@@ -267,7 +267,7 @@ target_compile_definitions(_hoomd PUBLIC HOOMD_LONGREAL_SIZE=${HOOMD_LONGREAL_SI
 if (ENABLE_HIP)
     if (HIP_PLATFORM STREQUAL "amd")
         target_link_libraries(_hoomd PUBLIC hip::hipfft)
-    elseif(HIP_PLATFORM STREQUAL "nvcc")
+    elseif(HIP_PLATFORM STREQUAL "nvidia")
         target_link_libraries(_hoomd PUBLIC CUDA::cudart CUDA::cufft)
     endif()
     target_compile_definitions(_hoomd PUBLIC ENABLE_HIP CUDA_ARCH=${_cuda_min_arch})
diff --git a/hoomd/ExecutionConfiguration.cc b/hoomd/ExecutionConfiguration.cc
index dd173dfc20..0829682a1b 100644
--- a/hoomd/ExecutionConfiguration.cc
+++ b/hoomd/ExecutionConfiguration.cc
@@ -7,7 +7,7 @@
 #ifdef ENABLE_HIP
 #include <hip/hip_runtime.h>
 
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 #include <cuda_runtime.h>
 #endif
 #endif
@@ -212,7 +212,7 @@ void ExecutionConfiguration::handleHIPError(hipError_t err,
             file += strlen(HOOMD_SOURCE_DIR);
 
         std::ostringstream s;
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         cudaError_t cuda_error = cudaPeekAtLastError();
         s << "CUDA Error: " << string(cudaGetErrorString(cuda_error));
 #else
@@ -263,7 +263,7 @@ void ExecutionConfiguration::initializeGPU(int gpu_id)
 
     if (gpu_id != -1)
         {
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         cudaSetValidDevices(&s_capable_gpu_ids[gpu_id], 1);
 #endif
         hipSetDeviceFlags(hipDeviceMapHost);
@@ -272,7 +272,7 @@ void ExecutionConfiguration::initializeGPU(int gpu_id)
     else
         {
             // initialize the default CUDA context from one of the capable GPUs
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         cudaSetValidDevices(&s_capable_gpu_ids[0], (int)s_capable_gpu_ids.size());
 #endif
         hipSetDeviceFlags(hipDeviceMapHost);
@@ -327,7 +327,7 @@ void ExecutionConfiguration::scanGPUs()
     if (error != hipSuccess)
         {
         std::string message = "Failed to get GPU device count: ";
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         cudaError_t cuda_error = cudaPeekAtLastError();
         message += string(cudaGetErrorString(cuda_error));
 #else
@@ -352,7 +352,7 @@ void ExecutionConfiguration::scanGPUs()
         if (error != hipSuccess)
             {
             std::string message = "Failed to get device properties: ";
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
             cudaError_t cuda_error = cudaPeekAtLastError();
             message += string(cudaGetErrorString(cuda_error));
 #else
@@ -362,7 +362,7 @@ void ExecutionConfiguration::scanGPUs()
             continue;
             }
 
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         // exclude a GPU if it's compute version is not high enough
         int compoundComputeVer = prop.minor + prop.major * 10;
 
@@ -386,7 +386,7 @@ void ExecutionConfiguration::scanGPUs()
             }
 
         // exclude a GPU when it doesn't support mapped memory
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         int supports_managed_memory = 0;
         cudaError_t cuda_error = cudaDeviceGetAttribute(&supports_managed_memory,
                                                         cudaDevAttrConcurrentManagedAccess,
diff --git a/hoomd/ExecutionConfiguration.h b/hoomd/ExecutionConfiguration.h
index f24f20afd1..1a23f3ddf5 100644
--- a/hoomd/ExecutionConfiguration.h
+++ b/hoomd/ExecutionConfiguration.h
@@ -152,7 +152,7 @@ class PYBIND11_EXPORT ExecutionConfiguration
         hipSetDevice(m_gpu_id);
         hipDeviceSynchronize();
 
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         hipProfilerStart();
 #elif defined(__HIP_PLATFORM_HCC__)
 #ifdef ENABLE_ROCTRACER
@@ -167,7 +167,7 @@ class PYBIND11_EXPORT ExecutionConfiguration
         {
         hipSetDevice(m_gpu_id);
         hipDeviceSynchronize();
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         hipProfilerStop();
 #elif defined(__HIP_PLATFORM_HCC__)
 #ifdef ENABLE_ROCTRACER
diff --git a/hoomd/HOOMDMath.h b/hoomd/HOOMDMath.h
index dbae39dbcc..365fe0ad2b 100644
--- a/hoomd/HOOMDMath.h
+++ b/hoomd/HOOMDMath.h
@@ -233,7 +233,7 @@ namespace fast
 inline HOSTDEVICE float rsqrt(float x)
     {
 #ifdef __HIP_DEVICE_COMPILE__
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
     return ::rsqrtf(x);
 #elif defined(__HIP_PLATFORM_HCC__)
     return ::__frsqrt_rn(x);
@@ -248,7 +248,7 @@ inline HOSTDEVICE float rsqrt(float x)
 //! Compute the reciprocal square root of x
 inline HOSTDEVICE double rsqrt(double x)
     {
-#if defined(__HIP_DEVICE_COMPILE__) && defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_DEVICE_COMPILE__) && defined(__HIP_PLATFORM_NVIDIA__)
     return ::rsqrt(x);
 #else
     return 1.0 / ::sqrt(x);
@@ -720,7 +720,7 @@ HOSTDEVICE inline hoomd::Scalar3 operator+(const hoomd::Scalar3& a, const hoomd:
     return hoomd::make_scalar3(a.x + b.x, a.y + b.y, a.z + b.z);
     }
 
-#if !defined(ENABLE_HIP) || defined(__HIP_PLATFORM_NVCC__)
+#if !defined(ENABLE_HIP) || defined(__HIP_PLATFORM_NVIDIA__)
 //! Vector addition
 HOSTDEVICE inline hoomd::Scalar3& operator+=(hoomd::Scalar3& a, const hoomd::Scalar3& b)
     {
diff --git a/hoomd/HOOMDVersion.cc b/hoomd/HOOMDVersion.cc
index 55b252838e..a748b6dc93 100644
--- a/hoomd/HOOMDVersion.cc
+++ b/hoomd/HOOMDVersion.cc
@@ -21,7 +21,7 @@ std::string BuildInfo::getCompileFlags()
     int hip_minor = HIP_VERSION_MINOR;
 
     o << "GPU [";
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
     o << "CUDA";
 #elif defined(__HIP_PLATFORM_HCC__)
     o << "ROCm";
@@ -105,7 +105,7 @@ std::string BuildInfo::getGPUAPIVersion()
 
 std::string BuildInfo::getGPUPlatform()
     {
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
     return std::string("CUDA");
 #elif defined(__HIP_PLATFORM_HCC__)
     return std::string("ROCm");
diff --git a/hoomd/ManagedArray.h b/hoomd/ManagedArray.h
index 2ca58b0eea..efd026da78 100644
--- a/hoomd/ManagedArray.h
+++ b/hoomd/ManagedArray.h
@@ -189,8 +189,8 @@ template<class T> class ManagedArray
         {
         if (managed && ptr)
             {
-#if defined(__HIP_PLATFORM_NVCC__) && (CUDART_VERSION >= 8000)
-            cudaMemAdvise(ptr, sizeof(T) * N, cudaMemAdviseSetReadMostly, 0);
+#if defined(__HIP_PLATFORM_NVIDIA__) && (CUDART_VERSION >= 8000)
+            hipMemAdvise(ptr, sizeof(T) * N, hipMemAdviseSetReadMostly, 0);
 #endif
             }
         }
diff --git a/hoomd/WarpTools.cuh b/hoomd/WarpTools.cuh
index 410775d93c..2c10a05717 100644
--- a/hoomd/WarpTools.cuh
+++ b/hoomd/WarpTools.cuh
@@ -65,7 +65,7 @@ class WarpReduce
     public:
     DEVICE WarpReduce()
         {
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         static_assert(PTX_ARCH >= 300, "PTX architecture must be >= 300");
         static_assert(LOGICAL_WARP_THREADS <= CUB_PTX_WARP_THREADS,
                       "Logical warp size cannot exceed hardware warp size");
@@ -196,7 +196,7 @@ class WarpScan
     public:
     DEVICE WarpScan()
         {
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
         static_assert(PTX_ARCH >= 300, "PTX architecture must be >= 300");
         static_assert(LOGICAL_WARP_THREADS <= CUB_PTX_WARP_THREADS,
                       "Logical warp size cannot exceed hardware warp size");
diff --git a/hoomd/extern/ECL.cuh b/hoomd/extern/ECL.cuh
index 38b48dcdea..2ab5276eb8 100644
--- a/hoomd/extern/ECL.cuh
+++ b/hoomd/extern/ECL.cuh
@@ -57,7 +57,7 @@ inline void ecl_connected_components(const int nodes,
 static const int Device = 0;
 static const int ThreadsPerBlock = 256;
 
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 static const int warpsize = 32;
 #else
 static const int warpsize = 64;
@@ -166,7 +166,7 @@ void compute2(const int nodes, const int* const __restrict__ nidx, const int* co
   int idx;
   if (lane == 0) idx = atomicAdd(&posL, 1);
 
-  #ifdef __HIP_PLATFORM_NVCC__
+  #ifdef __HIP_PLATFORM_NVIDIA__
   idx = __shfl_sync(0xffffffff,idx, 0);
   #else
   idx = __shfl(idx,0);
@@ -200,7 +200,7 @@ void compute2(const int nodes, const int* const __restrict__ nidx, const int* co
     }
     if (lane == 0) idx = atomicAdd(&posL, 1);
 
-    #ifdef __HIP_PLATFORM_NVCC__
+    #ifdef __HIP_PLATFORM_NVIDIA__
     idx = __shfl_sync(0xffffffff,idx, 0);
     #else
     idx = __shfl(idx,0);
diff --git a/hoomd/extern/HIP b/hoomd/extern/HIP
deleted file mode 160000
index db753e4ea7..0000000000
--- a/hoomd/extern/HIP
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit db753e4ea7a715afec405117250cecef9e882b33
diff --git a/hoomd/extern/hipCUB b/hoomd/extern/hipCUB
deleted file mode 160000
index 64d8adf32b..0000000000
--- a/hoomd/extern/hipCUB
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 64d8adf32bd48d8723cc7df9e5c970169e2845b5
diff --git a/hoomd/hpmc/CMakeLists.txt b/hoomd/hpmc/CMakeLists.txt
index cc9a35abc0..383138b6d0 100644
--- a/hoomd/hpmc/CMakeLists.txt
+++ b/hoomd/hpmc/CMakeLists.txt
@@ -168,7 +168,7 @@ endif()
 # link the library to its dependencies
 target_link_libraries(_hpmc PUBLIC _hoomd)
 
-if (ENABLE_HIP AND HIP_PLATFORM STREQUAL "nvcc")
+if (ENABLE_HIP AND HIP_PLATFORM STREQUAL "nvidia")
 target_link_libraries(_hpmc PUBLIC CUDA::cusparse )
 endif()
 
diff --git a/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh b/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh
index 3f1f808583..cb20300a00 100644
--- a/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh
+++ b/hoomd/hpmc/IntegratorHPMCMonoGPU.cuh
@@ -29,7 +29,7 @@ namespace hpmc
     {
 namespace gpu
     {
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 #define MAX_BLOCK_SIZE 1024
 #define MIN_BLOCK_SIZE 32
 #else
@@ -42,7 +42,7 @@ namespace kernel
     {
 //! Check narrow-phase overlaps
 template<class Shape, unsigned int max_threads>
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 __launch_bounds__(max_threads)
 #endif
     __global__ void hpmc_narrow_phase(const Scalar4* d_postype,
diff --git a/hoomd/hpmc/IntegratorHPMCMonoGPU.h b/hoomd/hpmc/IntegratorHPMCMonoGPU.h
index 8525f31353..d6b93d9562 100644
--- a/hoomd/hpmc/IntegratorHPMCMonoGPU.h
+++ b/hoomd/hpmc/IntegratorHPMCMonoGPU.h
@@ -840,12 +840,12 @@ template<class Shape> void IntegratorHPMCMonoGPU<Shape>::updateCellWidth()
     // update the cell list
     this->m_cl->setNominalWidth(this->m_nominal_width);
 
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
     // set memory hints
-    cudaMemAdvise(this->m_params.data(),
-                  this->m_params.size() * sizeof(typename Shape::param_type),
-                  cudaMemAdviseSetReadMostly,
-                  0);
+    hipMemAdvise(this->m_params.data(),
+                 this->m_params.size() * sizeof(typename Shape::param_type),
+                 hipMemAdviseSetReadMostly,
+                 0);
     CHECK_CUDA_ERROR();
 #endif
 
diff --git a/hoomd/hpmc/UpdaterGCAGPU.cu b/hoomd/hpmc/UpdaterGCAGPU.cu
index 42dd69d976..8dbf341109 100644
--- a/hoomd/hpmc/UpdaterGCAGPU.cu
+++ b/hoomd/hpmc/UpdaterGCAGPU.cu
@@ -23,7 +23,7 @@
 #include <thrust/unique.h>
 #pragma GCC diagnostic pop
 
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 #include <cusparse.h>
 #endif
 
@@ -39,7 +39,7 @@ namespace hpmc
     {
 namespace gpu
     {
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 #define check_cusparse(a)                                                                 \
         {                                                                                 \
         cusparseStatus_t status = (a);                                                    \
@@ -277,7 +277,7 @@ void connected_components(uint2* d_adj,
                           const hipDeviceProp_t& dev_prop,
                           CachedAllocator& alloc)
     {
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
     thrust::device_ptr<uint2> adj(d_adj);
 
     // sort the list of pairs
diff --git a/hoomd/hpmc/UpdaterGCAGPU.cuh b/hoomd/hpmc/UpdaterGCAGPU.cuh
index 58ecb063d2..3efb10e5e0 100644
--- a/hoomd/hpmc/UpdaterGCAGPU.cuh
+++ b/hoomd/hpmc/UpdaterGCAGPU.cuh
@@ -21,7 +21,7 @@
 
 #include "IntegratorHPMCMonoGPUTypes.cuh"
 
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 #define MAX_BLOCK_SIZE 1024
 #define MIN_BLOCK_SIZE 256 // a reasonable minimum to limit the number of template instantiations
 #else
@@ -194,7 +194,7 @@ namespace kernel
     {
 //! Check narrow-phase overlaps
 template<class Shape, unsigned int max_threads>
-#ifdef __HIP_PLATFORM_NVCC__
+#ifdef __HIP_PLATFORM_NVIDIA__
 __launch_bounds__(max_threads)
 #endif
     __global__ void hpmc_cluster_overlaps(const Scalar4* d_postype,
diff --git a/hoomd/md/AnisoPotentialPairGPU.cuh b/hoomd/md/AnisoPotentialPairGPU.cuh
index 82015e2c39..f56cb06dc7 100644
--- a/hoomd/md/AnisoPotentialPairGPU.cuh
+++ b/hoomd/md/AnisoPotentialPairGPU.cuh
@@ -24,7 +24,7 @@
 
 //! Maximum number of threads (width of a warp)
 // currently this is hardcoded, we should set it to the max of platforms
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 const int gpu_aniso_pair_force_max_tpp = 32;
 #elif defined(__HIP_PLATFORM_HCC__)
 const int gpu_aniso_pair_force_max_tpp = 64;
diff --git a/hoomd/md/CommunicatorGrid.cc b/hoomd/md/CommunicatorGrid.cc
index 10251c915b..9f8cf2fcf4 100644
--- a/hoomd/md/CommunicatorGrid.cc
+++ b/hoomd/md/CommunicatorGrid.cc
@@ -12,7 +12,7 @@
 #if defined(ENABLE_HIP)
 #if __HIP_PLATFORM_HCC__
 #include <hipfft.h>
-#elif __HIP_PLATFORM_NVCC__
+#elif __HIP_PLATFORM_NVIDIA__
 #include <cufft.h>
 typedef cufftComplex hipfftComplex;
 #endif
diff --git a/hoomd/md/CommunicatorGridGPU.cc b/hoomd/md/CommunicatorGridGPU.cc
index 3fd9320328..83e6334e6c 100644
--- a/hoomd/md/CommunicatorGridGPU.cc
+++ b/hoomd/md/CommunicatorGridGPU.cc
@@ -10,7 +10,7 @@
 
 #if __HIP_PLATFORM_HCC__
 #include <hipfft.h>
-#elif __HIP_PLATFORM_NVCC__
+#elif __HIP_PLATFORM_NVIDIA__
 #include <cufft.h>
 typedef cufftComplex hipfftComplex;
 #endif
diff --git a/hoomd/md/FrictionPairGPU.cuh b/hoomd/md/FrictionPairGPU.cuh
index 3932542ae0..1b665f3580 100644
--- a/hoomd/md/FrictionPairGPU.cuh
+++ b/hoomd/md/FrictionPairGPU.cuh
@@ -22,7 +22,7 @@
 
 //! Maximum number of threads (width of a warp)
 // currently this is hardcoded, we should set it to the max of platforms
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 const int gpu_friction_pair_force_max_tpp = 32;
 #elif defined(__HIP_PLATFORM_HCC__)
 const int gpu_friction_pair_force_max_tpp = 64;
diff --git a/hoomd/md/NeighborListGPUBinned.cuh b/hoomd/md/NeighborListGPUBinned.cuh
index c05971d2e3..b7cf38d41b 100644
--- a/hoomd/md/NeighborListGPUBinned.cuh
+++ b/hoomd/md/NeighborListGPUBinned.cuh
@@ -15,7 +15,7 @@
 */
 
 // currently this is hardcoded, we should set it to the max of platforms
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 #define WARP_SIZE 32
 #elif defined(__HIP_PLATFORM_HCC__)
 #define WARP_SIZE 64
diff --git a/hoomd/md/NeighborListGPUStencil.cuh b/hoomd/md/NeighborListGPUStencil.cuh
index b303b52fa0..1db5c5f9da 100644
--- a/hoomd/md/NeighborListGPUStencil.cuh
+++ b/hoomd/md/NeighborListGPUStencil.cuh
@@ -14,7 +14,7 @@
     \brief Declares GPU kernel code for neighbor list generation on the GPU
 */
 
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 #define WARP_SIZE 32
 #elif defined(__HIP_PLATFORM_HCC__)
 #define WARP_SIZE 64
diff --git a/hoomd/md/PPPMForceComputeGPU.h b/hoomd/md/PPPMForceComputeGPU.h
index d74df567e9..da2ec75dd0 100644
--- a/hoomd/md/PPPMForceComputeGPU.h
+++ b/hoomd/md/PPPMForceComputeGPU.h
@@ -10,7 +10,7 @@
 
 #if __HIP_PLATFORM_HCC__
 #include <hipfft.h>
-#elif __HIP_PLATFORM_NVCC__
+#elif __HIP_PLATFORM_NVIDIA__
 #include <cufft.h>
 typedef cufftComplex hipfftComplex;
 typedef cufftHandle hipfftHandle;
diff --git a/hoomd/md/PotentialPair.h b/hoomd/md/PotentialPair.h
index b5948bdb91..dec76d2784 100644
--- a/hoomd/md/PotentialPair.h
+++ b/hoomd/md/PotentialPair.h
@@ -368,14 +368,14 @@ PotentialPair<evaluator>::PotentialPair(std::shared_ptr<SystemDefinition> sysdef
     m_r_cut_nlist = std::make_shared<GPUArray<Scalar>>(m_typpair_idx.getNumElements(), m_exec_conf);
     nlist->addRCutMatrix(m_r_cut_nlist);
 
-#if defined(ENABLE_HIP) && defined(__HIP_PLATFORM_NVCC__)
+#if defined(ENABLE_HIP) && defined(__HIP_PLATFORM_NVIDIA__)
     if (m_pdata->getExecConf()->isCUDAEnabled())
         {
         // m_params is _always_ in unified memory, so memadvise and prefetch
-        cudaMemAdvise(m_params.data(),
-                      m_params.size() * sizeof(param_type),
-                      cudaMemAdviseSetReadMostly,
-                      0);
+        hipMemAdvise(m_params.data(),
+                     m_params.size() * sizeof(param_type),
+                     hipMemAdviseSetReadMostly,
+                     0);
         cudaMemPrefetchAsync(m_params.data(),
                              sizeof(param_type) * m_params.size(),
                              m_exec_conf->getGPUId());
diff --git a/hoomd/md/PotentialPairDPDThermoGPU.cuh b/hoomd/md/PotentialPairDPDThermoGPU.cuh
index 8fcee9a0b6..59a2121e94 100644
--- a/hoomd/md/PotentialPairDPDThermoGPU.cuh
+++ b/hoomd/md/PotentialPairDPDThermoGPU.cuh
@@ -27,7 +27,7 @@ namespace md
 namespace kernel
     {
 // currently this is hardcoded, we should set it to the max of platforms
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 const int gpu_dpd_pair_force_max_tpp = 32;
 #elif defined(__HIP_PLATFORM_HCC__)
 const int gpu_dpd_pair_force_max_tpp = 64;
diff --git a/hoomd/md/PotentialPairGPU.cuh b/hoomd/md/PotentialPairGPU.cuh
index 45b2f17a41..24c6021687 100644
--- a/hoomd/md/PotentialPairGPU.cuh
+++ b/hoomd/md/PotentialPairGPU.cuh
@@ -32,7 +32,7 @@ namespace kernel
     {
 //! Maximum number of threads (width of a warp)
 // currently this is hardcoded, we should set it to the max of platforms
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 const int gpu_pair_force_max_tpp = 32;
 #elif defined(__HIP_PLATFORM_HCC__)
 const int gpu_pair_force_max_tpp = 64;
diff --git a/hoomd/md/PotentialTersoffGPU.cuh b/hoomd/md/PotentialTersoffGPU.cuh
index f3e2b859e2..b02ce1b098 100644
--- a/hoomd/md/PotentialTersoffGPU.cuh
+++ b/hoomd/md/PotentialTersoffGPU.cuh
@@ -29,7 +29,7 @@ namespace kernel
     {
 //! Maximum number of threads (width of a warp)
 // currently this is hardcoded, we should set it to the max of platforms
-#if defined(__HIP_PLATFORM_NVCC__)
+#if defined(__HIP_PLATFORM_NVIDIA__)
 const int gpu_tersoff_max_tpp = 32;
 #elif defined(__HIP_PLATFORM_HCC__)
 const int gpu_tersoff_max_tpp = 64;
diff --git a/hoomd/mpcd/ParticleData.cu b/hoomd/mpcd/ParticleData.cu
index 5fc9bd950a..3464cff9ee 100644
--- a/hoomd/mpcd/ParticleData.cu
+++ b/hoomd/mpcd/ParticleData.cu
@@ -13,7 +13,6 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wconversion"
 #include <cub/device/device_partition.cuh>
-#include <cub/iterator/counting_input_iterator.cuh>
 #include <thrust/iterator/counting_iterator.h>
 #pragma GCC diagnostic pop
 
diff --git a/sphinx-doc/conf.py b/sphinx-doc/conf.py
index 9d6183d456..c085ce84c6 100644
--- a/sphinx-doc/conf.py
+++ b/sphinx-doc/conf.py
@@ -30,6 +30,7 @@
     "sphinx.ext.napoleon",
     "sphinx.ext.intersphinx",
     "sphinx.ext.todo",
+    "sphinx_inline_tabs",
 ]
 
 if find_spec("sphinxcontrib.katex") is not None:
diff --git a/sphinx-doc/requirements.in b/sphinx-doc/requirements.in
index 3f87c4fa84..181a8c1c3f 100644
--- a/sphinx-doc/requirements.in
+++ b/sphinx-doc/requirements.in
@@ -9,3 +9,4 @@ sphinxcontrib-googleanalytics
 sphinxcontrib-katex
 sphinx-copybutton
 sphinx-notfound-page
+sphinx-inline-tabs
diff --git a/sphinx-doc/requirements.txt b/sphinx-doc/requirements.txt
index 48a07614af..50fab1737b 100644
--- a/sphinx-doc/requirements.txt
+++ b/sphinx-doc/requirements.txt
@@ -146,6 +146,7 @@ sphinx==8.1.3
     #   nbsphinx
     #   sphinx-basic-ng
     #   sphinx-copybutton
+    #   sphinx-inline-tabs
     #   sphinx-notfound-page
     #   sphinxcontrib-googleanalytics
     #   sphinxcontrib-katex
@@ -153,6 +154,8 @@ sphinx-basic-ng==1.0.0b2
     # via furo
 sphinx-copybutton==0.5.2
     # via -r sphinx-doc/requirements.in
+sphinx-inline-tabs==2023.4.21
+    # via -r sphinx-doc/requirements.in
 sphinx-notfound-page==1.1.0
     # via -r sphinx-doc/requirements.in
 sphinxcontrib-applehelp==2.0.0