feat: add MetaX MACA device support

simshi · simshi · commit 8a573d7eaca5 · 2025-11-21T09:05:50.000Z
diff --git a/setup.py b/setup.py
@@ -49,6 +49,9 @@ def _is_npu() -> bool:
 def _is_musa() -> bool:
     return PLATFORM == "musa"
 
+def _is_maca() -> bool:
+    """Return True when PLATFORM is set to ``"maca"``."""
+    return PLATFORM == "maca"
+
 
 class CMakeExtension(Extension):
     def __init__(self, name: str, sourcedir: str = ""):
@@ -91,6 +94,9 @@ def build_cmake(self, ext: CMakeExtension):
             cmake_args.append("-DRUNTIME_ENVIRONMENT=ascend")
         elif _is_musa():
             cmake_args.append("-DRUNTIME_ENVIRONMENT=musa")
+        elif _is_maca():
+            cmake_args.append("-DRUNTIME_ENVIRONMENT=maca")
+            cmake_args.append("-DBUILD_UCM_SPARSE=OFF")
         else:
             raise RuntimeError(
                 "No supported accelerator found. "
diff --git a/ucm/shared/trans/CMakeLists.txt b/ucm/shared/trans/CMakeLists.txt
@@ -1,6 +1,9 @@
 if(RUNTIME_ENVIRONMENT STREQUAL "ascend")
     add_subdirectory(ascend)
 endif()
+if(RUNTIME_ENVIRONMENT STREQUAL "maca")
+    add_subdirectory(maca)
+endif()
 if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     add_subdirectory(cuda)
 endif()
diff --git a/ucm/shared/trans/maca/CMakeLists.txt b/ucm/shared/trans/maca/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Builds the `trans` static library for the MetaX MACA runtime. The host-side sources are
+# shared with ../cuda; maca_sm_kernel.cu is compiled as a separate CUDA object library.
+
+# MACA_ROOT is the SDK install prefix. Every SDK path below is derived from it, so a
+# non-default install only needs -DMACA_ROOT=<dir>. CUDA_ROOT can still be overridden alone.
+set(MACA_ROOT "/opt/maca" CACHE PATH "Path to MetaX MACA SDK root directory")
+set(CUDA_ROOT "${MACA_ROOT}/tools/cu-bridge" CACHE PATH "Path to WCUDA root directory")
+
+# Select the cu-bridge compiler and its CMake modules before the CUDA language is enabled.
+set(CMAKE_CUDA_COMPILER ${CUDA_ROOT}/bin/cucc)
+list(APPEND CMAKE_MODULE_PATH "${CUDA_ROOT}/cmake_module/maca")
+enable_language(CUDA)
+
+# Device kernel, compiled on its own and linked into `trans` below.
+add_library(kernel OBJECT maca_sm_kernel.cu)
+target_compile_options(kernel PRIVATE
+    -Wall -fPIC
+    -std=c++17
+)
+
+# Host-side sources are reused unchanged from the CUDA backend directory.
+add_library(trans STATIC
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc
+)
+
+# Imported target bundling the cu-bridge library with the cu-bridge and MACA SDK headers.
+# NOTE(review): IMPORTED_LOCATION points into lib/ while `trans` searches lib64/ below;
+# confirm both directories exist in the SDK layout.
+add_library(WCUDA::cudart UNKNOWN IMPORTED)
+set_target_properties(WCUDA::cudart PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${CUDA_ROOT}/include"
+    IMPORTED_LOCATION "${CUDA_ROOT}/lib/libcuda.so"
+)
+target_include_directories(WCUDA::cudart INTERFACE
+    "${MACA_ROOT}/include"
+    "${MACA_ROOT}/include/mcr"
+)
+
+target_include_directories(trans PUBLIC ${CUDA_ROOT}/include)
+target_link_directories(trans PUBLIC ${CUDA_ROOT}/lib64)
+target_link_libraries(trans PUBLIC
+    fmt
+    WCUDA::cudart
+    kernel
+)
diff --git a/ucm/shared/trans/maca/maca_sm_kernel.cu b/ucm/shared/trans/maca/maca_sm_kernel.cu
@@ -0,0 +1,112 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * */
+#include <cstdint>
+#include "../cuda/cuda_sm_kernel.h"
+
+namespace UC::Trans {
+
+// Bytes moved by one CudaCopyUnit call: two uint4 values.
+#define CUDA_TRANS_UNIT_SIZE (sizeof(uint4) * 2)
+// Launch geometry used by every CudaSMCopyAsync overload: number of blocks, threads per block.
+#define CUDA_TRANS_BLOCK_NUMBER (32)
+#define CUDA_TRANS_BLOCK_SIZE (256)
+// Total threads per launch; the kernels advance their offset by this many units per iteration.
+#define CUDA_TRANS_THREAD_NUMBER (CUDA_TRANS_BLOCK_NUMBER * CUDA_TRANS_BLOCK_SIZE)
+
+// Copies one CUDA_TRANS_UNIT_SIZE chunk (two consecutive uint4 values) from src to dst.
+// Both loads are issued before either store. The volatile qualifier on dst is cast away so
+// the __stcg store intrinsic can be applied to a plain uint4 pointer.
+// NOTE(review): the uint4 accesses presumably need src and dst to be 16-byte aligned;
+// confirm against the callers.
+inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
+                                    volatile uint8_t* __restrict__ dst)
+{
+    auto in = reinterpret_cast<const uint4*>(src);
+    auto out = reinterpret_cast<uint4*>(const_cast<uint8_t*>(dst));
+
+    const uint4 first = __ldcs(&in[0]);
+    const uint4 second = __ldcs(&in[1]);
+    __stcg(&out[0], first);
+    __stcg(&out[1], second);
+}
+
+// Pointer-table to pointer-table copy. The transfer is addressed as one logical range of
+// size * num bytes; logical position p maps to byte (p % size) of src[p / size] and of
+// dst[p / size]. Each thread copies whole CUDA_TRANS_UNIT_SIZE units and then jumps ahead
+// by CUDA_TRANS_THREAD_NUMBER units. A trailing partial unit is not copied.
+// NOTE(review): presumably callers guarantee size is a multiple of CUDA_TRANS_UNIT_SIZE,
+// otherwise a unit would run past the end of a segment; confirm.
+__global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num)
+{
+    const size_t total = size * num;
+    const size_t stride = CUDA_TRANS_THREAD_NUMBER * CUDA_TRANS_UNIT_SIZE;
+    const size_t first = (blockIdx.x * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
+    for (size_t pos = first; pos + CUDA_TRANS_UNIT_SIZE <= total; pos += stride) {
+        const size_t seg = pos / size;
+        const size_t inner = pos % size;
+        const uint8_t* from = static_cast<const uint8_t*>(src[seg]) + inner;
+        uint8_t* to = static_cast<uint8_t*>(dst[seg]) + inner;
+        CudaCopyUnit(from, to);
+    }
+}
+
+// Pointer-table to flat-buffer copy. Logical position p (0 <= p < size * num) is read from
+// byte (p % size) of src[p / size] and written to dst + p. Each thread copies whole
+// CUDA_TRANS_UNIT_SIZE units and then jumps ahead by CUDA_TRANS_THREAD_NUMBER units.
+// A trailing partial unit is not copied.
+// NOTE(review): presumably callers guarantee size is a multiple of CUDA_TRANS_UNIT_SIZE,
+// otherwise a unit would run past the end of a source segment; confirm.
+__global__ void CudaCopyKernel(const void** src, void* dst, size_t size, size_t num)
+{
+    const size_t total = size * num;
+    const size_t stride = CUDA_TRANS_THREAD_NUMBER * CUDA_TRANS_UNIT_SIZE;
+    const size_t first = (blockIdx.x * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
+    uint8_t* const flat = static_cast<uint8_t*>(dst);
+    for (size_t pos = first; pos + CUDA_TRANS_UNIT_SIZE <= total; pos += stride) {
+        const size_t seg = pos / size;
+        const size_t inner = pos % size;
+        const uint8_t* from = static_cast<const uint8_t*>(src[seg]) + inner;
+        CudaCopyUnit(from, flat + pos);
+    }
+}
+
+// Flat-buffer to pointer-table copy. Logical position p (0 <= p < size * num) is read from
+// src + p and written to byte (p % size) of dst[p / size]. Each thread copies whole
+// CUDA_TRANS_UNIT_SIZE units and then jumps ahead by CUDA_TRANS_THREAD_NUMBER units.
+// A trailing partial unit is not copied.
+// NOTE(review): presumably callers guarantee size is a multiple of CUDA_TRANS_UNIT_SIZE,
+// otherwise a unit would run past the end of a destination segment; confirm.
+__global__ void CudaCopyKernel(const void* src, void** dst, size_t size, size_t num)
+{
+    const size_t total = size * num;
+    const size_t stride = CUDA_TRANS_THREAD_NUMBER * CUDA_TRANS_UNIT_SIZE;
+    const size_t first = (blockIdx.x * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
+    const uint8_t* const flat = static_cast<const uint8_t*>(src);
+    for (size_t pos = first; pos + CUDA_TRANS_UNIT_SIZE <= total; pos += stride) {
+        const size_t seg = pos / size;
+        const size_t inner = pos % size;
+        uint8_t* to = static_cast<uint8_t*>(dst[seg]) + inner;
+        CudaCopyUnit(flat + pos, to);
+    }
+}
+
+// Launches the pointer-table to pointer-table copy kernel on `stream`, passing `src` and
+// `dst` as tables of buffer pointers together with `size` and `number`.
+// Returns the launch status reported by cudaGetLastError().
+cudaError_t CudaSMCopyAsync(void* src[], void* dst[], size_t size, size_t number,
+                            cudaStream_t stream)
+{
+    // The cast is required: void** does not implicitly convert to const void**. Without it
+    // the two pointer-table-source kernels are not viable and overload resolution silently
+    // selects the (const void*, void**) kernel, which would read the pointer table itself
+    // as if it were the payload.
+    CudaCopyKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(
+        (const void**)src, dst, size, number);
+    return cudaGetLastError();
+}
+
+// Launches the pointer-table to flat-buffer copy kernel on `stream`: `src` is passed as a
+// table of source pointers and `dst` as a single destination pointer.
+// Returns the launch status reported by cudaGetLastError().
+cudaError_t CudaSMCopyAsync(void* src[], void* dst, size_t size, size_t number, cudaStream_t stream)
+{
+    // Explicit cast so the kernel overload taking a const void** source is the one selected.
+    const void** srcTable = (const void**)src;
+    CudaCopyKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(srcTable, dst,
+                                                                                  size, number);
+    const cudaError_t status = cudaGetLastError();
+    return status;
+}
+
+// Launches the flat-buffer to pointer-table copy kernel on `stream`: `src` is passed as a
+// single source pointer and `dst` as a table of destination pointers.
+// Returns the launch status reported by cudaGetLastError().
+cudaError_t CudaSMCopyAsync(void* src, void* dst[], size_t size, size_t number, cudaStream_t stream)
+{
+    // Spell out the void* -> const void* conversion the kernel parameter expects.
+    const void* flatSrc = src;
+    CudaCopyKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(flatSrc, dst,
+                                                                                  size, number);
+    const cudaError_t status = cudaGetLastError();
+    return status;
+}
+
+} // namespace UC::Trans
diff --git a/ucm/store/device/CMakeLists.txt b/ucm/store/device/CMakeLists.txt
@@ -2,6 +2,8 @@ if(RUNTIME_ENVIRONMENT STREQUAL "ascend")
     add_subdirectory(ascend)
 elseif(RUNTIME_ENVIRONMENT STREQUAL "musa")
     add_subdirectory(musa)
+elseif(RUNTIME_ENVIRONMENT STREQUAL "maca")
+    add_subdirectory(maca)
 elseif(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     add_subdirectory(cuda)
 elseif(RUNTIME_ENVIRONMENT STREQUAL "simu")
diff --git a/ucm/store/device/maca/CMakeLists.txt b/ucm/store/device/maca/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Builds the `storedevice` static library for the MetaX MACA runtime from maca_device.cu.
+
+# MACA_ROOT is the SDK install prefix. Every SDK path below is derived from it, so a
+# non-default install only needs -DMACA_ROOT=<dir>. CUDA_ROOT can still be overridden alone.
+set(MACA_ROOT "/opt/maca" CACHE PATH "Path to MetaX MACA SDK root directory")
+set(CUDA_ROOT "${MACA_ROOT}/tools/cu-bridge" CACHE PATH "Path to WCUDA root directory")
+
+# Select the cu-bridge compiler and its CMake modules before the CUDA language is enabled.
+set(CMAKE_CUDA_COMPILER ${CUDA_ROOT}/bin/cucc)
+list(APPEND CMAKE_MODULE_PATH "${CUDA_ROOT}/cmake_module/maca")
+# NOTE(review): this architecture list looks carried over from the CUDA backend; confirm
+# that the cu-bridge toolchain expects or ignores these values.
+set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90)
+enable_language(CUDA)
+
+add_library(storedevice STATIC maca_device.cu)
+
+# Imported target bundling the cu-bridge library with the cu-bridge and MACA SDK headers.
+add_library(WCUDA::cudart UNKNOWN IMPORTED)
+set_target_properties(WCUDA::cudart PROPERTIES
+    INTERFACE_INCLUDE_DIRECTORIES "${CUDA_ROOT}/include"
+    IMPORTED_LOCATION "${CUDA_ROOT}/lib/libcuda.so"
+)
+target_include_directories(WCUDA::cudart INTERFACE
+    "${MACA_ROOT}/include"
+    "${MACA_ROOT}/include/mcr"
+)
+
+target_link_libraries(storedevice PUBLIC storeinfra WCUDA::cudart)
+target_compile_options(storedevice PRIVATE -Wall -fPIC -std=c++17)
diff --git a/ucm/store/device/maca/maca_device.cu b/ucm/store/device/maca/maca_device.cu