Skip to content

Commit 8a573d7

Browse files
committed
feat: add MetaX MACA device support
1 parent c87d6ef commit 8a573d7

File tree

7 files changed

+417
-0
lines changed

7 files changed

+417
-0
lines changed

setup.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ def _is_npu() -> bool:
4949
def _is_musa() -> bool:
5050
return PLATFORM == "musa"
5151

52+
def _is_maca() -> bool:
53+
return PLATFORM == "maca"
54+
5255

5356
class CMakeExtension(Extension):
5457
def __init__(self, name: str, sourcedir: str = ""):
@@ -91,6 +94,9 @@ def build_cmake(self, ext: CMakeExtension):
9194
cmake_args.append("-DRUNTIME_ENVIRONMENT=ascend")
9295
elif _is_musa():
9396
cmake_args.append("-DRUNTIME_ENVIRONMENT=musa")
97+
elif _is_maca():
98+
cmake_args.append("-DRUNTIME_ENVIRONMENT=maca")
99+
cmake_args.append("-DBUILD_UCM_SPARSE=OFF")
94100
else:
95101
raise RuntimeError(
96102
"No supported accelerator found. "

ucm/shared/trans/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
if(RUNTIME_ENVIRONMENT STREQUAL "ascend")
22
add_subdirectory(ascend)
33
endif()
4+
if(RUNTIME_ENVIRONMENT STREQUAL "maca")
5+
add_subdirectory(maca)
6+
endif()
47
if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
58
add_subdirectory(cuda)
69
endif()
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
set(CUDA_ROOT "/opt/maca/tools/cu-bridge" CACHE PATH "Path to WCUDA root directory")
2+
set(CMAKE_CUDA_COMPILER ${CUDA_ROOT}/bin/cucc)
3+
list(APPEND CMAKE_MODULE_PATH "${CUDA_ROOT}/cmake_module/maca")
4+
enable_language(CUDA)
5+
add_library(kernel OBJECT maca_sm_kernel.cu)
6+
target_compile_options(kernel PRIVATE
7+
-Wall -fPIC
8+
-std=c++17
9+
)
10+
add_library(trans STATIC
11+
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc
12+
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc
13+
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc
14+
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc
15+
)
16+
17+
add_library(WCUDA::cudart UNKNOWN IMPORTED)
18+
set_target_properties(WCUDA::cudart PROPERTIES
19+
INTERFACE_INCLUDE_DIRECTORIES "${CUDA_ROOT}/include"
20+
IMPORTED_LOCATION "${CUDA_ROOT}/lib/libcuda.so"
21+
)
22+
target_include_directories(WCUDA::cudart INTERFACE
23+
/opt/maca/include
24+
/opt/maca/include/mcr
25+
)
26+
27+
target_include_directories(trans PUBLIC ${CUDA_ROOT}/include)
28+
target_link_directories(trans PUBLIC ${CUDA_ROOT}/lib64)
29+
target_link_libraries(trans PUBLIC
30+
fmt
31+
WCUDA::cudart
32+
kernel
33+
)
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/**
2+
* MIT License
3+
*
4+
* Copyright (c) 2025 MetaX Integrated Circuits (Shanghai) Co., Ltd.
5+
* All rights reserved.
6+
*
7+
* Permission is hereby granted, free of charge, to any person obtaining a copy
8+
* of this software and associated documentation files (the "Software"), to deal
9+
* in the Software without restriction, including without limitation the rights
10+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
* copies of the Software, and to permit persons to whom the Software is
12+
* furnished to do so, subject to the following conditions:
13+
*
14+
* The above copyright notice and this permission notice shall be included in all
15+
* copies or substantial portions of the Software.
16+
*
17+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
* SOFTWARE.
24+
* */
25+
#include <cstdint>
26+
#include "../cuda/cuda_sm_kernel.h"
27+
28+
namespace UC::Trans {
29+
30+
#define CUDA_TRANS_UNIT_SIZE (sizeof(uint4) * 2)
31+
#define CUDA_TRANS_BLOCK_NUMBER (32)
32+
#define CUDA_TRANS_BLOCK_SIZE (256)
33+
#define CUDA_TRANS_THREAD_NUMBER (CUDA_TRANS_BLOCK_NUMBER * CUDA_TRANS_BLOCK_SIZE)
34+
35+
inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
36+
volatile uint8_t* __restrict__ dst)
37+
{
38+
const uint4* src4 = reinterpret_cast<const uint4*>(src);
39+
uint4 lo = __ldcs(src4);
40+
uint4 hi = __ldcs(src4 + 1);
41+
42+
uint8_t* nv_dst = const_cast<uint8_t*>(dst);
43+
uint4* dst4 = reinterpret_cast<uint4*>(nv_dst);
44+
__stcg(dst4, lo);
45+
__stcg(dst4 + 1, hi);
46+
}
47+
48+
__global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num)
49+
{
50+
auto length = size * num;
51+
auto offset = (blockIdx.x * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
52+
while (offset + CUDA_TRANS_UNIT_SIZE <= length) {
53+
auto idx = offset / size;
54+
auto off = offset % size;
55+
auto host = ((const uint8_t*)src[idx]) + off;
56+
auto device = ((uint8_t*)dst[idx]) + off;
57+
CudaCopyUnit(host, device);
58+
offset += CUDA_TRANS_THREAD_NUMBER * CUDA_TRANS_UNIT_SIZE;
59+
}
60+
}
61+
62+
__global__ void CudaCopyKernel(const void** src, void* dst, size_t size, size_t num)
63+
{
64+
auto length = size * num;
65+
auto offset = (blockIdx.x * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
66+
while (offset + CUDA_TRANS_UNIT_SIZE <= length) {
67+
auto idx = offset / size;
68+
auto off = offset % size;
69+
auto host = ((const uint8_t*)src[idx]) + off;
70+
auto device = ((uint8_t*)dst) + offset;
71+
CudaCopyUnit(host, device);
72+
offset += CUDA_TRANS_THREAD_NUMBER * CUDA_TRANS_UNIT_SIZE;
73+
}
74+
}
75+
76+
__global__ void CudaCopyKernel(const void* src, void** dst, size_t size, size_t num)
77+
{
78+
auto length = size * num;
79+
auto offset = (blockIdx.x * blockDim.x + threadIdx.x) * CUDA_TRANS_UNIT_SIZE;
80+
while (offset + CUDA_TRANS_UNIT_SIZE <= length) {
81+
auto idx = offset / size;
82+
auto off = offset % size;
83+
auto host = ((const uint8_t*)src) + offset;
84+
auto device = ((uint8_t*)dst[idx]) + off;
85+
CudaCopyUnit(host, device);
86+
offset += CUDA_TRANS_THREAD_NUMBER * CUDA_TRANS_UNIT_SIZE;
87+
}
88+
}
89+
90+
cudaError_t CudaSMCopyAsync(void* src[], void* dst[], size_t size, size_t number,
91+
cudaStream_t stream)
92+
{
93+
CudaCopyKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(src, dst, size,
94+
number);
95+
return cudaGetLastError();
96+
}
97+
98+
cudaError_t CudaSMCopyAsync(void* src[], void* dst, size_t size, size_t number, cudaStream_t stream)
99+
{
100+
CudaCopyKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(
101+
(const void**)src, dst, size, number);
102+
return cudaGetLastError();
103+
}
104+
105+
cudaError_t CudaSMCopyAsync(void* src, void* dst[], size_t size, size_t number, cudaStream_t stream)
106+
{
107+
CudaCopyKernel<<<CUDA_TRANS_BLOCK_NUMBER, CUDA_TRANS_BLOCK_SIZE, 0, stream>>>(src, dst, size,
108+
number);
109+
return cudaGetLastError();
110+
}
111+
112+
} // namespace UC::Trans

ucm/store/device/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ if(RUNTIME_ENVIRONMENT STREQUAL "ascend")
22
add_subdirectory(ascend)
33
elseif(RUNTIME_ENVIRONMENT STREQUAL "musa")
44
add_subdirectory(musa)
5+
elseif(RUNTIME_ENVIRONMENT STREQUAL "maca")
6+
add_subdirectory(maca)
57
elseif(RUNTIME_ENVIRONMENT STREQUAL "cuda")
68
add_subdirectory(cuda)
79
elseif(RUNTIME_ENVIRONMENT STREQUAL "simu")
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
set(CUDA_ROOT "/opt/maca/tools/cu-bridge" CACHE PATH "Path to WCUDA root directory")
2+
set(CMAKE_CUDA_COMPILER ${CUDA_ROOT}/bin/cucc)
3+
list(APPEND CMAKE_MODULE_PATH "${CUDA_ROOT}/cmake_module/maca")
4+
set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90)
5+
enable_language(CUDA)
6+
7+
add_library(storedevice STATIC maca_device.cu)
8+
9+
add_library(WCUDA::cudart UNKNOWN IMPORTED)
10+
set_target_properties(WCUDA::cudart PROPERTIES
11+
INTERFACE_INCLUDE_DIRECTORIES "${CUDA_ROOT}/include"
12+
IMPORTED_LOCATION "${CUDA_ROOT}/lib/libcuda.so"
13+
)
14+
target_include_directories(WCUDA::cudart INTERFACE
15+
/opt/maca/include
16+
/opt/maca/include/mcr
17+
)
18+
19+
target_link_libraries(storedevice PUBLIC storeinfra WCUDA::cudart)
20+
target_compile_options(storedevice PRIVATE -Wall -fPIC -std=c++17)

0 commit comments

Comments
 (0)