
Commit d8a152f

Sugon: support DCU inference
1 parent d7bda2a commit d8a152f

8 files changed: +112 −21 lines changed

src/devices/cuda/common_cuda.h

Lines changed: 5 additions & 0 deletions
@@ -1,7 +1,12 @@
 #ifndef __COMMON_CUDA_H__
 #define __COMMON_CUDA_H__
 
+#ifdef ENABLE_SUGON_DCU
+#define MAX_THREADS_PER_BLOCK 512
+#else
 #define MAX_THREADS_PER_BLOCK 1024
+#endif
+
 #define MAX_WARP_PER_BLOCK 32
 #define WARP_SIZE 32

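The DCU build lowers MAX_THREADS_PER_BLOCK to 512, presumably to match the device's smaller per-block thread limit, and the kernels touched by this commit add __launch_bounds__(MAX_THREADS_PER_BLOCK) so the compiler is told the same cap. A minimal sketch of the pattern (the scale kernel and launch_scale helper below are hypothetical, not from the repo), assuming common_cuda.h is on the include path:

// Hypothetical kernel, only to illustrate how the cap above is meant to be used:
// the same constant bounds both the __launch_bounds__ annotation and the launch
// shape, so one source builds with 1024-thread blocks on NVIDIA GPUs and
// 512-thread blocks on the Sugon DCU.
#include "common_cuda.h"

__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void scale(float *x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { x[i] *= s; }
}

void launch_scale(float *x, float s, int n, cudaStream_t stream) {
    int block = MAX_THREADS_PER_BLOCK;
    int grid = (n + block - 1) / block; // same ceiling division that ROUND_UP_DIV spells out
    scale<<<grid, block, 0, stream>>>(x, s, n);
}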
src/ops/causal_softmax/cuda/causal_softmax.cu

Lines changed: 33 additions & 3 deletions
@@ -16,6 +16,12 @@ struct AttentionCausualMask {
     }
 };
 
+struct MaxOp {
+    __device__ float operator()(const float a, const float b) const {
+        return a > b ? a : b;
+    }
+};
+
 template<unsigned int BLOCK_SIZE, class Tdata, class Tmask>
 static __device__ void block_padding(
     Tdata *__restrict__ att,
@@ -33,7 +39,12 @@ static __device__ void block_padding(
 
     __shared__ float max;
     {
+#ifdef ENABLE_SUGON_DCU
+        MaxOp max_op;
+        auto acc = block_op.Reduce(thread_data, max_op, total_seq_len);
+#else
         auto acc = block_op.Reduce(thread_data, cub::Max(), total_seq_len);
+#endif
         if (threadIdx.x == 0) { max = acc; }
     }
     __syncthreads();
@@ -67,7 +78,12 @@ static __device__ void block_folding(
         thread_data[i] = att_idx < total_seq_len && mask(token_idx, seq_len, att_idx, total_seq_len)
                              ? float(att[i])
                              : -__FLT_MAX__;
+#ifdef ENABLE_SUGON_DCU
+        MaxOp max_op;
+        thread_max = max_op(thread_max, thread_data[i]);
+#else
         thread_max = cub::Max()(thread_max, thread_data[i]);
+#endif
     }
 
     using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
@@ -76,7 +92,12 @@ static __device__ void block_folding(
 
     __shared__ float max;
     {
+#ifdef ENABLE_SUGON_DCU
+        MaxOp max_op;
+        auto acc = block_op.Reduce(thread_max, max_op);
+#else
         auto acc = block_op.Reduce(thread_max, cub::Max());
+#endif
         if (threadIdx.x == 0) { max = acc; }
     }
     __syncthreads();
@@ -130,7 +151,7 @@ static __forceinline__ __device__ void folding(
 }
 
 template<unsigned int BLOCK_SIZE, class Tdata>
-__global__ void fused_softmax_padding(
+__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_padding(
     Tdata *__restrict__ att,
     unsigned int const stride_x,
     unsigned int const stride_y,
@@ -140,7 +161,7 @@ __global__ void fused_softmax_padding(
 }
 
 template<unsigned int BLOCK_SIZE, unsigned int ITEMS_PER_THREAD, class Tdata>
-__global__ void fused_softmax_folding(
+__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_folding(
     Tdata *__restrict__ att,
     unsigned int const stride_x,
     unsigned int const stride_y,
@@ -152,7 +173,7 @@ __global__ void fused_softmax_folding(
 }
 
 template<unsigned int BLOCK_SIZE, class Tdata>
-__global__ void fused_softmax_standard(
+__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void fused_softmax_standard(
     Tdata *__restrict__ att_,
     unsigned int const stride_x,
     unsigned int const stride_y,
@@ -183,7 +204,12 @@ __global__ void fused_softmax_standard(
     __syncthreads();
     // Block reduce max
    {
+#ifdef ENABLE_SUGON_DCU
+        MaxOp max_op;
+        auto acc = block_op.Reduce(partial, max_op);
+#else
         auto acc = block_op.Reduce(partial, cub::Max());
+#endif
         if (threadIdx.x == 0) { max_ = acc; }
     }
     __syncthreads();
@@ -200,7 +226,11 @@ __global__ void fused_softmax_standard(
 
     // Block reduce sum
     {
+#ifdef ENABLE_SUGON_DCU
+        auto acc = block_op.Sum(partial);
+#else
         auto acc = block_op.Reduce(partial, cub::Sum());
+#endif
         if (threadIdx.x == 0) { sum_ = acc; }
     }
     __syncthreads();

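On the DCU build the cub::Max() function object is replaced by the hand-rolled MaxOp defined above, presumably because the CUB version shipped with the DCU toolkit does not expose cub::Max; any functor with a matching __device__ operator() is accepted as the reduction operator of cub::BlockReduce::Reduce. A standalone sketch of that substitution (block_max is a made-up example kernel, not part of this operator):

#include <cub/cub.cuh>

struct MaxOp {
    __device__ float operator()(const float a, const float b) const {
        return a > b ? a : b;
    }
};

// Block-wide max reduction using the custom functor in place of cub::Max().
template<unsigned int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE) __global__ void block_max(const float *in, float *out) {
    using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
    __shared__ typename BlockOp::TempStorage temp_storage;
    float acc = BlockOp(temp_storage).Reduce(in[threadIdx.x], MaxOp());
    if (threadIdx.x == 0) { *out = acc; } // only thread 0 receives the valid aggregate
}

For example, block_max<256><<<1, 256>>>(d_in, d_out) reduces a 256-element input in one block.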
src/ops/matmul/cuda/matmul_cuda.cu

Lines changed: 19 additions & 1 deletion
@@ -13,20 +13,38 @@ infiniopStatus_t matmul_cuda(MatmulCudaDescriptor_t desc, void *c, float beta, v
         std::swap(a, b);
     }
 
+
+
+#ifdef ENABLE_SUGON_DCU
+    float alpha_, beta_;
+#else
     Tdata alpha_, beta_;
+#endif
     cudaDataType a_type, b_type, c_type;
     cublasComputeType_t compute_type;
-
     if constexpr (std::is_same<Tdata, half>::value) {
+#ifdef ENABLE_SUGON_DCU
+        alpha_ = alpha;
+        beta_ = beta;
+#else
         alpha_ = __float2half(alpha);
         beta_ = __float2half(beta);
+#endif
         a_type = b_type = c_type = CUDA_R_16F;
+#ifdef ENABLE_SUGON_DCU
+        compute_type = CUBLAS_COMPUTE_32F;
+#else
         compute_type = CUBLAS_COMPUTE_16F;
+#endif
     } else {
        alpha_ = alpha;
        beta_ = beta;
        a_type = b_type = c_type = CUDA_R_32F;
+#ifdef ENABLE_SUGON_DCU
+        compute_type = CUBLAS_COMPUTE_32F;
+#else
        compute_type = CUBLAS_COMPUTE_32F_FAST_TF32;
+#endif
     }
 
     auto op_a = info.a_matrix.row_stride == 1 ? CUBLAS_OP_N : CUBLAS_OP_T;

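For half inputs the DCU branch keeps alpha_ and beta_ as float and forces CUBLAS_COMPUTE_32F instead of CUBLAS_COMPUTE_16F; cuBLAS reads the scaling factors in the type implied by the compute type, so FP32 accumulation requires float scalars. The actual GEMM call sits outside this hunk, so the sketch below is not copied from matmul_cuda.cu; it only illustrates the cublasGemmEx convention the hunk is preparing for (function name and layout are assumptions):

#include <cublas_v2.h>
#include <cuda_fp16.h>

// Hedged sketch: half inputs/outputs with FP32 accumulation. With
// CUBLAS_COMPUTE_32F the alpha/beta pointers are interpreted as float*,
// which is why the DCU branch above leaves them as float even when
// a_type/b_type/c_type are CUDA_R_16F.
cublasStatus_t gemm_f16_f32acc(cublasHandle_t handle,
                               int m, int n, int k,
                               const half *a, int lda,
                               const half *b, int ldb,
                               half *c, int ldc,
                               float alpha, float beta) {
    return cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                        m, n, k,
                        &alpha, a, CUDA_R_16F, lda,
                        b, CUDA_R_16F, ldb,
                        &beta, c, CUDA_R_16F, ldc,
                        CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
}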
src/ops/random_sample/cuda/random_sample.cu

Lines changed: 7 additions & 7 deletions
@@ -5,7 +5,7 @@
 #include <cub/cub.cuh>
 
 template<class T, int BLOCK_DIM>
-__global__ void softmax(
+__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void softmax(
     T *val_out,
     int topk,
     float temperature, int voc) {
@@ -29,14 +29,14 @@ __global__ void softmax(
     }
 }
 
-__global__ void index(uint64_t *key_in, int voc) {
+__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void index(uint64_t *key_in, int voc) {
     int ind = threadIdx.x + blockIdx.x * blockDim.x;
     if (ind < voc) {
         key_in[ind] = static_cast<uint64_t>(ind);
     }
 }
 template<class T>
-__global__ void random_sample_kernel(uint64_t *result,
+__launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void random_sample_kernel(uint64_t *result,
                                      T *val_out,
                                      float random_val,
                                      float topp,
@@ -119,7 +119,9 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
     uint64_t *key_in = (uint64_t *) keyTmp;
     uint64_t *key_out = key_in + voc;
 
-    index<<<(voc + 1023) / 1024, 1024, 0, (cudaStream_t) stream>>>(key_in, voc);
+    int block_dim = MAX_THREADS_PER_BLOCK;
+    int num_blocks = ROUND_UP_DIV(voc, block_dim);
+    index<<<num_blocks, block_dim, 0, (cudaStream_t) stream>>>(key_in, voc);
     // compute the required workspace sizes below
     size_t size_radix_sort;
     size_t size_scan;
@@ -134,9 +136,7 @@ void random_sample_nv_gpu_f16(RandomSampleCudaDescriptor_t desc, void *workspace
                         voc, (cudaStream_t) stream);// this call stores the sorted values and their indices in val_out and key_out
     // sorting done, now apply the softmax transform
     if (topp > 0 && topk > 1) {
-        int BLOCK_DIM = 1024;
-        int num_blocks = (voc + BLOCK_DIM - 1) / BLOCK_DIM;
-        softmax<half, 1024><<<num_blocks, BLOCK_DIM, 0, (cudaStream_t) stream>>>(val_out, topk,
+        softmax<half, MAX_THREADS_PER_BLOCK><<<num_blocks, block_dim, 0, (cudaStream_t) stream>>>(val_out, topk,
                                                                                  temperature, voc);

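The launch configuration now derives from MAX_THREADS_PER_BLOCK via ROUND_UP_DIV instead of the hard-coded (voc + 1023) / 1024, so the grid automatically widens when the DCU's 512-thread cap is in effect. ROUND_UP_DIV itself is not shown in this diff; it is presumably the usual ceiling-division helper, along the lines of:

// Assumed definition (hypothetical, not copied from the repo's utils header):
#define ROUND_UP_DIV(x, y) (((x) + (y) - 1) / (y))

// Example: for a 32000-entry vocabulary, ROUND_UP_DIV(32000, 512) = 63 blocks of 512 threads
// on the DCU, versus (32000 + 1023) / 1024 = 32 blocks of 1024 threads before this change.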
src/ops/rearrange/cuda/rearrange.cu

Lines changed: 5 additions & 4 deletions
@@ -1,8 +1,9 @@
 #include "../../../devices/cuda/common_cuda.h"
 #include "rearrange.cuh"
+#include "../../utils.h"
 
 template<class Tmem>
-static __global__ void rearrange(
+static __launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void rearrange(
     void *__restrict__ dst,
     int const rsa,
     int const csa,
@@ -35,9 +36,9 @@ void rearrange_nv_gpu(RearrangeCudaDescriptor_t desc, void *y, void const *x, vo
         return;
     }
 
-    auto warps = 1024 / WARP_SIZE;
-    auto grid = dim3((c + warps - 1) / warps, r);
-    auto block = dim3(WARP_SIZE, (c + grid.x - 1) / grid.x);
+    auto warps = MAX_THREADS_PER_BLOCK / WARP_SIZE;
+    auto grid = dim3(ROUND_UP_DIV(c, warps), r);
+    auto block = dim3(WARP_SIZE, ROUND_UP_DIV(c, grid.x));
     dst_rs /= unit;
     dst_cs /= unit;
     src_rs /= unit;

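The block geometry here is a warp-wide x dimension times a y dimension of rows per block, so deriving the warp count from MAX_THREADS_PER_BLOCK keeps the product under the DCU's 512-thread limit. A worked example with purely illustrative numbers (c = 100 is not taken from the repo):

// With c = 100 and MAX_THREADS_PER_BLOCK = 512 on the DCU:
//   warps   = 512 / 32                    = 16
//   grid.x  = ROUND_UP_DIV(100, 16)       = 7
//   block   = (32, ROUND_UP_DIV(100, 7))  = (32, 15)  -> 480 threads per block
// The old hard-coded 1024 would give block = (32, 25) = 800 threads, over the DCU's 512 cap.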
src/ops/rms_norm/cuda/rms_norm.cu

Lines changed: 10 additions & 3 deletions
@@ -6,7 +6,7 @@
 
 // assert BLOCK_SIZE >= blockDim.x
 template<unsigned int BLOCK_SIZE, class Tdata, class Wdata>
-static __global__ void rms_norm_padding(
+__launch_bounds__(MAX_THREADS_PER_BLOCK) static __global__ void rms_norm_padding(
     Tdata *__restrict__ o_,
     unsigned int const stride_y,
     Tdata const *__restrict__ x_,
@@ -19,8 +19,11 @@ static __global__ void rms_norm_padding(
 
     using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
     __shared__ typename BlockOp::TempStorage temp_storage;
+#ifdef ENABLE_SUGON_DCU
+    auto acc = BlockOp(temp_storage).Sum(x * x);
+#else
     auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum());
-
+#endif
     __shared__ Tdata rms;
     if (threadIdx.x == 0) {
         rms = Tdata(rsqrtf(acc / float(blockDim.x) + epsilon));
@@ -31,7 +34,7 @@ static __global__ void rms_norm_padding(
 }
 
 template<unsigned int BLOCK_SIZE, unsigned int ITEMS_PER_THREAD, class Tdata, class Wdata>
-static __global__ void rms_norm_folding(
+__launch_bounds__(MAX_THREADS_PER_BLOCK) static __global__ void rms_norm_folding(
     Tdata *__restrict__ y,
     unsigned int const stride_y,
     Tdata const *__restrict__ x,
@@ -59,7 +62,11 @@ static __global__ void rms_norm_folding(
     {
         using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
         __shared__ typename BlockOp::TempStorage temp_storage;
+#ifdef ENABLE_SUGON_DCU
+        acc = BlockOp(temp_storage).Sum(squared);
+#else
         acc = BlockOp(temp_storage).Reduce(squared, cub::Sum());
+#endif
     }
 
     __shared__ Tdata rms;

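Here the DCU branch calls BlockReduce::Sum instead of Reduce with cub::Sum(); the two compute the same block-wide sum, Sum() being CUB's built-in specialization, so the branch simply avoids naming cub::Sum without changing the result. A minimal sketch (block_sum_sq is a made-up example kernel, not the rms_norm code):

#include <cub/cub.cuh>

// Sums the squares of one value per thread across the block, the core of the RMS computation.
template<unsigned int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE) __global__ void block_sum_sq(const float *in, float *out) {
    using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
    __shared__ typename BlockOp::TempStorage temp_storage;
    float x = in[threadIdx.x];
    float acc = BlockOp(temp_storage).Sum(x * x); // equivalent to Reduce(x * x, cub::Sum())
    if (threadIdx.x == 0) { *out = acc; }
}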
src/ops/swiglu/cuda/swiglu.cu

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ inline int gcd(int a, int b) {
 }
 
 template<class Tdata>
-static __global__ void swiglu(
+static __launch_bounds__(MAX_THREADS_PER_BLOCK) __global__ void swiglu(
     Tdata *__restrict__ c,
     int const stride_c,
     Tdata const *__restrict__ a,

xmake.lua

Lines changed: 32 additions & 2 deletions
@@ -40,6 +40,14 @@ option("ascend-npu")
     add_defines("ENABLE_ASCEND_NPU")
 option_end()
 
+option("sugon-dcu")
+    set_default(false)
+    set_showmenu(true)
+    set_description("Enable or disable Sugon DCU kernel")
+    add_defines("ENABLE_SUGON_DCU")
+    add_defines("ENABLE_NV_GPU")
+option_end()
+
 if is_mode("debug") then
     add_cxflags("-g -O0")
     add_defines("DEBUG_MODE")
@@ -66,9 +74,11 @@ if has_config("cpu") then
 
 end
 
-if has_config("nv-gpu") then
-
+if has_config("nv-gpu", "sugon-dcu") then
     add_defines("ENABLE_NV_GPU")
+    if has_config("sugon-dcu") then
+        add_defines("ENABLE_SUGON_DCU")
+    end
     local CUDA_ROOT = os.getenv("CUDA_ROOT") or os.getenv("CUDA_HOME") or os.getenv("CUDA_PATH")
     local CUDNN_ROOT = os.getenv("CUDNN_ROOT") or os.getenv("CUDNN_HOME") or os.getenv("CUDNN_PATH")
     if CUDA_ROOT ~= nil then
@@ -212,6 +222,11 @@ if has_config("ascend-npu") then
     target_end()
 end
 
+
+toolchain("sugon-dcu-linker")
+    set_toolset("sh", "nvcc")
+toolchain_end()
+
 target("infiniop")
     set_kind("shared")
 
@@ -221,6 +236,21 @@ target("infiniop")
     if has_config("nv-gpu") then
         add_deps("nv-gpu")
     end
+    if has_config("sugon-dcu") then
+        local builddir = string.format(
+            "build/%s/%s/%s",
+            get_config("plat"),
+            get_config("arch"),
+            get_config("mode")
+        )
+        add_shflags("-s", "-shared", "-fPIC")
+        add_links("cublas", "cudnn", "cudadevrt", "cudart_static", "rt", "pthread", "dl")
+        -- Using -lnv-gpu will fail, manually link the target using full path
+        add_deps("nv-gpu", {inherit = false})
+        add_links(builddir.."/libnv-gpu.a")
+        set_toolchains("sugon-dcu-linker")
+    end
+
     if has_config("cambricon-mlu") then
         add_deps("cambricon-mlu")
     end
