diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..0ebeafc4 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -3,8 +3,10 @@ #include "ops/attention/attention.h" #include "ops/avg_pool/avg_pool.h" #include "ops/causal_softmax/causal_softmax.h" +#include "ops/clip/clip.h" #include "ops/global_avg_pool/global_avg_pool.h" #include "ops/expand/expand.h" +#include "ops/gather/gather.h" #include "ops/gemm/gemm.h" #include "ops/conv/conv.h" #include "ops/matmul/matmul.h" @@ -12,8 +14,12 @@ #include "ops/mlp/mlp.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" +#include "ops/reduce_max/reduce_max.h" +#include "ops/reduce_mean/reduce_mean.h" +#include "ops/reduce_min/reduce_min.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" +#include "ops/where/where.h" #include "tensor/tensor_descriptor.h" diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h new file mode 100644 index 00000000..40428cba --- /dev/null +++ b/include/ops/clip/clip.h @@ -0,0 +1,28 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "../../export.h" +#include "../../operators.h" +#include + +typedef struct ClipDescriptor { + Device device; +} ClipDescriptor; + +typedef ClipDescriptor *infiniopClipDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + float* lower_bound, + float* upper_bound); + +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *y, + void const *x, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); + +#endif diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h new file mode 100644 index 00000000..f9c48f74 --- /dev/null +++ b/include/ops/gather/gather.h @@ -0,0 +1,28 @@ +#ifndef GATHER_H +#define GATHER_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GatherDescriptor { + Device device; +} GatherDescriptor; + +typedef GatherDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + uint64_t axis); + +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *output, + void const *input, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_max/reduce_max.h b/include/ops/reduce_max/reduce_max.h new file mode 100644 index 00000000..f0b80f36 --- /dev/null +++ b/include/ops/reduce_max/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMaxDescriptor { + Device device; +} ReduceMaxDescriptor; + +typedef ReduceMaxDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim); + +__C __export infiniopStatus_t 
infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + + +#endif diff --git a/include/ops/reduce_mean/reduce_mean.h b/include/ops/reduce_mean/reduce_mean.h new file mode 100644 index 00000000..cfeb933f --- /dev/null +++ b/include/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMeanDescriptor { + Device device; +} ReduceMeanDescriptor; + +typedef ReduceMeanDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim); + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + + +#endif diff --git a/include/ops/reduce_min/reduce_min.h b/include/ops/reduce_min/reduce_min.h new file mode 100644 index 00000000..32e7ae3c --- /dev/null +++ b/include/ops/reduce_min/reduce_min.h @@ -0,0 +1,27 @@ +#ifndef REDUCE_MIN_H +#define REDUCE_MIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMinDescriptor { + Device device; +} ReduceMinDescriptor; + +typedef ReduceMinDescriptor *infiniopReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim); + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc); + + +#endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h new file mode 100644 index 00000000..3245e9b2 --- /dev/null +++ b/include/ops/where/where.h @@ -0,0 +1,29 @@ +#ifndef WHERE_H +#define WHERE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct WhereDescriptor { + Device device; +} WhereDescriptor; + +typedef WhereDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t condition); + +__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *output, + void const *x, + void const *y, + void const *condition, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git 
a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py new file mode 100644 index 00000000..b72004c7 --- /dev/null +++ b/operatorspy/tests/clip.py @@ -0,0 +1,233 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args, synchronize_device +import torch + +PROFILE = True +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class ClipDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopClipDescriptor_t = POINTER(ClipDescriptor) + +def clip(x, lower_bound, upper_bound): + return torch.clamp_max(torch.clamp_min(x, lower_bound if lower_bound else torch.finfo(x.dtype).min), upper_bound if upper_bound else torch.finfo(x.dtype).max) + + +def test( + lib, + handle, + torch_device, + x_shape, + lower_bound, + upper_bound, + dtype=torch.float16, +): + print( + f"Testing Clip on {torch_device} with x_shape:{x_shape} lower_bound:{lower_bound} upper_bound:{upper_bound} dtype:{dtype}" + ) + + x = torch.randn(x_shape, dtype=dtype, device=torch_device) + ans = clip(x, lower_bound, upper_bound) + y = torch.zeros(ans.shape, dtype=dtype, device=torch_device) + + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + + + descriptor = infiniopClipDescriptor_t() + check_error( + lib.infiniopCreateClipDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + ctypes.byref(c_float(lower_bound)) if lower_bound else None, + ctypes.byref(c_float(upper_bound)) if upper_bound else None, + ) + ) + + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + + check_error( + lib.infiniopClip( + descriptor, + y_tensor.data, + x_tensor.data, + None, + ) + ) + + assert torch.allclose(y, ans, atol=0, rtol=0) + # ans_ = ans.cpu().numpy().flatten() + # y_ = y.cpu().numpy().flatten() + # print(ans_) + # print(y_) + # atol = max(abs(ans_ - y_)) + # rtol = atol / max(abs(y_) + 1e-8) + + # print(f"atol: {atol}, rtol: {rtol}") + + if PROFILE: + for i in range(NUM_PRERUN): + _ = clip(x, lower_bound, upper_bound) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = clip(x, lower_bound, upper_bound) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + for i in range(NUM_PRERUN): + check_error( + lib.infiniopClip( + descriptor, + y_tensor.data, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopClip( + descriptor, + y_tensor.data, + x_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + + check_error(lib.infiniopDestroyClipDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + x_shape, + lower_bound, + upper_bound, + dtype, + ) in test_cases: + test( + lib, + handle, 
+ "cpu", + x_shape, + lower_bound, + upper_bound, + dtype, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + x_shape, + lower_bound, + upper_bound, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + x_shape, + lower_bound, + upper_bound, + dtype, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, lower_bound, upper_bound, test_dtype + ((2, 2), -0.1, 0.1, torch.float32), + ((2, 2), 0.1, -0.1, torch.float32), + # ((2, 2), None, None, torch.float32), + ((2, 2), None, 0.1, torch.float32), + ((2, 2), 0.1, None, torch.float32), + ((2048, 2048), -0.1, 0.1, torch.float32), + + ((2, 2), -0.1, 0.1, torch.float16), + ((2, 2), 0.1, -0.1, torch.float16), + # ((2, 2), None, None, torch.float16), + ((2, 2), None, 0.1, torch.float16), + ((2, 2), 0.1, None, torch.float16), + ((2048, 2048), -0.1, 0.1, torch.float16), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateClipDescriptor.restype = c_int32 + lib.infiniopCreateClipDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopClipDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(c_float), + POINTER(c_float), + ] + + + lib.infiniopClip.restype = c_int32 + lib.infiniopClip.argtypes = [ + infiniopClipDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyClipDescriptor.restype = c_int32 + lib.infiniopDestroyClipDescriptor.argtypes = [ + infiniopClipDescriptor_t, + ] + + if args.profile: + PROFILE = True + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if not (args.cpu or args.cuda): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py new file mode 100644 index 00000000..c90d9b7f --- /dev/null +++ b/operatorspy/tests/gather.py @@ -0,0 +1,241 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args, synchronize_device +import torch + +PROFILE = True +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class GatherDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGatherDescriptor_t = POINTER(GatherDescriptor) + +def gather(rank, axis, inputTensor, indexTensor): + indices = [slice(None)] * rank + indices[axis] = indexTensor + outTensor = inputTensor[tuple(indices)] + return outTensor + + +def test( + lib, + handle, + torch_device, + input_shape, + index_shape, + axis, + dtype=torch.float16, +): + print( + f"Testing Gather on {torch_device} with input_shape:{input_shape} indices_shape:{index_shape} axis:{axis} dtype:{dtype}" + ) + + input = torch.randn(input_shape, dtype=dtype, device=torch_device) + index = torch.randint(0, input.shape[axis], index_shape, device=torch_device).to(torch.int32) + ans = gather(len(input_shape), axis, input, index) + output = torch.zeros(ans.shape, dtype=dtype, device=torch_device) + + + input_tensor = to_tensor(input, lib) + index_tensor = to_tensor(index, lib) + output_tensor = to_tensor(output, lib) + + descriptor = 
infiniopGatherDescriptor_t() + check_error( + lib.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + index_tensor.descriptor, + axis, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + input_tensor.descriptor.contents.invalidate() + index_tensor.descriptor.contents.invalidate() + output_tensor.descriptor.contents.invalidate() + + + check_error( + lib.infiniopGather( + descriptor, + output_tensor.data, + input_tensor.data, + index_tensor.data, + None, + ) + ) + + assert torch.allclose(output, ans, atol=0, rtol=0) + # ans_ = ans.cpu().numpy().flatten() + # output_ = output.cpu().numpy().flatten() + # atol = max(abs(ans_ - output_)) + # rtol = atol / max(abs(output_) + 1e-8) + + # print(f"atol: {atol}, rtol: {rtol}") + + if PROFILE: + for i in range(NUM_PRERUN): + _ = gather(len(input_shape), axis, input, index) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = gather(len(input_shape), axis, input, index) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + for i in range(NUM_PRERUN): + check_error( + lib.infiniopGather( + descriptor, + output_tensor.data, + input_tensor.data, + index_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopGather( + descriptor, + output_tensor.data, + input_tensor.data, + index_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + + check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + input_shape, + index_shape, + axis, + dtype, + ) in test_cases: + test( + lib, + handle, + "cpu", + input_shape, + index_shape, + axis, + dtype, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + input_shape, + index_shape, + axis, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + input_shape, + index_shape, + axis, + dtype, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # input_shape , index_shape, axis, test_dtype + ((64, 64), (64, 64), 0, torch.float32), + ((64, 64), (64, 64), 1, torch.float32), + ((8, 8, 8, 8, 8), (8, 8), 0, torch.float32), + ((8, 8, 8, 8, 8), (8, 8), 2, torch.float32), + ((1024, 1024, 1024), (1, ), 1, torch.float32), + ((2048, 2048), (128, 128), 0, torch.float32), + ((2048, 2048), (128, 128), 1, torch.float32), + + ((64, 64), (64, 64), 0, torch.float16), + ((64, 64), (64, 64), 1, torch.float16), + ((8, 8, 8, 8, 8), (8, 8), 0, torch.float16), + ((8, 8, 8, 8, 8), (8, 8), 2, torch.float16), + ((1024, 1024, 1024), (1, ), 1, torch.float16), + ((2048, 2048), (128, 128), 0, torch.float16), + ((2048, 2048), (128, 128), 1, torch.float16), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_uint64, + ] + + + lib.infiniopGather.restype = c_int32 
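For reference, the gather implementations later in this diff (gather_cpu.cc and the CUDA gather.cc/gather.cu) flatten the input around the gather axis into pre_size × axis_size × post_size and address elements as pre * axis_size * post_size + index * post_size + post. The sketch below only illustrates that decomposition against the fancy-indexing reference used by this test; the helper name and shapes are arbitrary and not part of the library.

```python
import math
import torch

def gather_by_decomposition(x: torch.Tensor, indices: torch.Tensor, axis: int) -> torch.Tensor:
    # pre_size / axis_size / post_size mirror the fields computed in gather_cpu.cc
    pre_size = math.prod(x.shape[:axis])          # 1 when axis == 0
    axis_size = x.shape[axis]
    post_size = math.prod(x.shape[axis + 1:])     # 1 when axis is the last dim
    flat_x = x.reshape(pre_size, axis_size, post_size)
    flat_idx = indices.reshape(-1)                # indices_size entries
    out = flat_x[:, flat_idx, :]                  # (pre_size, indices_size, post_size)
    return out.reshape(*x.shape[:axis], *indices.shape, *x.shape[axis + 1:])

x = torch.randn(8, 8, 8)
idx = torch.randint(0, 8, (4, 2))
assert torch.equal(gather_by_decomposition(x, idx, 1), x[:, idx, :])
```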
+ lib.infiniopGather.argtypes = [ + infiniopGatherDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopGatherDescriptor_t, + ] + + if args.profile: + PROFILE = True + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if not (args.cpu or args.cuda): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_max.py b/operatorspy/tests/reduce_max.py new file mode 100644 index 00000000..ebc4bdae --- /dev/null +++ b/operatorspy/tests/reduce_max.py @@ -0,0 +1,217 @@ + +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +import math +import ctypes +from torch.nn import functional as F +from typing import List, Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ReduceMaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMaxDescriptor_t = POINTER(ReduceMaxDescriptor) + +def reduce_max(x, axes, keepdim=False): + return torch.amax(x, dim=axes, keepdim=keepdim) + +# convert a python tuple to a ctype void pointer +def tuple_to_int_p(py_tuple: Tuple): + array = ctypes.c_int * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int)) + + +def test( + lib, + handle, + torch_device, + x_shape, + axes, + keepdim, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMax on {torch_device} with x_shape: {x_shape}, axes:{axes}, keepdim:{keepdim}, dtype:{tensor_dtype}" + ) + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_max(x, axes, keepdim) + y = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + + # print(f'y_shape: {y.shape}') + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduce_max(x, axes, keepdim) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_max(x, axes, keepdim) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :8f}") + + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMaxDescriptor_t() + + # print("!") + + check_error( + lib.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_int_p(axes), + len(axes) + ) + ) + # print("!") + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMaxWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + + for i in range(NUM_PRERUN if 
PROFILE else 1): + check_error( + lib.infiniopReduceMax( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMax( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :8f}") + + assert torch.allclose(y, ans, atol=0, rtol=0) + # ans_ = ans.cpu().numpy().flatten() + # y_ = y.cpu().numpy().flatten() + # atol = max(abs(ans_ - y_)) + # rtol = atol / max(abs(y_) + 1e-8) + # print(f"ans: {ans_}") + # print(f"y: {y_}") + + # print(f"atol: {atol}, rtol: {rtol}") + check_error(lib.infiniopDestroyReduceMaxDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, axes, keepdim, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, axes, keepdim, tensor_dtype) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, axes, keepdim, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, axes, keepdim, tensor_dtype) + destroy_handle(lib, handle) + + + +if __name__ == "__main__": + test_cases = [ + # x_shape, axes, keepdim, dtype + ((2, 2, 3, 4), (0,), False, torch.float32), + ((2, 2, 3, 4), (0,), True, torch.float32), + ((2, 2, 3, 4), (1,), False, torch.float32), + ((2, 2, 3, 4), (1, 2), False, torch.float32), + ((2, 2, 3, 4), (1, 3), False, torch.float32), + ((2, 2, 3, 4), (0, 1, 2, 3), False, torch.float32), + ((2, 2, 3, 4, 5), (0, 3), False, torch.float32), + ((64, 64, 64, 64), (0, 2), False, torch.float32), + + ((2, 2, 3, 4), (0,), False, torch.float16), + ((2, 2, 3, 4), (0,), True, torch.float16), + ((2, 2, 3, 4), (1,), False, torch.float16), + ((2, 2, 3, 4), (1, 2), False, torch.float16), + ((2, 2, 3, 4), (1, 3), False, torch.float16), + ((2, 2, 3, 4), (0, 1, 2, 3), False, torch.float16), + ((2, 2, 3, 4, 5), (0, 3), False, torch.float16), + ((64, 64, 64, 64), (0, 2), False, torch.float16), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int), + c_uint64, + ] + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopReduceMaxDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopReduceMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if not (args.cpu or args.cuda): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py new file mode 100644 index 00000000..a7994c9a --- /dev/null +++ b/operatorspy/tests/reduce_mean.py @@ -0,0 +1,220 @@ + +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + 
infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +import math +import ctypes +from torch.nn import functional as F +from typing import List, Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ReduceMeanDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMeanDescriptor_t = POINTER(ReduceMeanDescriptor) + +def reduce_mean(x, axes, keepdim=False): + return torch.mean(x, dim=axes, keepdim=keepdim) + +# convert a python tuple to a ctype void pointer +def tuple_to_int_p(py_tuple: Tuple): + array = ctypes.c_int * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int)) + + +def test( + lib, + handle, + torch_device, + x_shape, + axes, + keepdim, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMean on {torch_device} with x_shape: {x_shape}, axes:{axes}, keepdim:{keepdim}, dtype:{tensor_dtype}" + ) + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_mean(x, axes, keepdim) + y = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + + # print(f'y_shape: {y.shape}') + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduce_mean(x, axes, keepdim) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_mean(x, axes, keepdim) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :8f}") + + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMeanDescriptor_t() + + # print("!") + + check_error( + lib.infiniopCreateReduceMeanDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_int_p(axes), + len(axes) + ) + ) + # print("!") + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMeanWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMean( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMean( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :8f}") + + if(tensor_dtype == torch.float16): + assert torch.allclose(y, ans, atol=0, rtol=1e-3) + else: + assert torch.allclose(y, ans, atol=0, rtol=1e-5) + # ans_ = ans.cpu().numpy().flatten() + # y_ = y.cpu().numpy().flatten() + # atol = max(abs(ans_ - y_)) + # rtol = atol / max(abs(y_) + 1e-8) + # # print(f"ans: {ans_}") + # # print(f"y: {y_}") + + # print(f"atol: {atol}, rtol: {rtol}") + check_error(lib.infiniopDestroyReduceMeanDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = 
DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, axes, keepdim, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, axes, keepdim, tensor_dtype) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, axes, keepdim, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, axes, keepdim, tensor_dtype) + destroy_handle(lib, handle) + + + +if __name__ == "__main__": + test_cases = [ + # x_shape, axes, keepdim, dtype + ((2, 2, 3, 4), (0,), False, torch.float32), + ((2, 2, 3, 4), (0,), True, torch.float32), + ((2, 2, 3, 4), (1,), False, torch.float32), + ((2, 2, 3, 4), (1, 2), False, torch.float32), + ((2, 2, 3, 4), (1, 3), False, torch.float32), + ((2, 2, 3, 4), (0, 1, 2, 3), False, torch.float32), + ((2, 2, 3, 4, 5), (0, 3), False, torch.float32), + ((64, 64, 64, 64), (0, 2), False, torch.float32), + + ((2, 2, 3, 4), (0,), False, torch.float16), + ((2, 2, 3, 4), (0,), True, torch.float16), + ((2, 2, 3, 4), (1,), False, torch.float16), + ((2, 2, 3, 4), (1, 2), False, torch.float16), + ((2, 2, 3, 4), (1, 3), False, torch.float16), + ((2, 2, 3, 4), (0, 1, 2, 3), False, torch.float16), + ((2, 2, 3, 4, 5), (0, 3), False, torch.float16), + ((64, 64, 64, 64), (0, 2), False, torch.float16), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMeanDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int), + c_uint64, + ] + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopReduceMeanDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopReduceMeanDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if not (args.cpu or args.cuda): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_min.py b/operatorspy/tests/reduce_min.py new file mode 100644 index 00000000..87f1eecc --- /dev/null +++ b/operatorspy/tests/reduce_min.py @@ -0,0 +1,220 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +import torch +import math +import ctypes +from torch.nn import functional as F +from typing import List, Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class ReduceMinDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMinDescriptor_t = POINTER(ReduceMinDescriptor) + +def reduce_min(x, axes, keepdim=False): + return torch.amin(x, dim=axes, keepdim=keepdim) + +# convert a python tuple to a ctype void pointer +def tuple_to_int_p(py_tuple: Tuple): + array = ctypes.c_int * len(py_tuple) + data_array = 
array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int)) + + +def test( + lib, + handle, + torch_device, + x_shape, + axes, + keepdim, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMin on {torch_device} with x_shape: {x_shape}, axes:{axes}, keepdim:{keepdim}, dtype:{tensor_dtype}" + ) + x = torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_min(x, axes, keepdim) + y = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + + # print(f'y_shape: {y.shape}') + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = reduce_min(x, axes, keepdim) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_min(x, axes, keepdim) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :8f}") + + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + descriptor = infiniopReduceMinDescriptor_t() + + # print("!") + + check_error( + lib.infiniopCreateReduceMinDescriptor( + handle, + ctypes.byref(descriptor), + y_tensor.descriptor, + x_tensor.descriptor, + tuple_to_int_p(axes), + len(axes) + ) + ) + # print("!") + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMinWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMin( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMin( + descriptor, + workspace_ptr, + workspaceSize, + y_tensor.data, + x_tensor.data, + None, + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :8f}") + + assert torch.allclose(y, ans, atol=0, rtol=0) + # ans_ = ans.cpu().numpy().flatten() + # y_ = y.cpu().numpy().flatten() + # atol = max(abs(ans_ - y_)) + # rtol = atol / max(abs(y_) + 1e-8) + # print(f"ans: {ans_}") + # print(f"y: {y_}") + + # print(f"atol: {atol}, rtol: {rtol}") + check_error(lib.infiniopDestroyReduceMinDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for x_shape, axes, keepdim, tensor_dtype in test_cases: + test(lib, handle, "cpu", x_shape, axes, keepdim, tensor_dtype) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for x_shape, axes, keepdim, tensor_dtype in test_cases: + test(lib, handle, "cuda", x_shape, axes, keepdim, tensor_dtype) + destroy_handle(lib, handle) + + + +if __name__ == "__main__": + test_cases = [ + # x_shape, axes, keepdim, dtype + ((2, 3, 4), (1,), False, torch.float32), + ((2, 3, 4), (1,), True, torch.float32), + ((2, 2, 3, 4), (0,), False, torch.float32), + ((2, 2, 3, 4), (0,), True, torch.float32), + ((2, 2, 3, 4), (1,), False, torch.float32), + ((2, 2, 3, 4), (1, 2), False, torch.float32), + ((2, 2, 3, 4), (1, 3), False, torch.float32), + ((2, 2, 3, 4), (0, 1, 2, 3), False, torch.float32), + ((2, 2, 3, 4, 5), (0, 3), False, torch.float32), + ((64, 64, 64, 
64), (0, 2), False, torch.float32), + + ((2, 3, 4), (1,), False, torch.float16), + ((2, 3, 4), (1,), True, torch.float16), + ((2, 2, 3, 4), (0,), False, torch.float16), + ((2, 2, 3, 4), (0,), True, torch.float16), + ((2, 2, 3, 4), (1,), False, torch.float16), + ((2, 2, 3, 4), (1, 2), False, torch.float16), + ((2, 2, 3, 4), (1, 3), False, torch.float16), + ((2, 2, 3, 4), (0, 1, 2, 3), False, torch.float16), + ((2, 2, 3, 4, 5), (0, 3), False, torch.float16), + ((64, 64, 64, 64), (0, 2), False, torch.float16), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMinDescriptor.restype = c_int32 + lib.infiniopCreateReduceMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int), + c_uint64, + ] + lib.infiniopReduceMin.restype = c_int32 + lib.infiniopReduceMin.argtypes = [ + infiniopReduceMinDescriptor_t, + c_void_p, + c_uint64, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyReduceMinDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMinDescriptor.argtypes = [ + infiniopReduceMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if not (args.cpu or args.cuda): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py new file mode 100644 index 00000000..ac2adc9d --- /dev/null +++ b/operatorspy/tests/where.py @@ -0,0 +1,253 @@ +from ctypes import POINTER, Structure, c_int32, c_uint64, c_void_p, c_float +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + CTensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, + rearrange_tensor, + create_workspace, +) + +from operatorspy.tests.test_utils import get_args, synchronize_device +import torch + +PROFILE = True +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class WhereDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopWhereDescriptor_t = POINTER(WhereDescriptor) + +def where(condition, x, y): + return torch.where(condition, x, y) + + +def test( + lib, + handle, + torch_device, + x_shape, + y_shape, + condition_shape, + dtype=torch.float16, +): + print( + f"Testing Where on {torch_device} with x_shape:{x_shape} y_shape:{y_shape} condition_shape:{condition_shape} dtype:{dtype}" + ) + + x = torch.randn(x_shape, dtype=dtype, device=torch_device) + y = torch.randn(y_shape, dtype=dtype, device=torch_device) + condition = torch.randint(0, 2, condition_shape, device=torch_device).to(dtype=torch.uint8) + ans = where(condition.to(torch.bool), x, y) + + output = torch.zeros(ans.shape, dtype=dtype, device=torch_device) + + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + condition_tensor = to_tensor(condition, lib) + output_tensor = to_tensor(output, lib) + + + descriptor = infiniopWhereDescriptor_t() + check_error( + lib.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + x_tensor.descriptor, + y_tensor.descriptor, + condition_tensor.descriptor, + ) + ) + + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + 
condition_tensor.descriptor.contents.invalidate() + output_tensor.descriptor.contents.invalidate() + + + check_error( + lib.infiniopWhere( + descriptor, + output_tensor.data, + x_tensor.data, + y_tensor.data, + condition_tensor.data, + None, + ) + ) + + assert torch.allclose(output, ans, atol=0, rtol=0) + # ans_ = ans.cpu().numpy().flatten() + # output_ = output.cpu().numpy().flatten() + # print(ans_) + # print(output_) + # atol = max(abs(ans_ - output_)) + # rtol = atol / max(abs(output_) + 1e-8) + + # print(f"atol: {atol}, rtol: {rtol}") + + if PROFILE: + for i in range(NUM_PRERUN): + _ = where(condition.to(torch.bool), x, y) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = where(condition.to(torch.bool), x, y) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" pytorch time: {elapsed * 1000 :6f} ms") + for i in range(NUM_PRERUN): + check_error( + lib.infiniopWhere( + descriptor, + output_tensor.data, + x_tensor.data, + y_tensor.data, + condition_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopWhere( + descriptor, + output_tensor.data, + x_tensor.data, + y_tensor.data, + condition_tensor.data, + None, + ) + ) + synchronize_device(torch_device) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed * 1000 :6f} ms") + + check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + + for ( + x_shape, + y_shape, + condition_shape, + dtype, + ) in test_cases: + test( + lib, + handle, + "cpu", + x_shape, + y_shape, + condition_shape, + dtype, + ) + + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + + for ( + x_shape, + y_shape, + condition_shape, + dtype, + ) in test_cases: + test( + lib, + handle, + "cuda", + x_shape, + y_shape, + condition_shape, + dtype, + ) + + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # x_shape, y_shape, condition_shape, dtype + ((2, 2), (2, 2), (2, 2), torch.float32), + ((10,), (10,), (10,), torch.float32), + ((1,), (2, 2), (2, 2), torch.float32), + ((2, 2), (1,), (2, 2), torch.float32), + ((2, 2), (2, 2), (1,), torch.float32), + ((1, ), (1, ), (2, 2), torch.float32), + ((1, ), (2, 2), (1, ), torch.float32), + ((2, 2), (1, ), (1, ), torch.float32), + ((1024, 1024), (1024, 1024), (1024, 1024), torch.float32), + + ((2, 2), (2, 2), (2, 2), torch.float16), + ((10,), (10,), (10,), torch.float16), + ((1,), (2, 2), (2, 2), torch.float16), + ((2, 2), (1,), (2, 2), torch.float16), + ((2, 2), (2, 2), (1,), torch.float16), + ((1, ), (1, ), (2, 2), torch.float16), + ((1, ), (2, 2), (1, ), torch.float16), + ((2, 2), (1, ), (1, ), torch.float16), + ((1024, 1024), (1024, 1024), (1024, 1024), torch.float16), + ] + args = get_args() + lib = open_lib() + + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopWhereDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopWhereDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + 
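A note on the broadcast cases in the test list above (shapes like (1,) against (2, 2)): the expected output shape is the broadcast of the condition, x, and y shapes, and the 0/1 uint8 condition carries the same information as the boolean mask the torch reference uses. Also note the argument order of infiniopWhere (output, x, y, condition) differs from torch.where(condition, x, y). A minimal sketch, assuming only standard torch behaviour:

```python
import torch

x = torch.randn(1)                                   # broadcast against (2, 2)
y = torch.randn(2, 2)
condition = torch.randint(0, 2, (2, 2), dtype=torch.uint8)

# The output shape is the broadcast of all three operand shapes.
out_shape = torch.broadcast_shapes(condition.shape, x.shape, y.shape)
assert tuple(out_shape) == (2, 2)

# torch.where takes a boolean mask; a 0/1 uint8 tensor converts losslessly.
ans = torch.where(condition.bool(), x, y)
assert ans.shape == out_shape
```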
lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopWhereDescriptor_t, + ] + + if args.profile: + PROFILE = True + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if not (args.cpu or args.cuda): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc new file mode 100644 index 00000000..6baa7514 --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.cc @@ -0,0 +1,96 @@ +#include "clip_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + float *lower_bound, + float *upper_bound) { + if (x->ndim != y->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (uint64_t i = 0; i < x->ndim; i++) { + if (x->shape[i] != y->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + bool has_lower_bound = lower_bound != nullptr; + bool has_upper_bound = upper_bound != nullptr; + float lower_bound_ = has_lower_bound ? *lower_bound : std::numeric_limits::lowest(); + float upper_bound_ = has_upper_bound ? *upper_bound : std::numeric_limits::max(); + + *desc_ptr = new ClipCpuDescriptor{ + DevCpu, + y->dt, + data_size, + has_lower_bound, + lower_bound_, + has_upper_bound, + upper_bound_, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto lower_bound_ = desc->lower_bound; + auto upper_bound_ = desc->upper_bound; + auto data_size_ = desc->data_size; + + if constexpr (std::is_same::value) { + if (!desc->has_lower_bound && !desc->has_upper_bound) { + std::memcpy(y_, x_, data_size_ * sizeof(Tdata)); + } else { +#pragma omp parallel for + for (uint64_t i = 0; i < data_size_; i++) { + float x_val = f16_to_f32(x_[i]); + x_val = (desc->has_lower_bound && x_val < lower_bound_) ? lower_bound_ : x_val; + x_val = (desc->has_upper_bound && x_val > upper_bound_) ? upper_bound_ : x_val; + y_[i] = f32_to_f16(x_val); + } + } + } else { + if (!desc->has_lower_bound && !desc->has_upper_bound) { + std::memcpy(y_, x_, data_size_ * sizeof(Tdata)); + } else { +#pragma omp parallel for + for (uint64_t i = 0; i < data_size_; i++) { + Tdata x_val = x_[i]; + x_val = (desc->has_lower_bound && x_val < lower_bound_) ? lower_bound_ : x_val; + x_val = (desc->has_upper_bound && x_val > upper_bound_) ? 
upper_bound_ : x_val; + y_[i] = x_val; + } + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, void *y, void const *x) { + if (desc->dtype == F16) { + return clip_cpu(desc, y, x); + } + if (desc->dtype == F32) { + return clip_cpu(desc, y, x); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h new file mode 100644 index 00000000..e112d329 --- /dev/null +++ b/src/ops/clip/cpu/clip_cpu.h @@ -0,0 +1,34 @@ +#ifndef __CPU_CLIP_H__ +#define __CPU_CLIP_H__ + +#include "operators.h" +#include +#include +#include + +struct ClipCpuDescriptor { + Device device; + DT dtype; + uint64_t data_size; + bool has_lower_bound; + float lower_bound; + bool has_upper_bound; + float upper_bound; +}; + +typedef struct ClipCpuDescriptor *ClipCpuDescriptor_t; + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + float* lower_bound, + float* upper_bound); + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *y, + void const *x); + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc); + +#endif diff --git a/src/ops/clip/cuda/clip.cc b/src/ops/clip/cuda/clip.cc new file mode 100644 index 00000000..090731a8 --- /dev/null +++ b/src/ops/clip/cuda/clip.cc @@ -0,0 +1,55 @@ +#include "clip.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t handle, + ClipCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + float* lower_bound, + float* upper_bound) { + if (x->ndim != y->ndim) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (uint64_t i = 0; i < x->ndim; i++) { + if (x->shape[i] != y->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_size = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()); + + bool has_lower_bound = lower_bound != nullptr; + bool has_upper_bound = upper_bound != nullptr; + float lower_bound_ = has_lower_bound ? *lower_bound : std::numeric_limits::lowest(); + float upper_bound_ = has_upper_bound ? *upper_bound : std::numeric_limits::max(); + + *desc_ptr = new ClipCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + data_size, + has_lower_bound, + lower_bound_, + has_upper_bound, + upper_bound_, + static_cast(handle->prop.maxGridSize[0]), + }; + + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/clip/cuda/clip.cu b/src/ops/clip/cuda/clip.cu new file mode 100644 index 00000000..ffe8988e --- /dev/null +++ b/src/ops/clip/cuda/clip.cu @@ -0,0 +1,65 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "clip.cuh" +#include "status.h" + +template +__global__ void clip( + Tdata const *x, + Tdata *y, + uint64_t data_size, + bool has_lower_bound, + bool has_upper_bound, + float lower_bound, + float upper_bound, + uint64_t offset) { + const uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + if (idx >= data_size) return; + + Tdata value = x[idx]; + if constexpr (std::is_same::value) { + value = (has_lower_bound && __hlt(value, __float2half(lower_bound))) ? 
__float2half(lower_bound) : value; + value = (has_upper_bound && __hgt(value, __float2half(upper_bound))) ? __float2half(upper_bound) : value; + y[idx] = value; + } else { + value = (has_lower_bound && value < lower_bound) ? lower_bound : value; + value = (has_upper_bound && value > upper_bound) ? upper_bound : value; + y[idx] = value; + } +} + +template +infiniopStatus_t clip_nv_gpu( + ClipCudaDescriptor_t desc, + void const *x, + void *y, + cudaStream_t stream) { + const uint64_t data_size = desc->data_size; + const uint64_t max_grid_size = desc->max_grid_size; + + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + + dim3 blockDims(std::min(static_cast(256), data_size)); + dim3 gridDims(std::min(ROUND_UP_DIV(data_size, blockDims.x), max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + +#pragma unroll + for (uint64_t offset = 0; offset < data_size; offset += step) { + clip<<>>( + x_, y_, data_size, desc->has_lower_bound, desc->has_upper_bound, desc->lower_bound, desc->upper_bound, offset); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, void *y, const void *x, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return clip_nv_gpu(desc, reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(stream)); + } + if (desc->dtype == F32) { + return clip_nv_gpu(desc, reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(stream)); + } + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/clip/cuda/clip.cuh b/src/ops/clip/cuda/clip.cuh new file mode 100644 index 00000000..863c61ca --- /dev/null +++ b/src/ops/clip/cuda/clip.cuh @@ -0,0 +1,38 @@ +#ifndef __CUDA_CLIP_H__ +#define __CUDA_CLIP_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include +#include + +struct ClipCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t data_size; + bool has_lower_bound; + float lower_bound; + bool has_upper_bound; + float upper_bound; + uint64_t max_grid_size; +}; + +typedef struct ClipCudaDescriptor *ClipCudaDescriptor_t; + +infiniopStatus_t cudaCreateClipDescriptor(CudaHandle_t, + ClipCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + float* lower_bound, + float* upper_bound); + +infiniopStatus_t cudaClip(ClipCudaDescriptor_t desc, + void *y, void const *x, + void *stream); + +infiniopStatus_t cudaDestroyClipDescriptor(ClipCudaDescriptor_t desc); + +#endif diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc new file mode 100644 index 00000000..7c69473d --- /dev/null +++ b/src/ops/clip/operator.cc @@ -0,0 +1,78 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/clip/clip.h" + +#ifdef ENABLE_CPU +#include "cpu/clip_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/clip.cuh" +#endif + +__C infiniopStatus_t infiniopCreateClipDescriptor( + infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + float* lower_bound, + float* upper_bound) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, y, x, lower_bound, upper_bound); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateClipDescriptor((CudaHandle_t) handle, (ClipCudaDescriptor_t *) desc_ptr, y, x, lower_bound, upper_bound); + } + +#endif 
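In the clip path dispatched here, a NULL lower_bound or upper_bound pointer means "no bound on that side" (hence the has_lower_bound / has_upper_bound flags in the CPU and CUDA descriptors above). A minimal Python reference with the same convention is sketched below; the helper name is illustrative only. Note it tests `is not None` rather than truthiness: the `lower_bound if lower_bound else ...` pattern in operatorspy/tests/clip.py would treat a bound of 0.0 as absent.

```python
import torch

def clip_reference(x: torch.Tensor, lower=None, upper=None) -> torch.Tensor:
    # None plays the role of a NULL bound pointer in the C API: no clamp on that side.
    if lower is not None:
        x = torch.clamp_min(x, lower)
    if upper is not None:
        x = torch.clamp_max(x, upper)
    return x

x = torch.tensor([-1.0, -0.05, 0.05, 1.0])
assert torch.equal(clip_reference(x, upper=0.1), torch.tensor([-1.0, -0.05, 0.05, 0.1]))
assert torch.equal(clip_reference(x, lower=0.0), torch.clamp_min(x, 0.0))  # 0.0 is a real bound
```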
+#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopClip( + infiniopClipDescriptor_t desc, + void *y, + void const *x, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuClip((ClipCpuDescriptor_t) desc, y, x); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaClip((ClipCudaDescriptor_t) desc, y, x, stream); + } + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyClipDescriptor((ClipCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyClipDescriptor((ClipCudaDescriptor_t) desc); + } + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 00000000..4b291fe6 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,119 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + uint64_t axis) { + if (indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (output->dt != input->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (output->ndim != input->ndim + indices->ndim - 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < output->ndim; i++) { + if (i < axis) { + if (output->shape[i] != input->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } else if (i < axis + indices->ndim) { + if (output->shape[i] != indices->shape[i - axis]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } else { + if (output->shape[i] != input->shape[i - indices->ndim + 1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (!is_contiguous(output) || !is_contiguous(input) || !is_contiguous(indices)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t axis_tmp = axis; + if (axis_tmp < 0) { + axis_tmp = output->ndim + axis; + } + + uint64_t pre_size = std::accumulate(input->shape, input->shape + axis_tmp, 1ULL, std::multiplies()); + uint64_t post_size = std::accumulate(input->shape + axis_tmp + 1, input->shape + input->ndim, 1ULL, std::multiplies()); + uint64_t indices_size = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + uint64_t axis_size = input->shape[axis_tmp]; + + *desc_ptr = new GatherCpuDescriptor{ + DevCpu, + output->dt, + indices->dt, + pre_size, + axis_size, + indices_size, + post_size, + }; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} + +template +infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc, void *output, void const *input, void const *indices) { + auto input_ = reinterpret_cast(input); + auto output_ = reinterpret_cast(output); + auto indices_ = reinterpret_cast(indices); + + uint64_t pre_size = desc->pre_size; + uint64_t post_size = desc->post_size; + uint64_t indices_size = desc->indices_size; + uint64_t axis_size = desc->axis_size; + if (post_size == 1) { +#pragma omp parallel for collapse(2) + for (uint64_t i = 0; i < pre_size; 
i++) { + for (uint64_t j = 0; j < indices_size; j++) { + uint64_t output_offset = i * indices_size * post_size + j * post_size; + uint64_t input_offset = i * axis_size * post_size + indices_[j] * post_size; + output_[output_offset] = input_[input_offset]; + } + } + } else { +#pragma omp parallel for collapse(2) + for (uint64_t i = 0; i < pre_size; i++) { + for (uint64_t j = 0; j < indices_size; j++) { + uint64_t output_offset = i * indices_size * post_size + j * post_size; + uint64_t input_offset = i * axis_size * post_size + indices_[j] * post_size; + std::memcpy(output_ + output_offset, input_ + input_offset, post_size * sizeof(Tdata)); + } + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, void *output, void const *input, void const *indices, void *stream) { + if (desc->dtype == F16) { + if (desc->indices_dtype == I32) { + return gather_cpu(desc, output, input, indices); + } + if (desc->indices_dtype == I64) { + return gather_cpu(desc, output, input, indices); + } + } + if (desc->dtype == F32) { + if (desc->indices_dtype == I32) { + return gather_cpu(desc, output, input, indices); + } + if (desc->indices_dtype == I64) { + return gather_cpu(desc, output, input, indices); + } + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h new file mode 100644 index 00000000..9e7bba6c --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,35 @@ +#ifndef __CPU_GATHER_H__ +#define __CPU_GATHER_H__ + +#include "operators.h" +#include +#include + +struct GatherCpuDescriptor { + Device device; + DT dtype; + DT indices_dtype; + uint64_t pre_size; + uint64_t axis_size; + uint64_t indices_size; + uint64_t post_size; +}; + +typedef struct GatherCpuDescriptor *GatherCpuDescriptor_t; + +infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t, + GatherCpuDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + uint64_t axis); + +infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc, + void *output, + void const *input, + void const *indices, + void *stream); + +infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc); + +#endif diff --git a/src/ops/gather/cuda/gather.cc b/src/ops/gather/cuda/gather.cc new file mode 100644 index 00000000..beed130d --- /dev/null +++ b/src/ops/gather/cuda/gather.cc @@ -0,0 +1,82 @@ +#include "gather.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t handle, + GatherCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + uint64_t axis) { + if (indices->dt != I32 && indices->dt != I64) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (output->dt != input->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (output->ndim != input->ndim + indices->ndim - 1) { + return STATUS_BAD_TENSOR_SHAPE; + } + for (int i = 0; i < output->ndim; i++) { + if (i < axis) { + if (output->shape[i] != input->shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } else if (i < axis + indices->ndim) { + if (output->shape[i] != indices->shape[i - axis]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } else { + if (output->shape[i] != input->shape[i - indices->ndim + 1]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + } + + if (!is_contiguous(output) || !is_contiguous(input) || !is_contiguous(indices)) { + return 
STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t axis_tmp = axis; + if (axis_tmp < 0) { + axis_tmp = output->ndim + axis; + } + + uint64_t output_size = std::accumulate(output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + uint64_t pre_size = std::accumulate(input->shape, input->shape + axis_tmp, 1ULL, std::multiplies()); + uint64_t post_size = std::accumulate(input->shape + axis_tmp + 1, input->shape + input->ndim, 1ULL, std::multiplies()); + uint64_t indices_size = std::accumulate(indices->shape, indices->shape + indices->ndim, 1ULL, std::multiplies()); + uint64_t axis_size = input->shape[axis_tmp]; + + int kernel_type = 0; + uint64_t sizes[3] = {pre_size * indices_size, indices_size * post_size, pre_size * post_size}; + for(int i = 1; i < 3; i++) { + if(sizes[i] > sizes[kernel_type]) { + kernel_type = i; + } + } + + + *desc_ptr = new GatherCudaDescriptor{ + DevNvGpu, + output->dt, + indices->dt, + handle->device_id, + output_size, + pre_size, + axis_size, + indices_size, + post_size, + kernel_type, + static_cast(handle->prop.maxGridSize[0]), + }; + + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc) { + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/gather/cuda/gather.cu b/src/ops/gather/cuda/gather.cu new file mode 100644 index 00000000..e6785c69 --- /dev/null +++ b/src/ops/gather/cuda/gather.cu @@ -0,0 +1,182 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "gather.cuh" + +constexpr int num_elem_per_thread = 2; + +template +__global__ void gather_0( + Tdata const *data, + Tind const *indices, + uint64_t indices_size, + Tdata *output, + uint64_t pre_size, + uint64_t axis_size, + uint64_t post_size, + uint64_t offset_x, + uint64_t offset_y) { + const uint64_t x = blockIdx.x * blockDim.x + threadIdx.x + offset_x; + const uint64_t y = blockIdx.y * blockDim.y + threadIdx.y + offset_y; + if (x >= pre_size * indices_size || y >= post_size) return; + + for (uint64_t i = 0; i < num_elem_per_thread; i++) { + const uint64_t idx = x * num_elem_per_thread + i; + if (idx >= pre_size * indices_size) return; + + const uint64_t pre_idx = idx / indices_size; + const uint64_t indices_idx = idx % indices_size; + const uint64_t post_idx = y; + + const uint64_t data_idx = pre_idx * axis_size * post_size + indices[indices_idx] * post_size + post_idx; + const uint64_t output_idx = pre_idx * indices_size * post_size + indices_idx * post_size + post_idx; + output[output_idx] = data[data_idx]; + } +} + +template +__global__ void gather_1( + Tdata const *data, + Tind const *indices, + uint64_t indices_size, + Tdata *output, + uint64_t pre_size, + uint64_t axis_size, + uint64_t post_size, + uint64_t offset_x, + uint64_t offset_y) { + const uint64_t x = blockIdx.x * blockDim.x + threadIdx.x + offset_x; + const uint64_t y = blockIdx.y * blockDim.y + threadIdx.y + offset_y; + if (x >= indices_size * post_size || y >= pre_size) return; + + for (uint64_t i = 0; i < num_elem_per_thread; i++) { + const uint64_t idx = x * num_elem_per_thread + i; + if (idx >= indices_size * post_size) return; + + const uint64_t pre_idx = y; + const uint64_t indices_idx = idx / post_size; + const uint64_t post_idx = idx % post_size; + + const uint64_t data_idx = pre_idx * axis_size * post_size + indices[indices_idx] * post_size + post_idx; + const uint64_t output_idx = pre_idx * indices_size * post_size + indices_idx * post_size + post_idx; + output[output_idx] = data[data_idx]; + } +} + +template +__global__ void 
gather_2( + Tdata const *data, + Tind const *indices, + uint64_t indices_size, + Tdata *output, + uint64_t pre_size, + uint64_t axis_size, + uint64_t post_size, + uint64_t offset_x, + uint64_t offset_y) { + const uint64_t x = blockIdx.x * blockDim.x + threadIdx.x + offset_x; + const uint64_t y = blockIdx.y * blockDim.y + threadIdx.y + offset_y; + if (x >= pre_size * post_size || y >= indices_size) return; + + for (uint64_t i = 0; i < num_elem_per_thread; i++) { + const uint64_t idx = x * num_elem_per_thread + i; + if (idx >= pre_size * post_size) return; + + const uint64_t pre_idx = idx / post_size; + const uint64_t indices_idx = y; + const uint64_t post_idx = idx % post_size; + + const uint64_t data_idx = pre_idx * axis_size * post_size + indices[indices_idx] * post_size + post_idx; + const uint64_t output_idx = pre_idx * indices_size * post_size + indices_idx * post_size + post_idx; + output[output_idx] = data[data_idx]; + } +} + +template +infiniopStatus_t gather_nv_gpu(GatherCudaDescriptor_t desc, void *output, void const *input, void const *indices, void *stream) { + if (desc->output_size == 0) { + return STATUS_SUCCESS; + } + + const auto input_ = reinterpret_cast(input); + const auto output_ = reinterpret_cast(output); + const auto indices_ = reinterpret_cast(indices); + cudaStream_t cuda_stream = reinterpret_cast(stream); + + uint64_t pre_size = desc->pre_size; + uint64_t axis_size = desc->axis_size; + uint64_t post_size = desc->post_size; + uint64_t indices_size = desc->indices_size; + + int kernel_type = desc->kernel_type; + switch (kernel_type) { + case 0: { + dim3 block_size(std::min(static_cast(64), pre_size * indices_size / num_elem_per_thread), + std::min(static_cast(16), post_size)); + dim3 grid_size(std::min(ROUND_UP_DIV(pre_size * indices_size / num_elem_per_thread, block_size.x), desc->max_grid_size), + std::min(ROUND_UP_DIV(post_size, block_size.y), desc->max_grid_size)); + uint64_t step_x = grid_size.x * block_size.x; + uint64_t step_y = grid_size.y * block_size.y; + #pragma unroll + for(uint64_t i = 0; i < pre_size * indices_size; i += step_x) { + for(uint64_t j = 0; j < post_size; j += step_y) { + gather_0<<>>( + input_, indices_, indices_size, output_, pre_size, axis_size, post_size, i, j); + } + } + } + break; + case 1: { + dim3 block_size(std::min(static_cast(64), indices_size * post_size / num_elem_per_thread), + std::min(static_cast(16), pre_size)); + dim3 grid_size(std::min(ROUND_UP_DIV(indices_size * post_size / num_elem_per_thread, block_size.x), desc->max_grid_size), + std::min(ROUND_UP_DIV(pre_size, block_size.y), desc->max_grid_size)); + uint64_t step_x = grid_size.x * block_size.x; + uint64_t step_y = grid_size.y * block_size.y; + #pragma unroll + for(uint64_t i = 0; i < indices_size * post_size; i += step_x) { + for(uint64_t j = 0; j < pre_size; j += step_y) { + gather_1<<>>( + input_, indices_, indices_size, output_, pre_size, axis_size, post_size, i, j); + } + } + } + break; + case 2: { + dim3 block_size(std::min(static_cast(64), pre_size * post_size / num_elem_per_thread), + std::min(static_cast(16), indices_size)); + dim3 grid_size(std::min(ROUND_UP_DIV(pre_size * post_size / num_elem_per_thread, block_size.x), desc->max_grid_size), + std::min(ROUND_UP_DIV(indices_size, block_size.y), desc->max_grid_size)); + uint64_t step_x = grid_size.x * block_size.x; + uint64_t step_y = grid_size.y * block_size.y; + #pragma unroll + for(uint64_t i = 0; i < pre_size * post_size; i += step_x) { + for(uint64_t j = 0; j < indices_size; j += step_y) { + gather_2<<>>( 
+ input_, indices_, indices_size, output_, pre_size, axis_size, post_size, i, j); + } + } + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, void *output, void const *input, void const *indices, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + if (desc->indices_dtype == I32) { + return gather_nv_gpu(desc, output, input, indices, stream); + } + if (desc->indices_dtype == I64) { + return gather_nv_gpu(desc, output, input, indices, stream); + } + } + if (desc->dtype == F32) { + if (desc->indices_dtype == I32) { + return gather_nv_gpu(desc, output, input, indices, stream); + } + if (desc->indices_dtype == I64) { + return gather_nv_gpu(desc, output, input, indices, stream); + } + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/gather/cuda/gather.cuh b/src/ops/gather/cuda/gather.cuh new file mode 100644 index 00000000..df85346e --- /dev/null +++ b/src/ops/gather/cuda/gather.cuh @@ -0,0 +1,39 @@ +#ifndef __CUDA_GATHER_H__ +#define __CUDA_GATHER_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct GatherCudaDescriptor { + Device device; + DT dtype; + DT indices_dtype; + int device_id; + uint64_t output_size; + uint64_t pre_size; + uint64_t axis_size; + uint64_t indices_size; + uint64_t post_size; + int kernel_type; + uint64_t max_grid_size; +}; + +typedef struct GatherCudaDescriptor *GatherCudaDescriptor_t; + +infiniopStatus_t cudaCreateGatherDescriptor(CudaHandle_t, + GatherCudaDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + uint64_t axis); + +infiniopStatus_t cudaGather(GatherCudaDescriptor_t desc, + void *output, void const *input, + void const *indices, void *stream); + +infiniopStatus_t cudaDestroyGatherDescriptor(GatherCudaDescriptor_t desc); + +#endif diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc new file mode 100644 index 00000000..99dce24e --- /dev/null +++ b/src/ops/gather/operator.cc @@ -0,0 +1,79 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/gather/gather.h" + +#ifdef ENABLE_CPU +#include "cpu/gather_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/gather.cuh" +#endif + +__C infiniopStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t indices, + uint64_t axis) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *) desc_ptr, output, input, indices, axis); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateGatherDescriptor((CudaHandle_t) handle, (GatherCudaDescriptor_t *) desc_ptr, output, input, indices, axis); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *output, + void const *input, + void const *indices, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuGather((GatherCpuDescriptor_t) desc, output, input, indices, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGather((GatherCudaDescriptor_t) desc, output, 
input, indices, stream); + } + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyGatherDescriptor((GatherCudaDescriptor_t) desc); + } + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc new file mode 100644 index 00000000..b5098adb --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -0,0 +1,294 @@ +#include "reduce_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim, + int reduce_op) { + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + bool use_1Dreduce = axes_ndim == x->ndim; // means all axes are reduced, seem as 1D reduce to avoid out_ndim=0 + bool is_y_scalar = std::accumulate(y->shape, y->shape + y->ndim, 1ULL, std::multiplies()) == 1; + if(use_1Dreduce != is_y_scalar) { + return STATUS_BAD_TENSOR_SHAPE; + } + + std::vector axes_, out_of_axes; + + uint64_t ndim = x->ndim; + uint64_t out_ndim = ndim - axes_ndim; + uint64_t axes_size = 1; + uint64_t out_size = 1; + + for(uint64_t i = 0; i < ndim; i++) { + bool is_axis = false; + for(uint64_t j = 0; j < axes_ndim; j++) { + if (axes[j] == i) { + is_axis = true; + break; + } + } + if (is_axis) { + axes_size *= x->shape[i]; + axes_.emplace_back(i); + } else { + out_size *= x->shape[i]; + out_of_axes.emplace_back(i); + } + } + sort(axes_.begin(), axes_.end()); + sort(out_of_axes.begin(), out_of_axes.end()); + + std::vector axes_strides(axes_ndim, 1); + std::vector out_strides(out_ndim, 1); + + for(uint64_t i = 0; i < axes_ndim; i++) { + for(uint64_t j = i + 1; j < axes_ndim; j++) { + axes_strides[i] *= x->shape[axes[j]]; + } + } + for(uint64_t i = 0; i < out_ndim; i++) { + for(uint64_t j = i + 1; j < out_ndim; j++) { + out_strides[i] *= x->shape[out_of_axes[j]]; + } + } + + int64_t *strides = new int64_t[ndim]; + std::memcpy(strides, x->strides, ndim * sizeof(int64_t)); + + + *desc_ptr = new ReduceCpuDescriptor{ + DevCpu, + x->dt, + use_1Dreduce, + axes_, + out_of_axes, + ndim, + axes_ndim, + out_ndim, + axes_size, + out_size, + strides, + axes_strides, + out_strides, + reduce_op + }; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) { + delete[] (desc->strides); + delete desc; + return STATUS_SUCCESS; +} + + +template +infiniopStatus_t reduce_cpu_1D(ReduceCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto data_size_ = desc->out_size * desc->axes_size; + auto reduce_op_ = desc->reduce_op; + + if constexpr (std::is_same::value) { + switch (reduce_op_) { + case 0: { // ReduceMin + float result = std::numeric_limits::max(); + for (uint64_t i = 0; i < data_size_; i++) { + result = std::min(result, f16_to_f32(x_[i])); + } + y_[0] = f32_to_f16(result); + } + break; + case 1: { // ReduceMax + float result = std::numeric_limits::lowest(); + for (uint64_t i = 0; i < data_size_; 
i++) { + result = std::max(result, f16_to_f32(x_[i])); + } + y_[0] = f32_to_f16(result); + } + break; + case 2: { // ReduceMean + float sum = 0; + for (uint64_t i = 0; i < data_size_; i++) { + sum += f16_to_f32(x_[i]); + } + y_[0] = f32_to_f16(sum / data_size_); + } + break; + } + } else { + switch (reduce_op_) { + case 0: { // ReduceMin + Tdata result = std::numeric_limits::max(); + for (uint64_t i = 0; i < data_size_; i++) { + result = std::min(result, x_[i]); + } + y_[0] = result; + } + break; + case 1: { // ReduceMax + Tdata result = std::numeric_limits::lowest(); + for (uint64_t i = 0; i < data_size_; i++) { + result = std::max(result, x_[i]); + } + y_[0] = result; + } + break; + case 2: { // ReduceMean + Tdata sum = 0; + for (uint64_t i = 0; i < data_size_; i++) { + sum += x_[i]; + } + y_[0] = sum / data_size_; + } + break; + } + } + return STATUS_SUCCESS; +} + +template +infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc, void *y, void const *x) { + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto axes_ = desc->axes; + auto out_of_axes_ = desc->out_of_axes; + auto ndim_ = desc->ndim; + auto axes_ndim_ = desc->axes_ndim; + auto out_ndim_ = desc->out_ndim; + auto axes_size_ = desc->axes_size; + auto out_size_ = desc->out_size; + auto strides_ = desc->strides; + auto axes_strides_ = desc->axes_strides; + auto out_strides_ = desc->out_strides; + auto reduce_op_ = desc->reduce_op; + +#pragma omp parallel for + for(uint64_t i = 0; i < out_size_; i++) { + uint64_t idx = 0; + uint64_t temp_i = i; + for(uint64_t j = 0; j < out_ndim_; j++) { + idx += temp_i / out_strides_[j] * strides_[out_of_axes_[j]]; + temp_i %= out_strides_[j]; + } + + float result; + switch (reduce_op_) { + case 0: { // ReduceMin + result = std::numeric_limits::max(); + } + break; + case 1: { // ReduceMax + result = std::numeric_limits::lowest(); + } + break; + case 2: { // ReduceMean + result = 0; + } + break; + } + + for(uint64_t j = 0; j < axes_size_; j++) { + uint64_t idx_ = idx; + uint64_t temp_j = j; + for(uint64_t k = 0; k < axes_ndim_; k++) { + idx_ += temp_j / axes_strides_[k] * strides_[axes_[k]]; + temp_j %= axes_strides_[k]; + } + + if constexpr (std::is_same::value) { + switch (reduce_op_) { + case 0: { // ReduceMin + result = std::min(result, f16_to_f32(x_[idx_])); + } + break; + case 1: { // ReduceMax + result = std::max(result, f16_to_f32(x_[idx_])); + } + break; + case 2: { // ReduceMean + result += f16_to_f32(x_[idx_]); + } + break; + } + } else { + switch (reduce_op_) { + case 0: { // ReduceMin + result = std::min(result, x_[idx_]); + } + break; + case 1: { // ReduceMax + result = std::max(result, x_[idx_]); + } + break; + case 2: { // ReduceMean + result += x_[idx_]; + } + break; + } + } + } + if constexpr (std::is_same::value) { + switch (reduce_op_) { + case 0: { // ReduceMin + y_[i] = f32_to_f16(result); + } + break; + case 1: { // ReduceMax + y_[i] = f32_to_f16(result); + } + break; + case 2: { // ReduceMean + y_[i] = f32_to_f16(result / axes_size_); + } + break; + } + } else { + switch (reduce_op_) { + case 0: { // ReduceMin + y_[i] = result; + } + break; + case 1: { // ReduceMax + y_[i] = result; + } + break; + case 2: { // ReduceMean + y_[i] = result / axes_size_; + } + break; + } + } + } + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, void *y, void const *x, void *stream) { + if (desc->dtype == F16) { + if(desc->use_1Dreduce) { + return reduce_cpu_1D(desc, y, x); + } else { + return reduce_cpu(desc, y, x); + } + } + if 
(desc->dtype == F32) { + if(desc->use_1Dreduce) { + return reduce_cpu_1D(desc, y, x); + } else { + return reduce_cpu(desc, y, x); + } + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h new file mode 100644 index 00000000..9f5696f3 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.h @@ -0,0 +1,43 @@ +#ifndef __CPU_REDUCE_H__ +#define __CPU_REDUCE_H__ + +#include "operators.h" +#include +#include +#include + +struct ReduceCpuDescriptor { + Device device; + DT dtype; + bool use_1Dreduce; + std::vector axes; + std::vector out_of_axes; + uint64_t ndim; + uint64_t axes_ndim; + uint64_t out_ndim; + uint64_t axes_size; + uint64_t out_size; + int64_t *strides; + std::vector axes_strides; + std::vector out_strides; + int reduce_op; +}; + +typedef struct ReduceCpuDescriptor *ReduceCpuDescriptor_t; + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t, + ReduceCpuDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim, + int reduce_op); + +infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc, + void *y, + void const *x, + void *stream); + +infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/cuda/reduce.cc b/src/ops/reduce/cuda/reduce.cc new file mode 100644 index 00000000..67e31f65 --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cc @@ -0,0 +1,141 @@ +#include "reduce.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t handle, + ReduceCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim, + int reduce_op) { + if (y->dt != F16 && y->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (y->dt != x->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (!is_contiguous(y) || !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + uint64_t ndim = x->ndim; + if (ndim > 8) { + return STATUS_BAD_TENSOR_SHAPE; + } + + int x_shape[CUDNN_DIM_MAX]; + for (uint64_t i = 0; i < ndim; ++i) { + x_shape[i] = static_cast(x->shape[i]); + } + + int x_strides[CUDNN_DIM_MAX]; + for (uint64_t i = 0; i < ndim; ++i) { + x_strides[i] = static_cast(x->strides[i]); + } + + int y_shape[CUDNN_DIM_MAX]; + for (uint64_t i = 0; i < ndim; ++i) { + bool is_axis = false; + for (uint64_t j = 0; j < axes_ndim; ++j) { + if (axes[j] == i) { + is_axis = true; + break; + } + } + if (is_axis) { + y_shape[i] = 1; + } else { + y_shape[i] = static_cast(x->shape[i]); + } + } + + int y_strides[CUDNN_DIM_MAX]; + int stride = 1; + for (uint64_t i = ndim; i > 0; --i) { + y_strides[i - 1] = stride; + stride *= y_shape[i - 1]; + } + + CREATE_CHECK_ERROR(auto dt = dataTypeMap[x->dt], dt, -1, STATUS_BAD_PARAM); + cudnnDataType_t cudnn_dt = [&] { + switch (dt) { + case CUDNN_DATA_HALF: + if (handle->compute_capability_major > 5 || (handle->compute_capability_major == 5 && handle->compute_capability_minor >= 3)) { + return CUDNN_DATA_HALF; + } + return CUDNN_DATA_FLOAT; + case CUDNN_DATA_BFLOAT16: + case CUDNN_DATA_FLOAT: + return CUDNN_DATA_FLOAT; + case CUDNN_DATA_DOUBLE: + return CUDNN_DATA_DOUBLE; + default: + return CUDNN_DATA_INT32; + } + }(); + + cudnnTensorDescriptor_t x_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&x_desc)); + cudnnTensorDescriptor_t y_desc; + checkCudnnError(cudnnCreateTensorDescriptor(&y_desc)); + if (ndim > 4) { + 
checkCudnnError(cudnnSetTensorNdDescriptor(x_desc, cudnn_dt, ndim, x_shape, x_strides)); + checkCudnnError(cudnnSetTensorNdDescriptor(y_desc, cudnn_dt, ndim, y_shape, y_strides)); + } else { + int x_shape_[4] = {1, 1, 1, 1}; + int y_shape_[4] = {1, 1, 1, 1}; + for (int i = 0; i < ndim; ++i) { + x_shape_[4 - i - 1] = x_shape[ndim - i - 1]; + y_shape_[4 - i - 1] = y_shape[ndim - i - 1]; + } + checkCudnnError(cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NCHW, cudnn_dt, x_shape_[0], x_shape_[1], x_shape_[2], x_shape_[3])); + checkCudnnError(cudnnSetTensor4dDescriptor(y_desc, CUDNN_TENSOR_NCHW, cudnn_dt, y_shape_[0], y_shape_[1], y_shape_[2], y_shape_[3])); + } + + cudnnReduceTensorDescriptor_t reduce_desc; + checkCudnnError(cudnnCreateReduceTensorDescriptor(&reduce_desc)); + cudnnReduceTensorOp_t reduce_op_ = [&] { + switch (reduce_op) { + case 0: + return CUDNN_REDUCE_TENSOR_MIN; + case 1: + return CUDNN_REDUCE_TENSOR_MAX; + case 2: + return CUDNN_REDUCE_TENSOR_AVG; + } + }(); + checkCudnnError(cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op_, CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN, CUDNN_REDUCE_TENSOR_NO_INDICES, CUDNN_32BIT_INDICES)); + + uint64_t workspace_size = 0; + if (use_cudnn(handle->cudnn_handles_t, handle->device_id, nullptr, + [&](cudnnHandle_t handle) { return cudnnGetReductionWorkspaceSize(handle, reduce_desc, x_desc, y_desc, &workspace_size); }) != CUDNN_STATUS_SUCCESS) { + return STATUS_EXECUTION_FAILED; + } + *desc_ptr = new ReduceCudaDescriptor{ + DevNvGpu, + y->dt, + handle->device_id, + handle->cudnn_handles_t, + x_desc, + y_desc, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaGetReduceWorkspaceSize(ReduceCudaDescriptor_t desc, uint64_t *size) { + *size = desc->workspace_size; + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc) { + checkCudnnError(cudnnDestroyReduceTensorDescriptor(desc->reduce_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->y_desc)); + checkCudnnError(cudnnDestroyTensorDescriptor(desc->x_desc)); + desc->cudnn_handles_t = nullptr; + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce/cuda/reduce.cu b/src/ops/reduce/cuda/reduce.cu new file mode 100644 index 00000000..4b730e5c --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cu @@ -0,0 +1,24 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "reduce.cuh" + +infiniopStatus_t reduce_nv_gpu(ReduceCudaDescriptor_t desc, void *workspace, uint64_t workspace_size, + void *y, void const *x, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + float alpha = 1.f; + float beta = 0.f; + + checkCudnnError(use_cudnn(desc->cudnn_handles_t, desc->device_id, (cudaStream_t) stream, + [&](cudnnHandle_t handle) { return cudnnReduceTensor(handle, desc->reduce_desc, nullptr, 0, workspace, (size_t)workspace_size, &alpha, desc->x_desc, x, &beta, desc->y_desc, y); })); + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, + void *stream) { + if (desc->dtype == F16 || desc->dtype == F32) { + return reduce_nv_gpu(desc, workspace, workspace_size, y, x, stream); + } + return STATUS_BAD_TENSOR_DTYPE; +} diff --git a/src/ops/reduce/cuda/reduce.cuh b/src/ops/reduce/cuda/reduce.cuh new file mode 100644 index 00000000..1d9fa90f --- /dev/null +++ b/src/ops/reduce/cuda/reduce.cuh @@ -0,0 +1,42 @@ +#ifndef __CUDA_REDUCE_H__ +#define __CUDA_REDUCE_H__ + 
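/*
 * Overview of the cuDNN-backed reduce path declared in this header
 * (an editor's summary of reduce.cc / reduce.cu in this patch, not original patch content):
 *   - cudaCreateReduceDescriptor rejects dtypes other than F16/F32, non-contiguous
 *     tensors, and ndim > 8. It builds cuDNN tensor descriptors for x and y, where
 *     y keeps x's rank with the reduced axes set to 1; ranks of 4 or less are packed
 *     into 4-D NCHW descriptors.
 *   - reduce_op 0/1/2 maps to CUDNN_REDUCE_TENSOR_MIN/MAX/AVG, with FLOAT as the
 *     accumulation type.
 *   - The workspace size is queried once via cudnnGetReductionWorkspaceSize and cached
 *     in the descriptor; cudaReduce then runs cudnnReduceTensor with alpha = 1 and
 *     beta = 0 on the caller-provided workspace and stream.
 */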
+#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include +#include + +struct ReduceCudaDescriptor { + Device device; + DT dtype; + int device_id; + std::shared_ptr> cudnn_handles_t; + cudnnTensorDescriptor_t x_desc; + cudnnTensorDescriptor_t y_desc; + cudnnReduceTensorDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct ReduceCudaDescriptor *ReduceCudaDescriptor_t; + +infiniopStatus_t cudaCreateReduceDescriptor(CudaHandle_t, + ReduceCudaDescriptor_t *, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim, + int reduce_op); + +infiniopStatus_t cudaGetReduceWorkspaceSize(ReduceCudaDescriptor_t desc, + uint64_t *size); + +infiniopStatus_t cudaReduce(ReduceCudaDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *y, void const *x, + void *stream); + +infiniopStatus_t cudaDestroyReduceDescriptor(ReduceCudaDescriptor_t desc); + +#endif diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc new file mode 100644 index 00000000..1ce20835 --- /dev/null +++ b/src/ops/reduce/operator.cc @@ -0,0 +1,88 @@ +#include "../utils.h" +#include "operators.h" +#include "reduce.h" + +#ifdef ENABLE_CPU +#include "cpu/reduce_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/reduce.cuh" +#endif + +__C infiniopStatus_t infiniopCreateReduceDescriptor( + infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim, + int reduce_op) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *) desc_ptr, y, x, axes, axes_ndim, reduce_op); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateReduceDescriptor((CudaHandle_t) handle, (ReduceCudaDescriptor_t *) desc_ptr, y, x, axes, axes_ndim, reduce_op); + } + +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size) { + // std::cout << desc->device << std::endl; + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: { + return STATUS_SUCCESS; + } +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaGetReduceWorkspaceSize((ReduceCudaDescriptor_t) desc, size); + } + +#endif + + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuReduce((ReduceCpuDescriptor_t) desc, y, x, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: + return cudaReduce((ReduceCudaDescriptor_t) desc, workspace, workspace_size, y, x, stream); + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyReduceDescriptor((ReduceCudaDescriptor_t) desc); + } + +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h new file mode 100644 index 00000000..9dc44d5c --- /dev/null +++ 
b/src/ops/reduce/reduce.h @@ -0,0 +1,28 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include "export.h" +#include "operators.h" + +typedef struct ReduceDescriptor { + Device device; +} ReduceDescriptor; + +typedef ReduceDescriptor *infiniopReduceDescriptor_t; + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim, + int reduce_op); + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream); + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc); + + +#endif diff --git a/src/ops/reduce_max/operator.cc b/src/ops/reduce_max/operator.cc new file mode 100644 index 00000000..9496afb6 --- /dev/null +++ b/src/ops/reduce_max/operator.cc @@ -0,0 +1,53 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_max/reduce_max.h" + + +struct _ReduceMaxDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMaxDescriptor *_ReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim) { + infiniopReduceDescriptor_t reduce_desc; + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, axes_ndim, 1), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMaxDescriptor_t *) desc_ptr = new _ReduceMaxDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMaxDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_ReduceMaxDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMaxDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce_mean/operator.cc b/src/ops/reduce_mean/operator.cc new file mode 100644 index 00000000..bbbe8100 --- /dev/null +++ b/src/ops/reduce_mean/operator.cc @@ -0,0 +1,53 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_mean/reduce_mean.h" + + +struct _ReduceMeanDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMeanDescriptor *_ReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + 
infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim) { + infiniopReduceDescriptor_t reduce_desc; + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, axes_ndim, 2), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMeanDescriptor_t *) desc_ptr = new _ReduceMeanDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMeanDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_ReduceMeanDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce_min/operator.cc b/src/ops/reduce_min/operator.cc new file mode 100644 index 00000000..7c0eca38 --- /dev/null +++ b/src/ops/reduce_min/operator.cc @@ -0,0 +1,53 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_min/reduce_min.h" + + +struct _ReduceMinDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMinDescriptor *_ReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t x, + int *axes, + uint64_t axes_ndim) { + infiniopReduceDescriptor_t reduce_desc; + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, y, x, axes, axes_ndim, 0), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMinDescriptor_t *) desc_ptr = new _ReduceMinDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMinDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *y, void const *x, void *stream) { + auto _desc = (_ReduceMinDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, workspace, workspace_size, y, x, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMinDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/utils.h b/src/ops/utils.h 
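To make the calling convention of the three thin reduce wrappers above concrete, here is a minimal host-side sketch using ReduceMax. It is illustrative only and not part of the patch; the handle, tensor descriptors, device buffers, workspace allocation, and the helper's name are assumptions supplied by the caller or test harness.

#include <stdint.h>
#include "ops/reduce_max/reduce_max.h"

// Illustrative helper (hypothetical name): reduce x over `axes` into y via the
// ReduceMax wrapper added above. The caller provides the handle, descriptors,
// device buffers, a workspace, and the stream.
infiniopStatus_t reduce_max_example(infiniopHandle_t handle,
                                    infiniopTensorDescriptor_t y_desc,
                                    infiniopTensorDescriptor_t x_desc,
                                    int *axes, uint64_t axes_ndim,
                                    void *workspace, uint64_t workspace_size,
                                    void *y, void const *x, void *stream) {
    infiniopReduceMaxDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateReduceMaxDescriptor(handle, &desc, y_desc, x_desc, axes, axes_ndim);
    if (status != STATUS_SUCCESS) {
        return status;
    }

    // The required size is cached at descriptor-creation time; infiniopReduceMax
    // itself returns STATUS_MEMORY_NOT_ALLOCATED when the supplied workspace is smaller.
    uint64_t required = 0;
    status = infiniopGetReduceMaxWorkspaceSize(desc, &required);
    if (status == STATUS_SUCCESS && workspace_size < required) {
        status = STATUS_MEMORY_NOT_ALLOCATED;
    }
    if (status == STATUS_SUCCESS) {
        status = infiniopReduceMax(desc, workspace, workspace_size, y, x, stream);
    }

    infiniopDestroyReduceMaxDescriptor(desc);
    return status;
}

The same create / query-workspace / execute / destroy sequence applies to ReduceMean and ReduceMin, which differ only in the reduce_op value passed to the shared reduce descriptor.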
index b48cf419..a7335959 100644 --- a/src/ops/utils.h +++ b/src/ops/utils.h @@ -104,6 +104,33 @@ inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, return true; } +inline bool getBroadcastShape(const uint64_t *shape1, uint64_t ndim1, + const uint64_t *shape2, uint64_t ndim2, + const uint64_t *shape3, uint64_t ndim3, + uint64_t *broadcast_shape, uint64_t *padded_shape1, + uint64_t *padded_shape2, uint64_t *padded_shape3, + uint64_t max_rank) { + // prepending and initializing + std::fill(padded_shape1, padded_shape1 + max_rank, 1); + std::fill(padded_shape2, padded_shape2 + max_rank, 1); + std::fill(padded_shape3, padded_shape3 + max_rank, 1); + std::copy(shape1, shape1 + ndim1, padded_shape1 + max_rank - ndim1); + std::copy(shape2, shape2 + ndim2, padded_shape2 + max_rank - ndim2); + std::copy(shape3, shape3 + ndim3, padded_shape3 + max_rank - ndim3); + + // compute broadcasted shape + for (size_t i = 0; i < max_rank; ++i) { + if ((padded_shape1[i] == padded_shape2[i] || padded_shape1[i] == 1 || padded_shape2[i] == 1) && + (padded_shape1[i] == padded_shape3[i] || padded_shape1[i] == 1 || padded_shape3[i] == 1)) { + broadcast_shape[i] = std::max(std::max(padded_shape1[i], padded_shape2[i]), padded_shape3[i]); + } else { + return false; + } + } + + return true; +} + // check if the shape of tensor c is valid after broadcasting tensors a and b and also get the broadcasted shapes inline bool isValidBroadcastShape(infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t b, infiniopTensorDescriptor_t c, uint64_t broadcast_ndim) { @@ -224,7 +251,7 @@ inline infiniopTensorDescriptor_t dim_merge(infiniopTensorDescriptor_t desc, uin // split the dimension dim of a tensor descriptor into multiple dimensions inline infiniopTensorDescriptor_t dim_split(infiniopTensorDescriptor_t desc, uint64_t dim, const std::vector &dims) { uint64_t ndim = desc->ndim; - if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t)1, std::multiplies{})) { + if (desc->shape[dim] != std::accumulate(dims.begin(), dims.end(), (uint64_t) 1, std::multiplies{})) { return nullptr; } uint64_t new_ndim = ndim + dims.size() - 1; diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc new file mode 100644 index 00000000..11ef3a4b --- /dev/null +++ b/src/ops/where/cpu/where_cpu.cc @@ -0,0 +1,184 @@ +#include "where_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t, + WhereCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t condition) { + if (!isValidBroadcastShape(output, x) || !isValidBroadcastShape(output, y) || !isValidBroadcastShape(output, condition)) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t max_rank = std::max(std::max(x->ndim, y->ndim), condition->ndim); + if (output->ndim != max_rank) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t *valid_shape = new uint64_t[max_rank]; + uint64_t *x_padded_shape = new uint64_t[max_rank]; + uint64_t *y_padded_shape = new uint64_t[max_rank]; + uint64_t *condition_padded_shape = new uint64_t[max_rank]; + getBroadcastShape(x->shape, x->ndim, y->shape, y->ndim, condition->shape, condition->ndim, valid_shape, x_padded_shape, y_padded_shape, condition_padded_shape, max_rank); + for (uint64_t i = 0; i < output->ndim; i++) { + if (output->shape[i] != valid_shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + if (!is_contiguous(y) 
|| !is_contiguous(x)) { + return STATUS_BAD_TENSOR_STRIDES; + } + + if (output->dt != x->dt || output->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (condition->dt != U8) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t output_size = std::accumulate(output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + + uint64_t x_ndim = x->ndim; + uint64_t y_ndim = y->ndim; + uint64_t condition_ndim = condition->ndim; + + bool x_broadcast = false; + for (uint64_t i = 0; i < max_rank; i++) { + if (x_padded_shape[i] != valid_shape[i]) { + x_broadcast = true; + break; + } + } + + bool y_broadcast = false; + for (uint64_t i = 0; i < max_rank; i++) { + if (y_padded_shape[i] != valid_shape[i]) { + y_broadcast = true; + break; + } + } + + bool condition_broadcast = false; + for (uint64_t i = 0; i < max_rank; i++) { + if (condition_padded_shape[i] != valid_shape[i]) { + condition_broadcast = true; + break; + } + } + + uint64_t *x_shapes = new uint64_t[x_ndim]; + int64_t *x_strides = new int64_t[x_ndim]; + uint64_t *y_shapes = new uint64_t[y_ndim]; + int64_t *y_strides = new int64_t[y_ndim]; + uint64_t *condition_shapes = new uint64_t[condition_ndim]; + int64_t *condition_strides = new int64_t[condition_ndim]; + + if (x_broadcast) { + std::memcpy(x_shapes, x->shape, x_ndim * sizeof(uint64_t)); + std::memcpy(x_strides, x->strides, x_ndim * sizeof(int64_t)); + } + if (y_broadcast) { + std::memcpy(y_shapes, y->shape, y_ndim * sizeof(uint64_t)); + std::memcpy(y_strides, y->strides, y_ndim * sizeof(int64_t)); + } + if (condition_broadcast) { + std::memcpy(condition_shapes, condition->shape, condition_ndim * sizeof(uint64_t)); + std::memcpy(condition_strides, condition->strides, condition_ndim * sizeof(int64_t)); + } + + *desc_ptr = new WhereCpuDescriptor{ + DevCpu, + output->dt, + output_size, + x_broadcast, + x_ndim, + x_shapes, + x_strides, + y_broadcast, + y_ndim, + y_shapes, + y_strides, + condition_broadcast, + condition_ndim, + condition_shapes, + condition_strides, + }; + + delete[] valid_shape; + delete[] x_padded_shape; + delete[] y_padded_shape; + delete[] condition_padded_shape; + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc) { + delete desc->x_shapes; + delete desc->x_strides; + delete desc->y_shapes; + delete desc->y_strides; + delete desc->condition_shapes; + delete desc->condition_strides; + delete desc; + return STATUS_SUCCESS; +} + +inline uint64_t broadcast_map(uint64_t idx, uint64_t const *shapes, int64_t const *strides, uint64_t ndim) { + uint64_t index = 0; + for (uint64_t i = 0; i < ndim; i++) { + index += (idx / strides[i]) % shapes[i] * strides[i]; + } + return index; +} + +template +infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc, void *output, void const *x, void const *y, void const *condition) { + + auto x_ = reinterpret_cast(x); + auto x_ndim = desc->x_ndim; + auto x_shapes = desc->x_shapes; + auto x_strides = desc->x_strides; + auto y_ = reinterpret_cast(y); + auto y_ndim = desc->y_ndim; + auto y_shapes = desc->y_shapes; + auto y_strides = desc->y_strides; + auto condition_ = reinterpret_cast(condition); + auto condition_ndim = desc->condition_ndim; + auto condition_shapes = desc->condition_shapes; + auto condition_strides = desc->condition_strides; + auto output_ = reinterpret_cast(output); + auto output_size_ = desc->output_size; + + bool x_broadcast = desc->x_broadcast; + bool y_broadcast = desc->y_broadcast; + bool condition_broadcast = desc->condition_broadcast; + +#pragma omp parallel for 
+ for (uint64_t i = 0; i < output_size_; i++) { + if (condition_broadcast) { + uint64_t condition_idx = broadcast_map(i, condition_shapes, condition_strides, condition_ndim); + output_[i] = condition_[condition_idx] != 0 ? x_broadcast ? x_[broadcast_map(i, x_shapes, x_strides, x_ndim)] : x_[i] : y_broadcast ? y_[broadcast_map(i, y_shapes, y_strides, y_ndim)] + : y_[i]; + } else { + output_[i] = condition_[i] != 0 ? x_broadcast ? x_[broadcast_map(i, x_shapes, x_strides, x_ndim)] + : x_[i] + : y_broadcast ? y_[broadcast_map(i, y_shapes, y_strides, y_ndim)] + : y_[i]; + } + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, void *output, void const *x, void const *y, void const *condition) { + if (desc->dtype == F16) { + return where_cpu(desc, output, x, y, condition); + } + if (desc->dtype == F32) { + return where_cpu(desc, output, x, y, condition); + } + return STATUS_BAD_TENSOR_DTYPE; +} \ No newline at end of file diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h new file mode 100644 index 00000000..b1b084f2 --- /dev/null +++ b/src/ops/where/cpu/where_cpu.h @@ -0,0 +1,43 @@ +#ifndef __CPU_WHERE_H__ +#define __CPU_WHERE_H__ + +#include "operators.h" +#include +#include + +struct WhereCpuDescriptor { + Device device; + DT dtype; + uint64_t output_size; + bool x_broadcast; + uint64_t x_ndim; + uint64_t *x_shapes; + int64_t *x_strides; + bool y_broadcast; + uint64_t y_ndim; + uint64_t *y_shapes; + int64_t *y_strides; + bool condition_broadcast; + uint64_t condition_ndim; + uint64_t *condition_shapes; + int64_t *condition_strides; +}; + +typedef struct WhereCpuDescriptor *WhereCpuDescriptor_t; + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t, + WhereCpuDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t condition); + +infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc, + void *output, + void const *x, + void const *y, + void const *condition); + +infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc); + +#endif diff --git a/src/ops/where/cuda/where.cc b/src/ops/where/cuda/where.cc new file mode 100644 index 00000000..ce5fb119 --- /dev/null +++ b/src/ops/where/cuda/where.cc @@ -0,0 +1,125 @@ +#include "where.cuh" +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" + +infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t handle, + WhereCudaDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t condition) { + if (!isValidBroadcastShape(output, x) || !isValidBroadcastShape(output, y) || !isValidBroadcastShape(output, condition)) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t max_rank = std::max(std::max(x->ndim, y->ndim), condition->ndim); + if (output->ndim != max_rank) { + return STATUS_BAD_TENSOR_SHAPE; + } + + uint64_t *valid_shape = new uint64_t[max_rank]; + uint64_t *x_padded_shape = new uint64_t[max_rank]; + uint64_t *y_padded_shape = new uint64_t[max_rank]; + uint64_t *condition_padded_shape = new uint64_t[max_rank]; + getBroadcastShape(x->shape, x->ndim, y->shape, y->ndim, condition->shape, condition->ndim, valid_shape, x_padded_shape, y_padded_shape, condition_padded_shape, max_rank); + for (uint64_t i = 0; i < output->ndim; i++) { + if (output->shape[i] != valid_shape[i]) { + return STATUS_BAD_TENSOR_SHAPE; + } + } + + if (!is_contiguous(y) || !is_contiguous(x)) { + return 
STATUS_BAD_TENSOR_STRIDES; + } + + if (output->dt != x->dt || output->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + + if (condition->dt != U8) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t output_size = std::accumulate(output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + + uint64_t x_ndim = x->ndim; + uint64_t y_ndim = y->ndim; + uint64_t condition_ndim = condition->ndim; + + uint64_t *x_shapes, *y_shapes, *condition_shapes; + int64_t *x_strides, *y_strides, *condition_strides; + + checkCudaErrorWithCode(cudaMalloc((void**)&x_shapes, x_ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&y_shapes, y_ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&condition_shapes, condition_ndim * sizeof(uint64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&x_strides, x_ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&y_strides, y_ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + checkCudaErrorWithCode(cudaMalloc((void**)&condition_strides, condition_ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED); + + bool x_broadcast = false; + bool y_broadcast = false; + bool condition_broadcast = false; + + for (uint64_t i = 0; i < output->ndim; i++) { + if (x_padded_shape[i] != x->shape[i]) { + x_broadcast = true; + } + if (y_padded_shape[i] != y->shape[i]) { + y_broadcast = true; + } + if (condition_padded_shape[i] != condition->shape[i]) { + condition_broadcast = true; + } + } + + if(x_broadcast) { + checkCudaErrorWithCode(cudaMemcpy(x_shapes, x->shape, x_ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(x_strides, x->strides, x_ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + } + if(y_broadcast) { + checkCudaErrorWithCode(cudaMemcpy(y_shapes, y->shape, y_ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(y_strides, y->strides, y_ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + } + if(condition_broadcast) { + checkCudaErrorWithCode(cudaMemcpy(condition_shapes, condition->shape, condition_ndim * sizeof(uint64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaMemcpy(condition_strides, condition->strides, condition_ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED); + } + + *desc_ptr = new WhereCudaDescriptor{ + DevNvGpu, + output->dt, + handle->device_id, + output_size, + x_broadcast, + x_ndim, + x_shapes, + x_strides, + y_broadcast, + y_ndim, + y_shapes, + y_strides, + condition_broadcast, + condition_ndim, + condition_shapes, + condition_strides, + static_cast(handle->prop.maxGridSize[0]), + }; + + delete [] valid_shape; + delete [] x_padded_shape; + delete [] y_padded_shape; + delete [] condition_padded_shape; + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc) { + checkCudaErrorWithCode(cudaFree((void*)desc->x_shapes), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->x_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->y_shapes), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->y_strides), STATUS_EXECUTION_FAILED); + checkCudaErrorWithCode(cudaFree((void*)desc->condition_shapes), STATUS_EXECUTION_FAILED); + 
checkCudaErrorWithCode(cudaFree((void*)desc->condition_strides), STATUS_EXECUTION_FAILED); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/where/cuda/where.cu b/src/ops/where/cuda/where.cu new file mode 100644 index 00000000..8e9e454a --- /dev/null +++ b/src/ops/where/cuda/where.cu @@ -0,0 +1,104 @@ +#include "../../../devices/cuda/common_cuda.h" +#include "../../utils.h" +#include "status.h" +#include "where.cuh" + +__device__ inline uint64_t broadcast_map(uint64_t idx, uint64_t ndim, uint64_t const *shapes, int64_t const *strides) { + uint64_t index = 0; + for (uint64_t i = 0; i < ndim; i++) { + index += (idx / strides[i]) % shapes[i] * strides[i]; + } + return index; +} + +template +__global__ void where( + Tdata const *x, + bool x_broadcast, + uint64_t x_ndim, + uint64_t const *x_shapes, + int64_t const *x_strides, + bool y_broadcast, + Tdata const *y, + uint64_t y_ndim, + uint64_t const *y_shapes, + int64_t const *y_strides, + bool condition_broadcast, + uint8_t const *condition, + uint64_t condition_ndim, + uint64_t const *condition_shapes, + int64_t const *condition_strides, + Tdata *output, + uint64_t output_size, + uint64_t offset) { + const uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset; + if (idx >= output_size) return; + + if (condition_broadcast) { + uint64_t condition_idx = broadcast_map(idx, condition_ndim, condition_shapes, condition_strides); + if (condition[condition_idx] == 0) { + output[idx] = y_broadcast ? y[broadcast_map(idx, y_ndim, y_shapes, y_strides)] : y[idx]; + } else { + output[idx] = x_broadcast ? x[broadcast_map(idx, x_ndim, x_shapes, x_strides)] : x[idx]; + } + } else { + output[idx] = condition[idx] == 0 ? (y_broadcast ? y[broadcast_map(idx, y_ndim, y_shapes, y_strides)] : y[idx]) + : (x_broadcast ? 
x[broadcast_map(idx, x_ndim, x_shapes, x_strides)] : x[idx]); + } +} + +template +infiniopStatus_t where_nv_gpu( + WhereCudaDescriptor_t desc, + void const *x, + void const *y, + void const *condition, + void *output, + cudaStream_t stream) { + + const uint64_t output_size = desc->output_size; + const uint64_t max_grid_size = desc->max_grid_size; + + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto condition_ = reinterpret_cast(condition); + auto output_ = reinterpret_cast(output); + + bool x_broadcast = desc->x_broadcast; + bool y_broadcast = desc->y_broadcast; + bool condition_broadcast = desc->condition_broadcast; + + uint64_t x_ndim = desc->x_ndim; + uint64_t y_ndim = desc->y_ndim; + uint64_t condition_ndim = desc->condition_ndim; + + uint64_t const *x_shapes = reinterpret_cast(desc->x_shapes); + int64_t const *x_strides = reinterpret_cast(desc->x_strides); + uint64_t const *y_shapes = reinterpret_cast(desc->y_shapes); + int64_t const *y_strides = reinterpret_cast(desc->y_strides); + uint64_t const *condition_shapes = reinterpret_cast(desc->condition_shapes); + int64_t const *condition_strides = reinterpret_cast(desc->condition_strides); + + dim3 blockDims(std::min(static_cast(256), output_size)); + dim3 gridDims(std::min(ROUND_UP_DIV(output_size, blockDims.x), max_grid_size)); + uint64_t step = gridDims.x * blockDims.x; + +#pragma unroll + for (uint64_t offset = 0; offset < output_size; offset += step) { + where<<>>( + x_, x_broadcast, x_ndim, x_shapes, x_strides, y_broadcast, y_, y_ndim, y_shapes, y_strides, condition_broadcast, condition_, condition_ndim, condition_shapes, condition_strides, output_, output_size, offset); + } + + return STATUS_SUCCESS; +} + +infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc, void *output, const void *x, const void *y, const void *condition, void *stream) { + checkCudaError(cudaSetDevice(desc->device_id)); + if (desc->dtype == F16) { + return where_nv_gpu(desc, reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(condition), reinterpret_cast(output), reinterpret_cast(stream)); + } + if (desc->dtype == F32) { + return where_nv_gpu(desc, reinterpret_cast(x), reinterpret_cast(y), reinterpret_cast(condition), reinterpret_cast(output), reinterpret_cast(stream)); + } + return STATUS_SUCCESS; +} \ No newline at end of file diff --git a/src/ops/where/cuda/where.cuh b/src/ops/where/cuda/where.cuh new file mode 100644 index 00000000..fad1bcc3 --- /dev/null +++ b/src/ops/where/cuda/where.cuh @@ -0,0 +1,46 @@ +#ifndef __CUDA_WHERE_H__ +#define __CUDA_WHERE_H__ + +#include "../../../devices/cuda/common_cuda.h" +#include "../../../devices/cuda/cuda_handle.h" +#include "operators.h" +#include +#include + +struct WhereCudaDescriptor { + Device device; + DT dtype; + int device_id; + uint64_t output_size; + bool x_broadcast; + uint64_t x_ndim; + uint64_t const *x_shapes; + int64_t const *x_strides; + bool y_broadcast; + uint64_t y_ndim; + uint64_t const *y_shapes; + int64_t const *y_strides; + bool condition_broadcast; + uint64_t condition_ndim; + uint64_t const *condition_shapes; + int64_t const *condition_strides; + uint64_t max_grid_size; +}; + +typedef struct WhereCudaDescriptor *WhereCudaDescriptor_t; + +infiniopStatus_t cudaCreateWhereDescriptor(CudaHandle_t, + WhereCudaDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t condition); + +infiniopStatus_t cudaWhere(WhereCudaDescriptor_t desc, + void *output, + void const *x, void const *y, 
void const *condition, + void *stream); + +infiniopStatus_t cudaDestroyWhereDescriptor(WhereCudaDescriptor_t desc); + +#endif diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc new file mode 100644 index 00000000..5865f1fa --- /dev/null +++ b/src/ops/where/operator.cc @@ -0,0 +1,80 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/where/where.h" + +#ifdef ENABLE_CPU +#include "cpu/where_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +#include "../../devices/cuda/cuda_handle.h" +#include "cuda/where.cuh" +#endif + +__C infiniopStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y, + infiniopTensorDescriptor_t condition) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, output, x, y, condition); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateWhereDescriptor((CudaHandle_t) handle, (WhereCudaDescriptor_t *) desc_ptr, output, x, y, condition); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *output, + void const *x, + void const *y, + void const *condition, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuWhere((WhereCpuDescriptor_t) desc, output, x, y, condition); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaWhere((WhereCudaDescriptor_t) desc, output, x, y, condition, stream); + } + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyWhereDescriptor((WhereCudaDescriptor_t) desc); + } + +#endif + default: + return STATUS_BAD_DEVICE; + } + return STATUS_BAD_DEVICE; +}
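A minimal usage sketch of the Where API added above (illustrative only, not part of the patch): the handle, tensor descriptors, and device buffers are assumed to be created elsewhere, the helper's name is hypothetical, condition is expected to be U8, and x/y/output must share an F16 or F32 dtype, mirroring the checks in where_cpu.cc and where.cc.

#include "ops/where/where.h"

// Illustrative helper (hypothetical name): output = condition ? x : y, with
// broadcasting of x, y, and condition against the output shape.
infiniopStatus_t where_example(infiniopHandle_t handle,
                               infiniopTensorDescriptor_t out_desc,
                               infiniopTensorDescriptor_t x_desc,
                               infiniopTensorDescriptor_t y_desc,
                               infiniopTensorDescriptor_t cond_desc,
                               void *out, void const *x, void const *y,
                               void const *cond, void *stream) {
    infiniopWhereDescriptor_t desc;
    infiniopStatus_t status = infiniopCreateWhereDescriptor(handle, &desc, out_desc, x_desc, y_desc, cond_desc);
    if (status != STATUS_SUCCESS) {
        return status;
    }
    // Dispatches to cpuWhere or cudaWhere depending on the handle's device.
    status = infiniopWhere(desc, out, x, y, cond, stream);
    infiniopDestroyWhereDescriptor(desc);
    return status;
}

The other element-wise operators in this patch (Clip, Gather) follow the same create / execute / destroy pattern, with their operator-specific arguments (bounds, indices and axis) passed at descriptor-creation time.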