diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..0ebeafc4 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -3,8 +3,10 @@ #include "ops/attention/attention.h" #include "ops/avg_pool/avg_pool.h" #include "ops/causal_softmax/causal_softmax.h" +#include "ops/clip/clip.h" #include "ops/global_avg_pool/global_avg_pool.h" #include "ops/expand/expand.h" +#include "ops/gather/gather.h" #include "ops/gemm/gemm.h" #include "ops/conv/conv.h" #include "ops/matmul/matmul.h" @@ -12,8 +14,12 @@ #include "ops/mlp/mlp.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" +#include "ops/reduce_max/reduce_max.h" +#include "ops/reduce_mean/reduce_mean.h" +#include "ops/reduce_min/reduce_min.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" #include "ops/swiglu/swiglu.h" +#include "ops/where/where.h" #include "tensor/tensor_descriptor.h" diff --git a/include/ops/clip/clip.h b/include/ops/clip/clip.h new file mode 100644 index 00000000..180bb3ea --- /dev/null +++ b/include/ops/clip/clip.h @@ -0,0 +1,27 @@ +#ifndef CLIP_H +#define CLIP_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ClipDescriptor { + Device device; +} ClipDescriptor; + +typedef ClipDescriptor *infiniopClipDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_value, + float max_value); + +__C __export infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *output, + void const *input, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc); + +#endif diff --git a/include/ops/gather/gather.h b/include/ops/gather/gather.h new file mode 100644 index 00000000..7c5e4c38 --- /dev/null +++ b/include/ops/gather/gather.h @@ -0,0 +1,28 @@ +#ifndef GATHER_H +#define GATHER_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct GatherDescriptor { + Device device; +} GatherDescriptor; + +typedef GatherDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t data, + infiniopTensorDescriptor_t indices, + int64_t axis); + +__C __export infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, + void *output, + void const *data, + void const *indices, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_max/reduce_max.h b/include/ops/reduce_max/reduce_max.h new file mode 100644 index 00000000..6ef30477 --- /dev/null +++ b/include/ops/reduce_max/reduce_max.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMaxDescriptor { + Device device; +} ReduceMaxDescriptor; + +typedef ReduceMaxDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes); + +__C __export 
infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *reduced, void const *data, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_mean/reduce_mean.h b/include/ops/reduce_mean/reduce_mean.h new file mode 100644 index 00000000..5629687e --- /dev/null +++ b/include/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMeanDescriptor { + Device device; +} ReduceMeanDescriptor; + +typedef ReduceMeanDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes); + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *reduced, void const *data, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/include/ops/reduce_min/reduce_min.h b/include/ops/reduce_min/reduce_min.h new file mode 100644 index 00000000..4a1459d6 --- /dev/null +++ b/include/ops/reduce_min/reduce_min.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_MIN_H +#define REDUCE_MIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ReduceMinDescriptor { + Device device; +} ReduceMinDescriptor; + +typedef ReduceMinDescriptor *infiniopReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes); + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size); + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, + void *workspace, uint64_t workspace_size, + void *reduced, void const *data, void *stream); + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc); + +#endif diff --git a/include/ops/where/where.h b/include/ops/where/where.h new file mode 100644 index 00000000..0cec3c5e --- /dev/null +++ b/include/ops/where/where.h @@ -0,0 +1,29 @@ +#ifndef WHERE_H +#define WHERE_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct WhereDescriptor { + Device device; +} WhereDescriptor; + +typedef WhereDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y); + +__C __export infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *output, + void const *condition, + void const *x, + 
void const *y, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/operatorspy/tests/clip.py b/operatorspy/tests/clip.py new file mode 100644 index 00000000..b0ae8e5c --- /dev/null +++ b/operatorspy/tests/clip.py @@ -0,0 +1,160 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_float +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_ = auto() + + +class ClipDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopClipDescriptor_t = POINTER(ClipDescriptor) + +def clip(x, min_value, max_value): + return torch.clip(x, min=min_value, max=max_value) + + +def test( + lib, + handle, + torch_device, + o_shape, + i_shape, + min_value=None, + max_value=None, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Clip on {torch_device} with o_shape:{o_shape} i_shape:{i_shape} \ + min_value:{min_value} max_value:{max_value} \ + dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if o_shape != i_shape: + print("Unsupported test: unmatched shapes for input and output") + return + + input = torch.rand(i_shape, dtype=tensor_dtype).to(torch_device) + output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else input + + if min_value != None: + min_value = torch.tensor(min_value, dtype=tensor_dtype, device=torch_device) + else: + min_value = torch.tensor(float("-inf"), dtype=tensor_dtype, device=torch_device) + if max_value != None: + max_value = torch.tensor(max_value, dtype=tensor_dtype, device=torch_device) + else: + max_value = torch.tensor(float("inf"), dtype=tensor_dtype, device=torch_device) + + descriptor = infiniopClipDescriptor_t() + i_tensor = to_tensor(input, lib) + o_tensor = to_tensor(output, lib) if inplace == Inplace.OUT_OF_PLACE else (i_tensor) + + ans = clip(input, min_value, max_value) + + check_error( + lib.infiniopCreateClipDescriptor( + handle, + ctypes.byref(descriptor), + o_tensor.descriptor, + i_tensor.descriptor, + min_value.item(), + max_value.item(), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + o_tensor.descriptor.contents.invalidate() + i_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopClip(descriptor, + o_tensor.data, + i_tensor.data, + None) + ) + # print(f" min:{min_value}, max:{max_value}, input:{input}, ans:{ans}, output:{output},") + assert torch.allclose(output, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyClipDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for o_shape, i_shape, min_value, max_value, inplace in test_cases: + test(lib, handle, "cpu", o_shape, i_shape, min_value, max_value, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", o_shape, i_shape, min_value, max_value, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # o_shape, i_shape, min, max, inplace + ((1, 3), (1, 3), None, None, 
Inplace.OUT_OF_PLACE), + ((3, 3), (3, 3), 1.0, 2.0, Inplace.OUT_OF_PLACE), + ((), (), None, None, Inplace.OUT_OF_PLACE), + ((2, 20, 3), (2, 20, 3), 0., -2.0, Inplace.INPLACE_), + + ((32, 20, 512), (32, 20, 512), -0.3, 0.3, Inplace.INPLACE_), + ((32, 256, 112, 112), (32, 256, 112, 112), -0.1, 0.1, Inplace.OUT_OF_PLACE), + # ((32, 150, 5120), (32, 150, 5120), None, None, Inplace.OUT_OF_PLACE), + + ((2, 4, 3), (2, 1, 3), None, None, Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), -0.1, 0.1, Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), None, None, Inplace.OUT_OF_PLACE), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateClipDescriptor.restype = c_int32 + lib.infiniopCreateClipDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopClipDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + c_float, + ] + lib.infiniopClip.restype = c_int32 + lib.infiniopClip.argtypes = [ + infiniopClipDescriptor_t, + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyClipDescriptor.restype = c_int32 + lib.infiniopDestroyClipDescriptor.argtypes = [ + infiniopClipDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/gather.py b/operatorspy/tests/gather.py new file mode 100644 index 00000000..98d496e7 --- /dev/null +++ b/operatorspy/tests/gather.py @@ -0,0 +1,177 @@ +from ctypes import POINTER, Structure, c_int32, c_int64, c_void_p +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class GatherDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopGatherDescriptor_t = POINTER(GatherDescriptor) + + +def gather(data, indices, axis): + np_input = data.numpy() + np_indices = indices.numpy() + np_output = np.take(np_input, np_indices, axis=axis) + return torch.from_numpy(np_output) + +def gather(rank, axis, inputTensor, indexTensor): + indices = [slice(None)] * rank + indices[axis] = indexTensor + outTensor = inputTensor[tuple(indices)] + return outTensor + + +def test( + lib, + handle, + torch_device, + o_shape, + data_shape, + indices_shape, + axis=0, + tensor_dtype=torch.float16, +): + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + # if data_shape != indices_shape: + # print("Unsupported test: broadcasting does not support in-place") + # return + # q+r-1 + # if len(o_shape) != len(indices_shape) + len(data_shape) - 1: + # print("Unsupported test: output-shape is not valid") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + # indices = torch.randint(0, data.shape[axis], data_shape, device=torch_device).to(torch.int32) + indices = torch.randint(-data.shape[axis], data.shape[axis], data_shape, device=torch_device).to(torch.int32) + # ans = gather(data, indices, axis) + ans = gather(len(data_shape), axis, data, indices) + # output = torch.zeros(o_shape, 
dtype=tensor_dtype).to(torch_device) + output = torch.zeros(ans.shape, dtype=tensor_dtype, device=torch_device) + + print( + f"Testing Gather on {torch_device} with output_shape:{ans.shape} data_shape:{data_shape} indices_shape:{indices_shape} axis:{axis} dtype:{tensor_dtype}" + ) + # print(ans.shape, indices_shape, data_shape) + + descriptor = infiniopGatherDescriptor_t() + data_tensor = to_tensor(data, lib) + indices_tensor = to_tensor(indices, lib) + o_tensor = to_tensor(output, lib) + + check_error( + lib.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + o_tensor.descriptor, + data_tensor.descriptor, + indices_tensor.descriptor, + c_int64(axis), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + o_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + indices_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopGather(descriptor, + o_tensor.data, + data_tensor.data, + indices_tensor.data, + None) + ) + assert torch.allclose(output, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyGatherDescriptor(descriptor)) + + +def get_o_shape(input_shape, indices_shape, axis): + output_shape = input_shape[:axis] + tuple(indices_shape) + input_shape[axis + 1:] + return output_shape + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, indices_shape, axis, _ in test_cases: + o_shape = get_o_shape(data_shape, indices_shape, axis) + test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float16) + test(lib, handle, "cpu", o_shape, data_shape, indices_shape, axis, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # q+r-1 + # data_shape, indices_shape, axis, inplace + # ((), (), (), Inplace.OUT_OF_PLACE), + ((3, 3), (2, ), 0, Inplace.OUT_OF_PLACE), + ((32, 64), (10, 4), 1, Inplace.OUT_OF_PLACE), + ((4, 3, 4), (2, 2), 1, Inplace.OUT_OF_PLACE), + ((32, 20, 64), (2, 3), 2, Inplace.OUT_OF_PLACE), + ((1, 3, 4, 5), (1, 2), 3, Inplace.OUT_OF_PLACE), + # ((32, 64, 16, 8), (12, 2), 0, Inplace.OUT_OF_PLACE), + #Cannot allocate memory + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopGatherDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int64, + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = [ + infiniopGatherDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [ + infiniopGatherDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_max.py b/operatorspy/tests/reduce_max.py new file mode 100644 index 00000000..50e43def --- /dev/null +++ b/operatorspy/tests/reduce_max.py @@ -0,0 +1,230 @@ +from ctypes import POINTER, Structure, c_void_p, c_int32, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class ReduceMaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMaxDescriptor_t = POINTER(ReduceMaxDescriptor) + + +def reduce_max(input, axis, keepdims=True, noop_with_empty_axes=False): + if axis == (): + if noop_with_empty_axes: + return input + else: + return torch.amax(input, dim=axis, keepdim=keepdims) + return torch.amax(input, dim=axis, keepdim=keepdims) + +# convert a python tuple to a ctype void pointer +def tuple_to_int64_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int64)) + +def test( + lib, + handle, + torch_device, + data_shape, + axes, + keepdims=True, + noop_with_empty_axes=False, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMax on {torch_device} with data_shape:{data_shape} axes:{axes} keepdims:{keepdims} noop_with_empty_axes:{noop_with_empty_axes} dtype:{tensor_dtype}" + ) + + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_max(data, axes, keepdims, noop_with_empty_axes) + reduced = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + # print(f"in pytorch, data.size:{data.size()}, reduced.size:{reduced.size()}") + + for _ in range(NUM_PRERUN if PROFILE else 0): + ans = reduce_max(data, axes, keepdims, noop_with_empty_axes) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_max(data, axes, keepdims, noop_with_empty_axes) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + reduced_tensor = to_tensor(reduced, lib) + data_tensor = to_tensor(data, lib) + descriptor = infiniopReduceMaxDescriptor_t() + + check_error( + lib.infiniopCreateReduceMaxDescriptor( + handle, + ctypes.byref(descriptor), + reduced_tensor.descriptor, + data_tensor.descriptor, + tuple_to_int64_p(axes), + c_uint64(len(axes)), + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + reduced_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMaxWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + # print("!!") + + for _ in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMax( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + if PROFILE: + start_time = time.time() + for 
i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMax( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + # print(f" reduced: {reduced}\n ans: {ans}") + if tensor_dtype == torch.float16: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + elif tensor_dtype == torch.float32: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-5) + check_error(lib.infiniopDestroyReduceMaxDescriptor(descriptor)) + # print("!!!!") + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, axes, keepdims, noop_with_empty_axes in test_cases: + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float16) + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # data_shape, axes, keepdims, noop_with_empty_axes + + ((2, 3, 2, 5), (1, 3), True, False), + ((2, 3, 2, 5), (-2, -1), False, False), + # ((2, 3, 2, 5), (-3, -2), True, False), #Floating point exception + # ((3, 2, 5, 4), (-4, -3, -2, -1), True, False), + + ((1, 3), (), True, True), + ((1, 3), (), False, True), + + ((2, 3), (0,), True, False), + ((32, 20, 512), (2,), True, False), + ((3, 2, 5, 4), (0, 1, 2, 3), True, False), + ((32, 56, 112, 112), (0, 1, 2), True, False), + + ((2, 3, 5), (0,), False, False), + ((32, 20, 512), (1, 2), False, False), + ((3, 2, 5, 4), (0, 1, 2, 3), False, False), + ((64, 64, 64, 64), (0, 2, 3), False, False), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int64), + c_uint64, + c_bool, + c_bool, + ] + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopReduceMaxDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopReduceMaxDescriptor_t, + c_void_p, + c_uint64, + # + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ + infiniopReduceMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_mean.py b/operatorspy/tests/reduce_mean.py new file mode 100644 index 00000000..8e135522 --- /dev/null +++ b/operatorspy/tests/reduce_mean.py @@ -0,0 +1,230 @@ +from ctypes import POINTER, Structure, c_void_p, c_int32, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import 
numpy as np + +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class ReduceMeanDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMeanDescriptor_t = POINTER(ReduceMeanDescriptor) + + +def reduce_mean(input, axis, keepdims=True, noop_with_empty_axes=False): + if axis == (): + if noop_with_empty_axes: + return input + else: + return torch.mean(input, dim=axis, keepdim=keepdims) + return torch.mean(input, dim=axis, keepdim=keepdims) + +# convert a python tuple to a ctype void pointer +def tuple_to_int64_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int64)) + +def test( + lib, + handle, + torch_device, + data_shape, + axes, + keepdims=True, + noop_with_empty_axes=False, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMean on {torch_device} with data_shape:{data_shape} axes:{axes} keepdims:{keepdims} noop_with_empty_axes:{noop_with_empty_axes} dtype:{tensor_dtype}" + ) + + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_mean(data, axes, keepdims, noop_with_empty_axes) + reduced = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + # print(f"in pytorch, data.size:{data.size()}, reduced.size:{reduced.size()}") + + for _ in range(NUM_PRERUN if PROFILE else 0): + ans = reduce_mean(data, axes, keepdims, noop_with_empty_axes) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_mean(data, axes, keepdims, noop_with_empty_axes) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + reduced_tensor = to_tensor(reduced, lib) + data_tensor = to_tensor(data, lib) + descriptor = infiniopReduceMeanDescriptor_t() + + check_error( + lib.infiniopCreateReduceMeanDescriptor( + handle, + ctypes.byref(descriptor), + reduced_tensor.descriptor, + data_tensor.descriptor, + tuple_to_int64_p(axes), + c_uint64(len(axes)), + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + reduced_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMeanWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + # print("!!") + + for _ in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMean( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMean( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if tensor_dtype == torch.float16: + assert 
torch.allclose(reduced, ans, atol=0, rtol=1e-3) + elif tensor_dtype == torch.float32: + # assert torch.allclose(reduced, ans, atol=0, rtol=1e-5) + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroyReduceMeanDescriptor(descriptor)) + # print("!!!!") + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for data_shape, axes, keepdims, noop_with_empty_axes in test_cases: + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float16) + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # data_shape, axes, keepdims, noop_with_empty_axes + + ((2, 3, 2, 5), (1, 3), True, False), + ((2, 3, 2, 5), (-2, -1), False, False), + # ((3, 2, 5, 4), (-3, -2, -1), True, False), #Floating point exception + # ((3, 2, 5, 4), (-4, -3, -2, -1), True, False), #AssertionError + + ((1, 3), (), True, True), + ((1, 3), (), False, True), + + ((2, 3), (0,), True, False), + ((32, 20, 512), (2,), True, False), + ((3, 2, 5, 4), (0, 1, 2, 3), True, False), + ((32, 56, 112, 112), (0, 1, 2), True, False), + + ((2, 3, 5), (0,), False, False), + ((32, 20, 512), (1, 2), False, False), + ((3, 2, 5, 4), (0, 1, 2, 3), False, False), + ((64, 64, 64, 64), (0, 2, 3), False, False), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMeanDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int64), + c_uint64, + c_bool, + c_bool, + ] + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopReduceMeanDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopReduceMeanDescriptor_t, + c_void_p, + c_uint64, + # + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopReduceMeanDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/reduce_min.py b/operatorspy/tests/reduce_min.py new file mode 100644 index 00000000..a7c08a11 --- /dev/null +++ b/operatorspy/tests/reduce_min.py @@ -0,0 +1,228 @@ +from ctypes import POINTER, Structure, c_void_p, c_int32, c_uint64, c_bool +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch +import numpy as np + +from typing import Tuple + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function, +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 
+NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + + +class ReduceMinDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopReduceMinDescriptor_t = POINTER(ReduceMinDescriptor) + + +def reduce_min(input, axis, keepdims=True, noop_with_empty_axes=False): + if axis == (): + if noop_with_empty_axes: + return input + else: + return torch.amin(input, dim=axis, keepdim=keepdims) + return torch.amin(input, dim=axis, keepdim=keepdims) + +# convert a python tuple to a ctype void pointer +def tuple_to_int64_p(py_tuple: Tuple): + array = ctypes.c_int64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.POINTER(ctypes.c_int64)) + +def test( + lib, + handle, + torch_device, + data_shape, + axes, + keepdims=True, + noop_with_empty_axes=False, + tensor_dtype=torch.float16, +): + print( + f"Testing ReduceMin on {torch_device} with data_shape:{data_shape} axes:{axes} keepdims:{keepdims} noop_with_empty_axes:{noop_with_empty_axes} dtype:{tensor_dtype}" + ) + + # if inplace != Inplace.OUT_OF_PLACE: + # print("Unsupported test: calculatin in-place is not supported") + # return + + data = torch.rand(data_shape, dtype=tensor_dtype).to(torch_device) + ans = reduce_min(data, axes, keepdims, noop_with_empty_axes) + reduced = torch.zeros(ans.shape, dtype=tensor_dtype).to(torch_device) + # print(f"in pytorch, data.size:{data.size()}, reduced.size:{reduced.size()}") + + for _ in range(NUM_PRERUN if PROFILE else 0): + ans = reduce_min(data, axes, keepdims, noop_with_empty_axes) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = reduce_min(data, axes, keepdims, noop_with_empty_axes) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + reduced_tensor = to_tensor(reduced, lib) + data_tensor = to_tensor(data, lib) + descriptor = infiniopReduceMinDescriptor_t() + + check_error( + lib.infiniopCreateReduceMinDescriptor( + handle, + ctypes.byref(descriptor), + reduced_tensor.descriptor, + data_tensor.descriptor, + tuple_to_int64_p(axes), + c_uint64(len(axes)), + c_bool(keepdims), + c_bool(noop_with_empty_axes), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + reduced_tensor.descriptor.contents.invalidate() + data_tensor.descriptor.contents.invalidate() + + workspaceSize = ctypes.c_uint64(0) + check_error( + lib.infiniopGetReduceMinWorkspaceSize(descriptor, ctypes.byref(workspaceSize)) + ) + workspace = torch.zeros(int(workspaceSize.value), dtype=torch.uint8).to(torch_device) + workspace_ptr = ctypes.cast(workspace.data_ptr(), ctypes.POINTER(ctypes.c_uint8)) + # print("!!") + + for _ in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopReduceMin( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopReduceMin( + descriptor, + workspace_ptr, + workspaceSize, + reduced_tensor.data, + data_tensor.data, + None) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + if tensor_dtype == torch.float16: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-3) + elif tensor_dtype == torch.float32: + assert torch.allclose(reduced, ans, atol=0, rtol=1e-5) + check_error(lib.infiniopDestroyReduceMinDescriptor(descriptor)) + # print("!!!!") + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU 
+ handle = create_handle(lib, device) + for data_shape, axes, keepdims, noop_with_empty_axes in test_cases: + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float16) + test(lib, handle, "cpu", data_shape, axes, keepdims, noop_with_empty_axes, tensor_dtype=torch.float32) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # data_shape, axes, keepdims, noop_with_empty_axes + + ((2, 3, 2, 5), (1, 3), True, False), + ((2, 3, 2, 5), (-2, -1), False, False), + ((3, 2, 5, 4), (-3, -2, -1), True, False), + + ((1, 3), (), True, True), + ((1, 3), (), False, True), + + ((2, 3), (0,), True, False), + ((32, 20, 512), (2,), True, False), + ((3, 2, 5, 4), (0, 1, 2, 3), True, False), + ((32, 56, 112, 112), (0, 1, 2), True, False), + + ((2, 3, 5), (0,), False, False), + ((32, 20, 512), (1, 2), False, False), + ((3, 2, 5, 4), (0, 1, 2, 3), False, False), + ((64, 64, 64, 64), (0, 2, 3), False, False), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateReduceMinDescriptor.restype = c_int32 + lib.infiniopCreateReduceMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopReduceMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + POINTER(ctypes.c_int64), + c_uint64, + c_bool, + c_bool, + ] + lib.infiniopGetReduceMinWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMinWorkspaceSize.argtypes = [ + infiniopReduceMinDescriptor_t, + POINTER(c_uint64), + ] + lib.infiniopReduceMin.restype = c_int32 + lib.infiniopReduceMin.argtypes = [ + infiniopReduceMinDescriptor_t, + c_void_p, + c_uint64, + # + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyReduceMinDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMinDescriptor.argtypes = [ + infiniopReduceMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + # if args.cuda: + # test_cuda(lib, test_cases) + # if args.bang: + # test_bang(lib, test_cases) + # if args.musa: + # test_musa(lib, test_cases) + if not (args.cpu or args.cuda or args.bang or args.musa): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/where.py b/operatorspy/tests/where.py new file mode 100644 index 00000000..12c849c0 --- /dev/null +++ b/operatorspy/tests/where.py @@ -0,0 +1,166 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p, c_float +import ctypes +import sys +import os + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_X = auto() + INPLACE_Y = auto() + + +class WhereDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopWhereDescriptor_t = POINTER(WhereDescriptor) + +def where(condition, x, y): + return torch.where(condition, x, y) + + +def test( + lib, + handle, + torch_device, + o_shape, + condition_shape, + x_shape, + y_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Where on {torch_device} with o_shape:{o_shape} x_shape:{x_shape} y_shape:{y_shape}\ + dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if x_shape != y_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + x = 
torch.rand(x_shape, dtype=tensor_dtype).to(torch_device) + y = torch.rand(y_shape, dtype=tensor_dtype).to(torch_device) + output = torch.rand(o_shape, dtype=tensor_dtype).to(torch_device) \ + if inplace == Inplace.OUT_OF_PLACE else (x if inplace == Inplace.INPLACE_X else y) + condition_ = torch.randint(0, 2, condition_shape, dtype=torch.bool).to(torch_device) + condition = condition_.to(dtype=torch.uint8).to(torch_device) + + descriptor = infiniopWhereDescriptor_t() + x_tensor = to_tensor(x, lib) + y_tensor = to_tensor(y, lib) + o_tensor = to_tensor(output, lib) \ + if inplace == Inplace.OUT_OF_PLACE else (x_tensor if inplace == Inplace.INPLACE_X else y_tensor) + condition_tensor = to_tensor(condition, lib) + + ans = where(condition_, x, y) + + check_error( + lib.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + o_tensor.descriptor, + condition_tensor.descriptor, + x_tensor.descriptor, + y_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + o_tensor.descriptor.contents.invalidate() + condition_tensor.descriptor.contents.invalidate() + x_tensor.descriptor.contents.invalidate() + y_tensor.descriptor.contents.invalidate() + + check_error( + lib.infiniopWhere(descriptor, + o_tensor.data, + condition_tensor.data, + x_tensor.data, + y_tensor.data, + None) + ) + + # print(f"--ans:{ans}, output:{output}") + assert torch.allclose(output, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyWhereDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for o_shape, condition_shape, x_shape, y_shape, inplace in test_cases: + test(lib, handle, "cpu", o_shape, condition_shape, x_shape, y_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", o_shape, condition_shape, x_shape, y_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # o_shape, condition_shape, x_shape, y_shape, inplace + ((1, 3), (1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((3, 3), (3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), + ((), (), (), (), Inplace.OUT_OF_PLACE), + ((2, 20, 3), (2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), + + ((32, 20, 512), (32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_X), + ((32, 20, 512), (32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_Y), + ((32, 150, 512), (32, 150, 512), (32, 150, 512), (32, 150, 1), Inplace.OUT_OF_PLACE), + ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + + ((2, 4, 3), (2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ] + + args = get_args() + lib = open_lib() + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopWhereDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopWhereDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + # + c_void_p, + ] + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopWhereDescriptor_t, + ] + + if 
args.cpu:
+        test_cpu(lib, test_cases)
+    # if args.cuda:
+    #     test_cuda(lib, test_cases)
+    # if args.bang:
+    #     test_bang(lib, test_cases)
+    if not (args.cpu or args.cuda or args.bang):
+        test_cpu(lib, test_cases)
+    print("\033[92mTest passed!\033[0m")
diff --git a/src/ops/clip/cpu/clip_cpu.cc b/src/ops/clip/cpu/clip_cpu.cc
new file mode 100644
index 00000000..12d86f3c
--- /dev/null
+++ b/src/ops/clip/cpu/clip_cpu.cc
@@ -0,0 +1,110 @@
+#include "clip_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+#include <cmath>
+
+inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) {
+    for (int64_t i = ndim - 1; i >= 0; --i) {
+        if (++indices[i] != shape[i]) {
+            return;
+        }
+        indices[i] = 0;
+    }
+}
+
+inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) {
+    return std::inner_product(indices, indices + ndim, strides, uint64_t(0));
+}
+
+infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t,
+                                         ClipCpuDescriptor_t *desc_ptr,
+                                         infiniopTensorDescriptor_t output,
+                                         infiniopTensorDescriptor_t input,
+                                         float min_value = std::numeric_limits<float>::lowest(),
+                                         float max_value = std::numeric_limits<float>::max()) {
+    uint64_t ndim = output->ndim;
+
+    if (!is_contiguous(output) || !is_contiguous(input)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (output->dt != F16 && output->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (output->dt != input->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    uint64_t o_data_size = std::accumulate(
+        output->shape, output->shape + output->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    uint64_t *i_strides = new uint64_t[ndim];
+    std::copy(input->strides, input->strides + ndim, i_strides);
+
+    uint64_t *o_indices = new uint64_t[ndim];
+    std::fill(o_indices, o_indices + ndim, 0);
+
+    uint64_t *o_shape = new uint64_t[ndim];
+    std::copy(output->shape, output->shape + ndim, o_shape);
+
+    *desc_ptr = new ClipCpuDescriptor{
+        DevCpu,
+        output->dt,
+        ndim,
+        o_data_size,
+        o_shape,
+        i_strides,
+        o_indices,
+        min_value,
+        max_value,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc) {
+    delete[] desc->o_shape;
+    delete[] desc->i_strides;
+    delete[] desc->o_indices;
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+infiniopStatus_t clip_cpu(ClipCpuDescriptor_t desc,
+                          void *output,
+                          void const *input) {
+    auto i_ = reinterpret_cast<Tdata const *>(input);
+    auto o_ = reinterpret_cast<Tdata *>(output);
+    const auto &indices = desc->o_indices;
+
+    float min_value = desc->min_value;
+    float max_value = desc->max_value;
+    for (uint64_t i = 0; i < desc->o_data_size; ++i, incrementOne(indices, desc->o_shape, desc->ndim)) {
+        auto i_index = compactToFlat(indices, desc->i_strides, desc->ndim);
+        if constexpr (std::is_same<Tdata, uint16_t>::value) {
+            o_[i] = f32_to_f16(std::fmin(
+                max_value,
+                std::fmax(f16_to_f32(i_[i_index]), min_value)));
+        } else {
+            o_[i] = std::fmin(
+                max_value,
+                std::fmax(i_[i_index], min_value));
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc,
+                         void *output,
+                         void const *input,
+                         // float min_value = std::numeric_limits<float>::lowest(),
+                         // float max_value = std::numeric_limits<float>::max(),
+                         void *stream = nullptr) {
+    if (desc->dtype == F16) {
+        return clip_cpu<uint16_t>(desc, output, input);
+    }
+    if (desc->dtype == F32) {
+        return clip_cpu<float>(desc, output, input);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/clip/cpu/clip_cpu.h b/src/ops/clip/cpu/clip_cpu.h
new file mode 100644
index 00000000..c9c45add
--- /dev/null
+++ 
b/src/ops/clip/cpu/clip_cpu.h @@ -0,0 +1,38 @@ +#ifndef __CPU_CLIP_H__ +#define __CPU_CLIP_H__ + +#include "operators.h" +#include +#include + +struct ClipCpuDescriptor { + Device device; + DT dtype; + uint64_t ndim; + uint64_t o_data_size; + uint64_t const *o_shape; + uint64_t const *i_strides; + uint64_t *o_indices; + float min_value; + float max_value; +}; + +typedef struct ClipCpuDescriptor *ClipCpuDescriptor_t; + +infiniopStatus_t cpuCreateClipDescriptor(infiniopHandle_t, + ClipCpuDescriptor_t *, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min_value, + float max_value); + +infiniopStatus_t cpuClip(ClipCpuDescriptor_t desc, + void *output, + void const *input, + // float min_value, + // float max_value, + void *stream); + +infiniopStatus_t cpuDestroyClipDescriptor(ClipCpuDescriptor_t desc); + +#endif diff --git a/src/ops/clip/operator.cc b/src/ops/clip/operator.cc new file mode 100644 index 00000000..18a00169 --- /dev/null +++ b/src/ops/clip/operator.cc @@ -0,0 +1,85 @@ +#include "../utils.h" +#include "operators.h" +#include "ops/clip/clip.h" + +#ifdef ENABLE_CPU +#include "cpu/clip_cpu.h" +#endif +#ifdef ENABLE_NV_GPU +// #include "../../devices/cuda/cuda_handle.h" +// #include "cuda/clip.cuh" +#endif +#ifdef ENABLE_MTHREADS_GPU +// #include "musa/clip_musa.h" +#endif + +__C infiniopStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle, + infiniopClipDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float min, + float max) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateClipDescriptor(handle, (ClipCpuDescriptor_t *) desc_ptr, output, input, min, max); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopClip(infiniopClipDescriptor_t desc, + void *output, + void const *input, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuClip((ClipCpuDescriptor_t) desc, output, input, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyClipDescriptor((ClipCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/gather/cpu/gather_cpu.cc b/src/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 00000000..2a609999 --- /dev/null +++ b/src/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,174 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" + +// inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { +// for (int64_t i = ndim - 1; i >= 0; --i) { +// if (++indices[i] != shape[i]) { +// return; +// } +// indices[i] = 0; +// } +// } + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t 
cpuCreateGatherDescriptor(infiniopHandle_t,
+                                           GatherCpuDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t output,
+                                           infiniopTensorDescriptor_t data,
+                                           infiniopTensorDescriptor_t indices,
+                                           int64_t axis = 0) {
+    // printf("\tcpuCreateGatherDescriptor\n");
+    uint64_t indices_ndim = indices->ndim;// rank q
+    uint64_t data_ndim = data->ndim;      // rank r
+    uint64_t ndim = output->ndim;         // q+r-1
+
+    if (!is_contiguous(data) || !is_contiguous(indices) || !is_contiguous(output)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (output->dt != F16 && output->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (output->dt != data->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (indices->dt != I32 && indices->dt != I64) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    // q+r-1
+    if (ndim != indices_ndim + data_ndim - 1) {
+        printf("\tseems wrong shape\n");
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    // [-r, r-1]
+    // 0 < -2 || 0 > 1
+    if (axis < (int64_t)(-data_ndim) || axis > (int64_t)(data_ndim - 1)) {
+        // axis:0, data_ndim:2
+        printf("\tseems wrong dim, axis:%ld, data_ndim:%ld\n", axis, data_ndim);
+        if (axis > data_ndim - 1) {
+            printf("\t%ld > %ld\n", axis, data_ndim - 1);
+        }
+        if (axis < 0-data_ndim) {
+            printf("\t%ld < %ld\n", axis, -data_ndim);
+        }
+        return STATUS_BAD_PARAM;
+    }
+    axis = (axis + data_ndim) % data_ndim;
+    // printf("\tcheck over\n");
+
+    // uint64_t o_data_size = std::accumulate(output->shape, output->shape + output->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    // uint64_t *o_shape = new uint64_t[ndim];
+    // std::copy(output->shape, output->shape + ndim, o_shape);
+
+    uint64_t *data_shape = new uint64_t[data->ndim];
+    memcpy(data_shape, data->shape, data->ndim * sizeof(uint64_t));
+
+    uint64_t *indices_shape = new uint64_t[indices->ndim];
+    memcpy(indices_shape, indices->shape, indices->ndim * sizeof(uint64_t));
+
+    uint64_t *data_strides = new uint64_t[data->ndim];
+    memcpy(data_strides, data->strides, data->ndim * sizeof(uint64_t));
+
+    uint64_t *indices_strides = new uint64_t[indices->ndim];
+    memcpy(indices_strides, indices->strides, indices->ndim * sizeof(uint64_t));
+
+    // uint64_t *o_indices = new uint64_t[ndim];
+    // std::fill(o_indices, o_indices + ndim, 0);
+
+    *desc_ptr = new GatherCpuDescriptor{
+        DevCpu,
+        output->dt,
+        indices->dt,
+        data_ndim,
+        indices_ndim,
+        // o_data_size,
+        // o_shape,
+        data_shape,
+        indices_shape,
+        data_strides,
+        indices_strides,
+        // o_indices,
+        axis,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc) {
+    // delete[] desc->o_shape;
+    delete[] desc->data_shape;
+    delete[] desc->indices_shape;
+    delete[] desc->data_strides;
+    delete[] desc->indices_strides;
+    // delete[] desc->o_indices;
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata, typename Tindices>
+infiniopStatus_t gather_cpu(GatherCpuDescriptor_t desc,
+                            void *output,
+                            void const *data,
+                            void const *indices) {
+    // printf("\tgather_cpu\n");
+    auto data_ = reinterpret_cast<Tdata const *>(data);
+    auto output_ = reinterpret_cast<Tdata *>(output);
+    auto indices_ = reinterpret_cast<Tindices const *>(indices);
+    const uint64_t axis = desc->axis;
+
+    uint64_t outer_ = 1;
+    for (uint64_t i = 0; i < axis; ++i) {
+        outer_ *= desc->data_shape[i];
+    }
+    uint64_t indices_element_count = 1;
+    for (uint64_t i = 0; i < desc->indices_ndim; ++i) {
+        indices_element_count *= desc->indices_shape[i];
+    }
+    uint64_t cpy_len = 1;
+    for (uint64_t i = axis + 1; i < desc->data_ndim; ++i) {
+        cpy_len *= desc->data_shape[i];
+    }
+
+    for (uint64_t outer = 0; outer < outer_; ++outer) {
+        for (uint64_t idx = 0; idx < indices_element_count; ++idx) {
+            Tindices index_val = indices_[idx];
+            // bounds [-s, s-1]
+            index_val = (index_val + desc->data_shape[axis]) % desc->data_shape[axis];
+
+            const uint64_t data_offset = outer * desc->data_shape[axis] * cpy_len +
+                                         index_val * cpy_len;
+            const uint64_t output_offset = outer * indices_element_count * cpy_len +
+                                           idx * cpy_len;
+            memcpy(
+                output_ + output_offset,
+                data_ + data_offset,
+                sizeof(Tdata) * cpy_len);
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc,
+                           void *output,
+                           void const *data,
+                           void const *indices,
+                           void *stream) {
+    if (desc->dtype == F16 && desc->indices_dtype == I32) {
+        return gather_cpu<uint16_t, int32_t>(desc, output, data, indices);
+    }
+    if (desc->dtype == F32 && desc->indices_dtype == I32) {
+        return gather_cpu<float, int32_t>(desc, output, data, indices);
+    }
+    if (desc->dtype == F16 && desc->indices_dtype == I64) {
+        return gather_cpu<uint16_t, int64_t>(desc, output, data, indices);
+    }
+    if (desc->dtype == F32 && desc->indices_dtype == I64) {
+        return gather_cpu<float, int64_t>(desc, output, data, indices);
+    }
+
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/gather/cpu/gather_cpu.h b/src/ops/gather/cpu/gather_cpu.h
new file mode 100644
index 00000000..7bb5d346
--- /dev/null
+++ b/src/ops/gather/cpu/gather_cpu.h
@@ -0,0 +1,41 @@
+#ifndef __CPU_GATHER_H__
+#define __CPU_GATHER_H__
+
+#include "operators.h"
+#include <cstring>
+#include <numeric>
+
+struct GatherCpuDescriptor {
+    Device device;
+    DT dtype;
+    DT indices_dtype;
+    uint64_t data_ndim;
+    uint64_t indices_ndim;
+    // uint64_t o_data_size;
+    // uint64_t const *o_shape;
+    uint64_t const *data_shape;
+    uint64_t const *indices_shape;
+    uint64_t const *data_strides;
+    uint64_t const *indices_strides;
+    // uint64_t *o_indices;
+    int64_t axis;
+};
+
+typedef struct GatherCpuDescriptor *GatherCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateGatherDescriptor(infiniopHandle_t,
+                                           GatherCpuDescriptor_t *,
+                                           infiniopTensorDescriptor_t output,
+                                           infiniopTensorDescriptor_t data,
+                                           infiniopTensorDescriptor_t indices,
+                                           int64_t axis);
+
+infiniopStatus_t cpuGather(GatherCpuDescriptor_t desc,
+                           void *output,
+                           void const *data,
+                           void const *indices,
+                           void *stream);
+
+infiniopStatus_t cpuDestroyGatherDescriptor(GatherCpuDescriptor_t desc);
+
+#endif
diff --git a/src/ops/gather/operator.cc b/src/ops/gather/operator.cc
new file mode 100644
index 00000000..6b77a3d3
--- /dev/null
+++ b/src/ops/gather/operator.cc
@@ -0,0 +1,80 @@
+#include "../utils.h"
+#include "operators.h"
+#include "ops/gather/gather.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/gather_cpu.h"
+#endif
+#ifdef ENABLE_NV_GPU
+#endif
+#ifdef ENABLE_MTHREADS_GPU
+#endif
+
+__C infiniopStatus_t infiniopCreateGatherDescriptor(infiniopHandle_t handle,
+                                                    infiniopGatherDescriptor_t *desc_ptr,
+                                                    infiniopTensorDescriptor_t output,
+                                                    infiniopTensorDescriptor_t data,
+                                                    infiniopTensorDescriptor_t indices,
+                                                    int64_t axis) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateGatherDescriptor(handle, (GatherCpuDescriptor_t *) desc_ptr, output, data, indices, axis);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu: {
+        }
+#endif
+#ifdef ENABLE_CAMBRICON_MLU
+        // TODO
+#endif
+#ifdef ENABLE_MTHREADS_GPU
+        case DevMthreadsGpu: {
+        }
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopGather(infiniopGatherDescriptor_t desc, void *output, void const *data, void const *indices, void *stream) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuGather((GatherCpuDescriptor_t) desc, output, data, indices, stream);
+#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyGatherDescriptor((GatherCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/cpu/reduce_cpu.cc b/src/ops/reduce/cpu/reduce_cpu.cc new file mode 100644 index 00000000..eeaae862 --- /dev/null +++ b/src/ops/reduce/cpu/reduce_cpu.cc @@ -0,0 +1,437 @@ +#include "reduce_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle, + ReduceCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type) { + if (!is_contiguous(reduced) || !is_contiguous(data)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (reduce_type > 2){ + return STATUS_BAD_PARAM; + } + if (reduced->dt != data->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (reduced->dt != F16 && reduced->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t data_ndim = data->ndim; + uint64_t reduced_ndim = data_ndim - axes_ndim; + + uint64_t data_size = std::accumulate(data->shape, data->shape + data->ndim, 1ULL, std::multiplies()); + // + uint64_t axes_size = 1; + uint64_t reduced_size = 1; + uint64_t reduced_size_ = std::accumulate(reduced->shape, reduced->shape + reduced->ndim, 1ULL, std::multiplies()); + + int64_t *reduced_axes; + int64_t *axes_axes; + + int64_t *data_strides = new int64_t[data_ndim]; + memcpy(data_strides, data->strides, data_ndim * sizeof(int64_t)); + int64_t *reduced_strides; + int64_t *anti_out_strides; + + // axes is empty + if (axes_ndim == 0) { + if (noop_with_empty_axes == false) { + // reduce all axes + axes_ndim = data_ndim; + axes_size = data_size; + axes_axes = new int64_t[axes_ndim]; + for (int i = 0; i < axes_ndim; i ++) { + axes_axes[i] = i; + } + anti_out_strides = new int64_t[axes_ndim]; + memcpy(anti_out_strides, data_strides, axes_ndim * sizeof(int64_t)); + + reduced_ndim = 1; + reduced_size = 1; + reduced_axes = nullptr; + reduced_strides = new int64_t[reduced_ndim]; + reduced_strides[0] = 1; + } else { + // input tensor will not be reduced + axes_ndim = 0; + axes_size = 0; + axes_axes = nullptr; + anti_out_strides = new int64_t[axes_ndim]; + + reduced_ndim = data_ndim; + reduced_size = data_size; + reduced_axes = new int64_t[reduced_ndim]; + for (int i = 0; i < reduced_ndim; i ++) { + reduced_axes[i] = i; + } + reduced_strides = new int64_t[reduced_ndim]; + memcpy(reduced_strides, data_strides, reduced_ndim * sizeof(int64_t)); + } + } else { + // if (axes != nullptr && axes_ndim > 0) + if (keepdims == 
true) {
+            // keepdims == true: the caller-visible output keeps the input rank, but the
+            // kernel always iterates over the squeezed view; for contiguous tensors the
+            // element order is identical, so reduced_ndim stays data_ndim - axes_ndim
+        } else {
+            // the reduced dimensions are dropped from the output rank
+            reduced_ndim = data_ndim - axes_ndim;
+        }
+
+        axes_size = 1;
+        reduced_size = 1;
+        std::vector<uint64_t> axes_vec, reduced_axes_vec;
+        for (uint64_t i = 0; i < data_ndim; i++) {
+            bool is_axis = false;
+            for (uint64_t j = 0; j < axes_ndim; j++) {
+                // normalize axes from [-r, r-1] to [0, r-1], r = rank(data)
+                uint64_t axe = (axes[j] + data_ndim) % data_ndim;
+                if (axe == i) {
+                    is_axis = true;
+                    break;
+                }
+            }
+
+            if (is_axis) {
+                axes_size *= data->shape[i];
+                axes_vec.emplace_back(i);
+            } else {
+                reduced_size *= data->shape[i];
+                reduced_axes_vec.emplace_back(i);
+            }
+        }
+
+        // normalized, ascending copies of the reduced and kept axes
+        axes_axes = new int64_t[axes_ndim];
+        for (uint64_t i = 0; i < axes_ndim; i++) {
+            axes_axes[i] = axes_vec[i];
+        }
+        reduced_axes = new int64_t[reduced_ndim];
+        for (uint64_t i = 0; i < reduced_ndim; i++) {
+            reduced_axes[i] = reduced_axes_vec[i];
+        }
+
+        // row-major strides over the reduced dimensions only
+        anti_out_strides = new int64_t[axes_ndim];
+        for (uint64_t i = 0; i < axes_ndim; i++) {
+            anti_out_strides[i] = 1;
+            for (uint64_t j = i + 1; j < axes_ndim; j++) {
+                anti_out_strides[i] *= data->shape[axes_axes[j]];
+            }
+        }
+
+        // row-major strides over the kept dimensions only (the squeezed output view)
+        reduced_strides = new int64_t[reduced_ndim];
+        for (uint64_t i = 0; i < reduced_ndim; i++) {
+            reduced_strides[i] = 1;
+            for (uint64_t j = i + 1; j < reduced_ndim; j++) {
+                reduced_strides[i] *= data->shape[reduced_axes_vec[j]];
+            }
+        }
+    }
+    assert(reduced_size == reduced_size_);
+
+    *desc_ptr = new ReduceCpuDescriptor{
+        DevCpu,
+        reduced->dt,
+
+        reduced_ndim,
+        data_ndim,
+        axes_ndim,
+
+        reduced_size,
+        data_size,
+        axes_size,
+
+        reduced_axes,
+        axes_axes,
+
+        data_strides,
+        reduced_strides,
+        anti_out_strides,
+
+        keepdims,
+        reduce_type};
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc) {
+    delete[] desc->reduced_axes;
+    delete[] desc->axes;
+    delete[] desc->reduced_strides;
+    delete[] desc->data_strides;
+    delete[] desc->anti_out_strides;
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+infiniopStatus_t reduce_cpu(ReduceCpuDescriptor_t desc,
+                            void *reduced,
+                            void const *data) {
+    auto reduced_ = reinterpret_cast<Tdata *>(reduced);
+    auto data_ = reinterpret_cast<Tdata const *>(data);
+
+    auto reduced_ndim_ = desc->reduced_ndim;
+    auto axes_ndim_ = desc->axes_ndim;
+
+    auto reduced_size_ = desc->reduced_size;
+    auto data_size_ = desc->data_size;
+    auto axes_size_ = desc->axes_size;
+
+    auto reduced_axes_ = desc->reduced_axes;
+    auto axes_ = desc->axes;
+
+    auto data_strides_ = desc->data_strides;
+    auto reduced_strides_ = desc->reduced_strides;
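+    // Illustrative example (editor's note, not part of the original patch):
+    // for a contiguous data tensor of shape (2, 3, 4) reduced over axes = {1},
+    //   data_strides      = {12, 4, 1}
+    //   axes_             = {1},       axes_size_    = 3
+    //   reduced_axes_     = {0, 2},    reduced_size_ = 8
+    //   reduced_strides_  = {4, 1}     (strides over the kept dims (2, 4))
+    //   anti_out_strides_ = {1}        (strides over the reduced dims (3))
+    // Output element i = 5 maps to kept coords (1, 1), i.e. base offset
+    // idx = 1*12 + 1*1 = 13, and the inner loop then visits data offsets
+    // 13, 17, 21 == data[1][j][1] for j = 0, 1, 2.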
+    auto anti_out_strides_ = desc->anti_out_strides;
+
+    auto reduce_mode_ = desc->reduce_mode;
+
+    // Helpers shared by the branches below: read an element as f32, fold it into the
+    // running result, and write the final value back in the output dtype.
+    auto load = [&](uint64_t offset) -> float {
+        if constexpr (std::is_same<Tdata, uint16_t>::value) {
+            return f16_to_f32(data_[offset]);
+        } else {
+            return data_[offset];
+        }
+    };
+    auto init = [&]() -> float {
+        switch (reduce_mode_) {
+            case 0: return std::numeric_limits<float>::max();    // ReduceMin
+            case 1: return std::numeric_limits<float>::lowest(); // ReduceMax
+            default: return 0.0f;                                 // ReduceMean
+        }
+    };
+    auto fold = [&](float acc, float v) -> float {
+        switch (reduce_mode_) {
+            case 0: return std::min(acc, v); // ReduceMin
+            case 1: return std::max(acc, v); // ReduceMax
+            default: return acc + v;         // ReduceMean
+        }
+    };
+    auto store = [&](uint64_t offset, float result, uint64_t count) {
+        if (reduce_mode_ == 2) { // ReduceMean
+            result /= count;
+        }
+        if constexpr (std::is_same<Tdata, uint16_t>::value) {
+            reduced_[offset] = f32_to_f16(result);
+        } else {
+            reduced_[offset] = result;
+        }
+    };
+
+    // noop_with_empty_axes: the input is copied through unchanged
+    if (axes_ == nullptr) {
+        for (uint64_t i = 0; i < data_size_; i++) {
+            reduced_[i] = data_[i];
+        }
+        return STATUS_SUCCESS;
+    }
+
+    // empty axes list: reduce over every axis into a single element
+    if (reduced_axes_ == nullptr) {
+        float result = init();
+        for (uint64_t i = 0; i < data_size_; i++) {
+            result = fold(result, load(i));
+        }
+        store(0, result, data_size_);
+        return STATUS_SUCCESS;
+    }
+
+    // general case: one output element per combination of the kept dimensions
+    // #pragma omp parallel for
+    for (uint64_t i = 0; i < reduced_size_; i++) {
+        float result = init();
+
+        // map the multi-dimensional index of output element i to a base linear
+        // offset in data over the kept dimensions
+        uint64_t idx = 0;
+        uint64_t temp_i = i;
+        for (uint64_t j = 0; j < reduced_ndim_; j++) {
+            idx += temp_i / reduced_strides_[j] * data_strides_[reduced_axes_[j]];
+            temp_i %= reduced_strides_[j];
+        }
+
+        // enumerate every combination of the reduced dimensions
+        // (reduced_ndim + axes_ndim == data_ndim)
+        for (uint64_t j = 0; j < axes_size_; j++) {
+            uint64_t data_idx_ = idx;
+            uint64_t temp_j = j;
+            for (uint64_t k = 0; k < axes_ndim_; k++) {
+                data_idx_ += temp_j / anti_out_strides_[k] * data_strides_[axes_[k]];
+                temp_j %= anti_out_strides_[k];
+            }
+            result = fold(result, load(data_idx_));
+        }
+
+        store(i, result, axes_size_);
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc,
+                           void *reduced,
+                           void const *data,
+                           void *stream) {
+    if (desc->dt == F16) {
+        return reduce_cpu<uint16_t>(desc, reduced, data);
+    }
+    if (desc->dt == F32) {
+        return reduce_cpu<float>(desc, reduced, data);
+    }
+
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/reduce/cpu/reduce_cpu.h b/src/ops/reduce/cpu/reduce_cpu.h
new file mode 100644
index 00000000..01ff5cce
--- /dev/null
+++ b/src/ops/reduce/cpu/reduce_cpu.h
@@ -0,0 +1,55 @@
+#ifndef __CPU_REDUCE_H__
+#define __CPU_REDUCE_H__
+
+#include "../../../devices/cpu/common_cpu.h"
+#include "operators.h"
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+struct ReduceCpuDescriptor {
+    Device device;
+    DataLayout dt;
+
+    uint64_t reduced_ndim;
+    uint64_t data_ndim;
+    uint64_t axes_ndim;
+
+    uint64_t reduced_size;
+    uint64_t data_size;
+    uint64_t axes_size;
+
+    int64_t const *reduced_axes;// dimensions of data that are kept (not reduced)
+    int64_t const *axes;        // dimensions of data that are reduced
+
+    int64_t const *data_strides;
+    int64_t const *reduced_strides; // strides over the kept dimensions
+    int64_t const *anti_out_strides;// strides over the reduced dimensions
+
+    bool keepdims;
+    int reduce_mode;// 0 = ReduceMin, 1 = ReduceMax, 2 = ReduceMean
+};
+
+typedef struct ReduceCpuDescriptor *ReduceCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateReduceDescriptor(infiniopHandle_t handle,
+                                           ReduceCpuDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t reduced,
+                                           infiniopTensorDescriptor_t data,
+                                           int64_t *axes,
+                                           uint64_t axes_ndim,
+                                           bool keepdims,
+                                           bool noop_with_empty_axes,
+                                           int reduce_type);
+
+infiniopStatus_t cpuReduce(ReduceCpuDescriptor_t desc,
+                           void *reduced,
+                           void const *data,
+                           void *stream);
+
+infiniopStatus_t cpuDestroyReduceDescriptor(ReduceCpuDescriptor_t desc);
+
+#endif
diff --git a/src/ops/reduce/operator.cc b/src/ops/reduce/operator.cc
new file mode 100644
index 00000000..8b3333fd
--- /dev/null
+++ b/src/ops/reduce/operator.cc
@@ -0,0 +1,105 @@
+#include "../utils.h"
+#include "operators.h"
+#include "reduce.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/reduce_cpu.h"
+#endif
+#ifdef ENABLE_NV_GPU
+#endif
+#ifdef ENABLE_MTHREADS_GPU
+#endif
+
+__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle,
+                                                    infiniopReduceDescriptor_t *desc_ptr,
+                                                    infiniopTensorDescriptor_t reduced,
+                                                    infiniopTensorDescriptor_t data,
+                                                    int64_t *axes,
+                                                    uint64_t axes_ndim,
+                                                    bool keepdims,
+                                                    bool noop_with_empty_axes,
+                                                    int reduce_type) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateReduceDescriptor(handle, (ReduceCpuDescriptor_t *) desc_ptr,
+                                             reduced, data, axes, axes_ndim, keepdims, noop_with_empty_axes, reduce_type);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu: {
+        }
+#endif
+#ifdef ENABLE_CAMBRICON_MLU
+        // TODO
+#endif
+#ifdef ENABLE_MTHREADS_GPU
+        case DevMthreadsGpu: {
+        }
+#endif
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size) {
+    switch (desc->device) {
+#ifdef ENABLE_CPU
+        case DevCpu: {
+            // the CPU implementation needs no scratch memory
+            *size = 0;
+            return STATUS_SUCCESS;
+        }
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu: {
+        }
+#endif
+        default:
+            return STATUS_BAD_DEVICE;
+    }
+    return STATUS_BAD_DEVICE;
+}
+
+__C infiniopStatus_t
infiniopReduce(infiniopReduceDescriptor_t desc, + void *reduced, + void const *data, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuReduce((ReduceCpuDescriptor_t) desc, reduced, data, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyReduceDescriptor((ReduceCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/reduce/reduce.h b/src/ops/reduce/reduce.h new file mode 100644 index 00000000..62307127 --- /dev/null +++ b/src/ops/reduce/reduce.h @@ -0,0 +1,30 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include "export.h" +#include "operators.h" + +typedef struct ReduceDescriptor { + Device device; +} ReduceDescriptor; +typedef ReduceDescriptor *infiniopReduceDescriptor_t; + +__C infiniopStatus_t infiniopCreateReduceDescriptor(infiniopHandle_t handle, + infiniopReduceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes, + int reduce_type); + +__C infiniopStatus_t infiniopGetReduceWorkspaceSize(infiniopReduceDescriptor_t desc, uint64_t *size); + +__C infiniopStatus_t infiniopReduce(infiniopReduceDescriptor_t desc, + void *reduced, + void const *data, + void *stream); + +__C infiniopStatus_t infiniopDestroyReduceDescriptor(infiniopReduceDescriptor_t desc); +#endif diff --git a/src/ops/reduce_max/operator.cc b/src/ops/reduce_max/operator.cc new file mode 100644 index 00000000..91798ab5 --- /dev/null +++ b/src/ops/reduce_max/operator.cc @@ -0,0 +1,60 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_max/reduce_max.h" + +struct _ReduceMaxDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMaxDescriptor *_ReduceMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMaxDescriptor(infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes) { + infiniopReduceDescriptor_t reduce_desc; + // case 0: { // ReduceMin + // case 1: { // ReduceMax + // case 2: { // ReduceMean + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, reduced, data, axes, axes_ndim, + keepdims, noop_with_empty_axes, 1), STATUS_SUCCESS); + + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMaxDescriptor_t *) desc_ptr = new _ReduceMaxDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMaxDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMax(infiniopReduceMaxDescriptor_t desc, void *workspace, uint64_t workspace_size, + void 
*reduced, void const *data, void *stream) { + auto _desc = (_ReduceMaxDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, reduced, data, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMaxDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce_mean/operator.cc b/src/ops/reduce_mean/operator.cc new file mode 100644 index 00000000..31460382 --- /dev/null +++ b/src/ops/reduce_mean/operator.cc @@ -0,0 +1,59 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_mean/reduce_mean.h" + +struct _ReduceMeanDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMeanDescriptor *_ReduceMeanDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMeanDescriptor(infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + bool noop_with_empty_axes) { + infiniopReduceDescriptor_t reduce_desc; + + // case 0: { // ReduceMin + // case 1: { // ReduceMax + // case 2: { // ReduceMean + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, reduced, data, axes, axes_ndim, + keepdims, noop_with_empty_axes, 2), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMeanDescriptor_t *) desc_ptr = new _ReduceMeanDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMeanDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMean(infiniopReduceMeanDescriptor_t desc, void *workspace, uint64_t workspace_size, void *reduced, void const *data, void *stream) { + auto _desc = (_ReduceMeanDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, reduced, data, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMeanDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/reduce_min/operator.cc b/src/ops/reduce_min/operator.cc new file mode 100644 index 00000000..b3d1303e --- /dev/null +++ b/src/ops/reduce_min/operator.cc @@ -0,0 +1,59 @@ +#include "../reduce/reduce.h" +#include "../utils.h" +#include "ops/reduce_min/reduce_min.h" + +struct _ReduceMinDescriptor { + Device device; + infiniopReduceDescriptor_t reduce_desc; + uint64_t workspace_size; +}; + +typedef struct _ReduceMinDescriptor *_ReduceMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateReduceMinDescriptor(infiniopHandle_t handle, + infiniopReduceMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t reduced, + infiniopTensorDescriptor_t data, + int64_t *axes, + uint64_t axes_ndim, + bool keepdims, + 
bool noop_with_empty_axes) { + infiniopReduceDescriptor_t reduce_desc; + + // case 0: { // ReduceMin + // case 1: { // ReduceMax + // case 2: { // ReduceMean + CHECK_STATUS(infiniopCreateReduceDescriptor(handle, &reduce_desc, reduced, data, axes, axes_ndim, + keepdims, noop_with_empty_axes, 0), STATUS_SUCCESS); + uint64_t workspace_size = 0; + CHECK_STATUS(infiniopGetReduceWorkspaceSize(reduce_desc, &workspace_size), STATUS_SUCCESS); + + *(_ReduceMinDescriptor_t *) desc_ptr = new _ReduceMinDescriptor{ + handle->device, + reduce_desc, + workspace_size}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopGetReduceMinWorkspaceSize(infiniopReduceMinDescriptor_t desc, uint64_t *size) { + *size = ((_ReduceMinDescriptor_t) desc)->workspace_size; + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopReduceMin(infiniopReduceMinDescriptor_t desc, void *workspace, uint64_t workspace_size, void *reduced, void const *data, void *stream) { + auto _desc = (_ReduceMinDescriptor_t) desc; + if (workspace_size < _desc->workspace_size) { + return STATUS_MEMORY_NOT_ALLOCATED; + } + + CHECK_STATUS(infiniopReduce(_desc->reduce_desc, reduced, data, stream), + STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyReduceMinDescriptor(infiniopReduceMinDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyReduceDescriptor(((_ReduceMinDescriptor_t) desc)->reduce_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/where/cpu/where_cpu.cc b/src/ops/where/cpu/where_cpu.cc new file mode 100644 index 00000000..5da759a4 --- /dev/null +++ b/src/ops/where/cpu/where_cpu.cc @@ -0,0 +1,121 @@ +#include "where_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../utils.h" +#include + +inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { + for (int64_t i = ndim - 1; i >= 0; --i) { + if (++indices[i] != shape[i]) { + return; + } + indices[i] = 0; + } +} + +inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { + return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); +} + +infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t, + WhereCpuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t condition, + infiniopTensorDescriptor_t x, + infiniopTensorDescriptor_t y) { + uint64_t ndim = output->ndim; + + if (!isValidBroadcastShape(x, y, output)) { + return STATUS_BAD_TENSOR_SHAPE; + } + if (!is_contiguous(x) || !is_contiguous(y) || !is_contiguous(output)) { + return STATUS_BAD_TENSOR_STRIDES; + } + if (output->dt != F16 && output->dt != F32) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (output->dt != x->dt || output->dt != y->dt) { + return STATUS_BAD_TENSOR_DTYPE; + } + if (condition->dt != U8) { + return STATUS_BAD_TENSOR_DTYPE; + } + + uint64_t o_data_size = std::accumulate( + output->shape, output->shape + output->ndim, 1ULL, std::multiplies()); + + uint64_t *o_shape = new uint64_t[ndim]; + std::copy(output->shape, output->shape + ndim, o_shape); + + uint64_t *x_strides = new uint64_t[ndim]; + uint64_t *y_strides = new uint64_t[ndim]; + for (size_t i = 0; i < ndim; ++i) { + x_strides[i] = (i < ndim - x->ndim || output->shape[i] != x->shape[i + x->ndim - ndim]) ? 0 : x->strides[i + x->ndim - ndim]; + y_strides[i] = (i < ndim - y->ndim || output->shape[i] != y->shape[i + y->ndim - ndim]) ? 
0 : y->strides[i + y->ndim - ndim];
+    }
+    // dimensions where x or y are broadcast against the output keep stride 0, so the
+    // same source element is reused along that dimension; e.g. an output of shape
+    // (2, 3) with x of shape (3) yields x_strides = {0, 1}
+
+    uint64_t *o_indices = new uint64_t[ndim];
+    std::fill(o_indices, o_indices + ndim, 0);
+
+    *desc_ptr = new WhereCpuDescriptor{
+        DevCpu,
+        output->dt,
+        ndim,
+        o_data_size,
+        o_shape,
+        x_strides,
+        y_strides,
+        o_indices,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc) {
+    delete[] desc->o_shape;
+    delete[] desc->x_strides;
+    delete[] desc->y_strides;
+    delete[] desc->o_indices;
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+template<typename Tdata>
+infiniopStatus_t where_cpu(WhereCpuDescriptor_t desc,
+                           void *output,
+                           void const *condition,
+                           void const *x,
+                           void const *y) {
+    auto x_ = reinterpret_cast<Tdata const *>(x);
+    auto y_ = reinterpret_cast<Tdata const *>(y);
+    auto condition_ = reinterpret_cast<uint8_t const *>(condition);
+
+    auto o_ = reinterpret_cast<Tdata *>(output);
+    const auto &o_indices = desc->o_indices;
+
+    // walk the output coordinates in order; x / y offsets go through the (possibly
+    // zero) broadcast strides, condition is read element-for-element with the output
+    for (uint64_t i = 0; i < desc->o_data_size; ++i, incrementOne(o_indices, desc->o_shape, desc->ndim)) {
+        auto x_index = compactToFlat(o_indices, desc->x_strides, desc->ndim);
+        auto y_index = compactToFlat(o_indices, desc->y_strides, desc->ndim);
+
+        if (condition_[i]) {
+            o_[i] = x_[x_index];
+        } else {
+            o_[i] = y_[y_index];
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc,
+                          void *output,
+                          void const *condition,
+                          void const *x,
+                          void const *y,
+                          void *stream) {
+    if (desc->dtype == F16) {
+        return where_cpu<uint16_t>(desc, output, condition, x, y);
+    }
+    if (desc->dtype == F32) {
+        return where_cpu<float>(desc, output, condition, x, y);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/where/cpu/where_cpu.h b/src/ops/where/cpu/where_cpu.h
new file mode 100644
index 00000000..9c9b66aa
--- /dev/null
+++ b/src/ops/where/cpu/where_cpu.h
@@ -0,0 +1,37 @@
+#ifndef __CPU_WHERE_H__
+#define __CPU_WHERE_H__
+
+#include "operators.h"
+#include <algorithm>
+#include <numeric>
+
+struct WhereCpuDescriptor {
+    Device device;
+    DT dtype;
+    uint64_t ndim;
+    uint64_t o_data_size;
+    uint64_t const *o_shape;
+    uint64_t const *x_strides;
+    uint64_t const *y_strides;
+    uint64_t *o_indices;
+};
+
+typedef struct WhereCpuDescriptor *WhereCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateWhereDescriptor(infiniopHandle_t,
+                                          WhereCpuDescriptor_t *,
+                                          infiniopTensorDescriptor_t output,
+                                          infiniopTensorDescriptor_t condition,
+                                          infiniopTensorDescriptor_t x,
+                                          infiniopTensorDescriptor_t y);
+
+infiniopStatus_t cpuWhere(WhereCpuDescriptor_t desc,
+                          void *output,
+                          void const *condition,
+                          void const *x,
+                          void const *y,
+                          void *stream);
+
+infiniopStatus_t cpuDestroyWhereDescriptor(WhereCpuDescriptor_t desc);
+
+#endif
diff --git a/src/ops/where/operator.cc b/src/ops/where/operator.cc
new file mode 100644
index 00000000..7bdfbbd1
--- /dev/null
+++ b/src/ops/where/operator.cc
@@ -0,0 +1,82 @@
+#include "../utils.h"
+#include "operators.h"
+#include "ops/where/where.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/where_cpu.h"
+#endif
+#ifdef ENABLE_NV_GPU
+#endif
+
+__C infiniopStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle,
+                                                   infiniopWhereDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t output,
+                                                   infiniopTensorDescriptor_t condition,
+                                                   infiniopTensorDescriptor_t x,
+                                                   infiniopTensorDescriptor_t y) {
+    switch (handle->device) {
+#ifdef ENABLE_CPU
+        case DevCpu:
+            return cpuCreateWhereDescriptor(handle, (WhereCpuDescriptor_t *) desc_ptr, output, condition, x, y);
+#endif
+#ifdef ENABLE_NV_GPU
+        case DevNvGpu: {
+        }
+#endif
+#ifdef ENABLE_CAMBRICON_MLU
+        // TODO
+#endif
+#ifdef ENABLE_MTHREADS_GPU
+        case DevMthreadsGpu: {
+        }
+#endif
+    }
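+    // Illustrative usage (editor's note, not part of this patch; the handle,
+    // descriptors and buffers are hypothetical placeholders). condition is a U8
+    // tensor read element-for-element with the output, x and y may be broadcast:
+    //   infiniopWhereDescriptor_t where_desc;
+    //   infiniopCreateWhereDescriptor(handle, &where_desc, out_desc, cond_desc, x_desc, y_desc);
+    //   infiniopWhere(where_desc, out_buf, cond_buf, x_buf, y_buf, /*stream=*/nullptr);
+    //   infiniopDestroyWhereDescriptor(where_desc);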
+ return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *output, + void const *condition, + void const *x, + void const *y, + void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuWhere((WhereCpuDescriptor_t) desc, output, condition, x, y, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyWhereDescriptor((WhereCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + } +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif +#ifdef ENABLE_MTHREADS_GPU + case DevMthreadsGpu: { + } +#endif + } + return STATUS_BAD_DEVICE; +}
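
Editor's note on the reduce front-ends: ReduceMin, ReduceMax and ReduceMean all wrap the shared reduce descriptor with a fixed reduce_type (0, 1 and 2 respectively) and cache the workspace size at creation time, so a caller should always query the workspace even though the CPU path currently needs none. A minimal host-side sketch follows, assuming a valid handle, pre-built tensor descriptors reduced_desc / data_desc and buffers reduced_buf / data_buf (all hypothetical names):

    int64_t axes[] = {1};                        // reduce along dimension 1
    infiniopReduceMaxDescriptor_t max_desc;
    if (infiniopCreateReduceMaxDescriptor(handle, &max_desc, reduced_desc, data_desc,
                                          axes, 1, /*keepdims=*/false,
                                          /*noop_with_empty_axes=*/false) != STATUS_SUCCESS) {
        /* handle the error */
    }

    uint64_t workspace_size = 0;
    infiniopGetReduceMaxWorkspaceSize(max_desc, &workspace_size);
    void *workspace = workspace_size ? malloc(workspace_size) : NULL;   // 0 bytes on CPU today

    infiniopReduceMax(max_desc, workspace, workspace_size, reduced_buf, data_buf, /*stream=*/NULL);

    free(workspace);
    infiniopDestroyReduceMaxDescriptor(max_desc);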