diff --git a/include/infini_operators.h b/include/infini_operators.h index 9a5a2555..647fee2f 100644 --- a/include/infini_operators.h +++ b/include/infini_operators.h @@ -3,17 +3,25 @@ #include "ops/attention/attention.h" #include "ops/avg_pool/avg_pool.h" #include "ops/causal_softmax/causal_softmax.h" -#include "ops/global_avg_pool/global_avg_pool.h" +#include "ops/conv/conv.h" +#include "ops/div/div.h" #include "ops/expand/expand.h" #include "ops/gemm/gemm.h" -#include "ops/conv/conv.h" +#include "ops/global_avg_pool/global_avg_pool.h" #include "ops/matmul/matmul.h" +#include "ops/max/max.h" +#include "ops/min/min.h" #include "ops/max_pool/max_pool.h" #include "ops/mlp/mlp.h" +#include "ops/mod/mod.h" +#include "ops/mul/mul.h" +#include "ops/pow/pow.h" #include "ops/random_sample/random_sample.h" #include "ops/rearrange/rearrange.h" #include "ops/relu/relu.h" #include "ops/rms_norm/rms_norm.h" #include "ops/rotary_embedding/rotary_embedding.h" +#include "ops/sub/sub.h" #include "ops/swiglu/swiglu.h" +#include "ops/transpose/transpose.h" #include "tensor/tensor_descriptor.h" diff --git a/include/ops/div/div.h b/include/ops/div/div.h new file mode 100644 index 00000000..a749272d --- /dev/null +++ b/include/ops/div/div.h @@ -0,0 +1,27 @@ +#ifndef DIV_H +#define DIV_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct DivDescriptor { + Device device; +} DivDescriptor; + +typedef DivDescriptor *infiniopDivDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc); + +#endif diff --git a/include/ops/max/max.h b/include/ops/max/max.h new file mode 100644 index 00000000..446dcb2e --- /dev/null +++ b/include/ops/max/max.h @@ -0,0 +1,27 @@ +#ifndef MAX_H +#define MAX_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MaxDescriptor { + Device device; +} MaxDescriptor; + +typedef MaxDescriptor *infiniopMaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc); + +#endif diff --git a/include/ops/min/min.h b/include/ops/min/min.h new file mode 100644 index 00000000..28767f93 --- /dev/null +++ b/include/ops/min/min.h @@ -0,0 +1,27 @@ +#ifndef MIN_H +#define MIN_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MinDescriptor { + Device device; +} MinDescriptor; + +typedef MinDescriptor *infiniopMinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t 
infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc); + +#endif diff --git a/include/ops/mod/mod.h b/include/ops/mod/mod.h new file mode 100644 index 00000000..0065c885 --- /dev/null +++ b/include/ops/mod/mod.h @@ -0,0 +1,27 @@ +#ifndef MOD_H +#define MOD_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct ModDescriptor { + Device device; +} ModDescriptor; + +typedef ModDescriptor *infiniopModDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc); + +#endif diff --git a/include/ops/mul/mul.h b/include/ops/mul/mul.h new file mode 100644 index 00000000..4a30afe4 --- /dev/null +++ b/include/ops/mul/mul.h @@ -0,0 +1,27 @@ +#ifndef MUL_H +#define MUL_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct MulDescriptor { + Device device; +} MulDescriptor; + +typedef MulDescriptor *infiniopMulDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopMul(infiniopMulDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc); + +#endif diff --git a/include/ops/pow/pow.h b/include/ops/pow/pow.h new file mode 100644 index 00000000..3ba6eb7c --- /dev/null +++ b/include/ops/pow/pow.h @@ -0,0 +1,27 @@ +#ifndef POW_H +#define POW_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct PowDescriptor { + Device device; +} PowDescriptor; + +typedef PowDescriptor *infiniopPowDescriptor_t; + +__C __export infiniopStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc); + +#endif diff --git a/include/ops/sub/sub.h b/include/ops/sub/sub.h new file mode 100644 index 00000000..f5577720 --- /dev/null +++ b/include/ops/sub/sub.h @@ -0,0 +1,27 @@ +#ifndef SUB_H +#define SUB_H + +#include "../../export.h" +#include "../../operators.h" + +typedef struct SubDescriptor { + Device device; +} SubDescriptor; + +typedef SubDescriptor *infiniopSubDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b); + +__C __export infiniopStatus_t infiniopSub(infiniopSubDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream); + +__C __export infiniopStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc); + +#endif diff --git a/operatorspy/tests/add.py b/operatorspy/tests/add.py index d766208c..0217d7d2 100644 --- a/operatorspy/tests/add.py +++ 
b/operatorspy/tests/add.py @@ -2,6 +2,7 @@ import ctypes import sys import os +import time sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) from operatorspy import ( @@ -19,11 +20,18 @@ from enum import Enum, auto import torch +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 class Inplace(Enum): OUT_OF_PLACE = auto() INPLACE_A = auto() INPLACE_B = auto() + INPLACE_AB = auto() class AddDescriptor(Structure): @@ -34,6 +42,10 @@ class AddDescriptor(Structure): def add(x, y): + if PROFILE: + ans = torch.add(x, y) + torch.cuda.synchronize() + return ans return torch.add(x, y) @@ -55,13 +67,20 @@ def test( return a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) - b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) - ans = add(a, b) + for i in range(NUM_PRERUN if PROFILE else 1): + ans = add(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = add(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") a_tensor = to_tensor(a, lib) - b_tensor = to_tensor(b, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) descriptor = infiniopAddDescriptor_t() @@ -74,9 +93,24 @@ def test( b_tensor.descriptor, ) ) - lib.infiniopAdd( - descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None - ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopAdd( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + assert torch.allclose(c, ans, atol=0, rtol=1e-3) check_error(lib.infiniopDestroyAddDescriptor(descriptor)) @@ -113,20 +147,17 @@ def test_bang(lib, test_cases): if __name__ == "__main__": test_cases = [ # c_shape, a_shape, b_shape, inplace - # ((32, 150, 512000), (32, 150, 512000), (32, 150, 512000), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), ((), (), (), Inplace.OUT_OF_PLACE), - ((3, 3), (3, 3), (3, 3), Inplace.OUT_OF_PLACE), - ((2, 20, 3), (2, 1, 3), (2, 20, 3), Inplace.OUT_OF_PLACE), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_A), - ((32, 20, 512), (32, 20, 512), (32, 20, 512), Inplace.INPLACE_B), - ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), - ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 
20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), ] args = get_args() lib = open_lib() diff --git a/operatorspy/tests/div.py b/operatorspy/tests/div.py new file mode 100644 index 00000000..37ebbc84 --- /dev/null +++ b/operatorspy/tests/div.py @@ -0,0 +1,212 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class DivDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopDivDescriptor_t = POINTER(DivDescriptor) + +def find_and_print_differing_indices( + a, b, c, ans, atol=0, rtol=1e-2 +): + if c.shape != ans.shape: + raise ValueError("Tensors must have the same shape to compare.") + + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(c - ans) > (atol + rtol * torch.abs(ans)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) + + # Print the indices and the differing elements + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f"Index: {index_tuple}, a: {a[index_tuple]}, b: {b[index_tuple]}, y element: {c[index_tuple]}, ans element: {ans[index_tuple]}" + ) + + return diff_indices + + +def div(x, y): + if PROFILE: + ans = torch.div(x, y) + torch.cuda.synchronize() + return ans + return torch.div(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Div on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) * 2 if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = div(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = div(a, b) + elapsed = (time.time() - start_time) / 
NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopDivDescriptor_t() + + check_error( + lib.infiniopCreateDivDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopDiv( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=1e-3, equal_nan=True) + check_error(lib.infiniopDestroyDivDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateDivDescriptor.restype = c_int32 + 
lib.infiniopCreateDivDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopDivDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopDiv.restype = c_int32 + lib.infiniopDiv.argtypes = [ + infiniopDivDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyDivDescriptor.restype = c_int32 + lib.infiniopDestroyDivDescriptor.argtypes = [ + infiniopDivDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/max.py b/operatorspy/tests/max.py new file mode 100644 index 00000000..2efeb651 --- /dev/null +++ b/operatorspy/tests/max.py @@ -0,0 +1,194 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MaxDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMaxDescriptor_t = POINTER(MaxDescriptor) + + +def max(x, y): + if PROFILE: + ans = torch.max(x, y) + torch.cuda.synchronize() + return ans + return torch.max(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Max on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = max(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = max(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMaxDescriptor_t() + + check_error( + lib.infiniopCreateMaxDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopMax( + descriptor, 
c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMax( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyMaxDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 333, 333), (33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMaxDescriptor.restype = c_int32 + lib.infiniopCreateMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMaxDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMax.restype = c_int32 + lib.infiniopMax.argtypes = [ + infiniopMaxDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMaxDescriptor.restype = c_int32 + lib.infiniopDestroyMaxDescriptor.argtypes = [ + infiniopMaxDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if 
args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/min.py b/operatorspy/tests/min.py new file mode 100644 index 00000000..548f4819 --- /dev/null +++ b/operatorspy/tests/min.py @@ -0,0 +1,193 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MinDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMinDescriptor_t = POINTER(MinDescriptor) + + +def min(x, y): + if PROFILE: + ans = torch.min(x, y) + torch.cuda.synchronize() + return ans + return torch.min(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Min on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = min(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = min(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMinDescriptor_t() + + check_error( + lib.infiniopCreateMinDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMin( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0) + check_error(lib.infiniopDestroyMinDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, 
b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMinDescriptor.restype = c_int32 + lib.infiniopCreateMinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMinDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMin.restype = c_int32 + lib.infiniopMin.argtypes = [ + infiniopMinDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMinDescriptor.restype = c_int32 + lib.infiniopDestroyMinDescriptor.argtypes = [ + infiniopMinDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/mod.py b/operatorspy/tests/mod.py new file mode 100644 index 00000000..7bf5beaf --- /dev/null +++ b/operatorspy/tests/mod.py @@ -0,0 +1,224 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + 
infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class ModDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopModDescriptor_t = POINTER(ModDescriptor) + + +def find_and_print_differing_indices(a, b, c, ans, atol=0, rtol=1e-2): + if c.shape != ans.shape: + raise ValueError("Tensors must have the same shape to compare.") + + # Calculate the difference mask based on atol and rtol + diff_mask = torch.abs(c - ans) > (atol + rtol * torch.abs(ans)) + diff_indices = torch.nonzero(diff_mask, as_tuple=False) + + # Print the indices and the differing elements + for idx in diff_indices: + index_tuple = tuple(idx.tolist()) + print( + f"Index: {index_tuple}, a: {a[index_tuple]}, b: {b[index_tuple]}, y element: {c[index_tuple]}, ans element: {ans[index_tuple]}" + ) + + return diff_indices + + +def mod(x, y): + if PROFILE: + ans = torch.fmod(x, y) + torch.cuda.synchronize() + return ans + return torch.fmod(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Mod on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = ( + torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) + if inplace != Inplace.INPLACE_AB + else a + ) + c = ( + torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) + if inplace == Inplace.OUT_OF_PLACE + else (a if inplace == Inplace.INPLACE_A else b) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = mod(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = mod(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = ( + to_tensor(c, lib) + if inplace == Inplace.OUT_OF_PLACE + else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + ) + descriptor = infiniopModDescriptor_t() + + check_error( + lib.infiniopCreateModDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopMod( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMod( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=0, equal_nan=True) + check_error(lib.infiniopDestroyModDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = 
DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateModDescriptor.restype = c_int32 + lib.infiniopCreateModDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopModDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMod.restype = c_int32 + lib.infiniopMod.argtypes = [ + infiniopModDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyModDescriptor.restype = c_int32 + lib.infiniopDestroyModDescriptor.argtypes = [ + infiniopModDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/operatorspy/tests/mul.py b/operatorspy/tests/mul.py new file mode 100644 index 00000000..11baabb9 --- /dev/null +++ b/operatorspy/tests/mul.py @@ -0,0 +1,193 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( 
+ open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class MulDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopMulDescriptor_t = POINTER(MulDescriptor) + + +def mul(x, y): + if PROFILE: + ans = torch.mul(x, y) + torch.cuda.synchronize() + return ans + return torch.mul(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Mul on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 10 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = mul(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = mul(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopMulDescriptor_t() + + check_error( + lib.infiniopCreateMulDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopMul( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopMul( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=1e-6, rtol=1e-3) + check_error(lib.infiniopDestroyMulDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", 
c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateMulDescriptor.restype = c_int32 + lib.infiniopCreateMulDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopMulDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopMul.restype = c_int32 + lib.infiniopMul.argtypes = [ + infiniopMulDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyMulDescriptor.restype = c_int32 + lib.infiniopDestroyMulDescriptor.argtypes = [ + infiniopMulDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/pow.py b/operatorspy/tests/pow.py new file mode 100644 index 00000000..7e1a72ee --- /dev/null +++ b/operatorspy/tests/pow.py @@ -0,0 +1,191 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class 
PowDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopPowDescriptor_t = POINTER(PowDescriptor) + + +def pow(x, y): + if PROFILE: + ans = torch.pow(x, y) + torch.cuda.synchronize() + return ans + return torch.pow(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Pow on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) * 5 + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) * 5 if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = pow(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = pow(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopPowDescriptor_t() + + check_error( + lib.infiniopCreatePowDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopPow( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopPow( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=1e-7, rtol=1e-3, equal_nan=True) + check_error(lib.infiniopDestroyPowDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), 
Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((33, 333, 333), (33, 333, 333), (33, 333, 333), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreatePowDescriptor.restype = c_int32 + lib.infiniopCreatePowDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopPowDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopPow.restype = c_int32 + lib.infiniopPow.argtypes = [ + infiniopPowDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyPowDescriptor.restype = c_int32 + lib.infiniopDestroyPowDescriptor.argtypes = [ + infiniopPowDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/operatorspy/tests/sub.py b/operatorspy/tests/sub.py new file mode 100644 index 00000000..ef68b1a4 --- /dev/null +++ b/operatorspy/tests/sub.py @@ -0,0 +1,193 @@ +from ctypes import POINTER, Structure, c_int32, c_void_p +import ctypes +import sys +import os +import time + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) +from operatorspy import ( + open_lib, + to_tensor, + DeviceEnum, + infiniopHandle_t, + infiniopTensorDescriptor_t, + create_handle, + destroy_handle, + check_error, +) + +from operatorspy.tests.test_utils import get_args +from enum import Enum, auto +import torch + +# constant for control whether profile the pytorch and lib functions +# NOTE: need to manually add synchronization function to the lib function (elementwise.cu), +# e.g., cudaDeviceSynchronize() for CUDA +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_AB = auto() + + +class SubDescriptor(Structure): + _fields_ = [("device", c_int32)] + + +infiniopSubDescriptor_t = POINTER(SubDescriptor) + + +def subtract(x, y): + if PROFILE: + ans = torch.subtract(x, y) + torch.cuda.synchronize() + return ans + return torch.subtract(x, y) + + +def test( + lib, + handle, + torch_device, + c_shape, + a_shape, + b_shape, + tensor_dtype=torch.float16, + inplace=Inplace.OUT_OF_PLACE, +): + print( + f"Testing Sub on {torch_device} with c_shape:{c_shape} a_shape:{a_shape} b_shape:{b_shape} dtype:{tensor_dtype} inplace: {inplace.name}" + ) + if a_shape != b_shape and inplace != Inplace.OUT_OF_PLACE: + print("Unsupported test: broadcasting does not support in-place") + return 
+ + a = torch.rand(a_shape, dtype=tensor_dtype).to(torch_device) + b = torch.rand(b_shape, dtype=tensor_dtype).to(torch_device) if inplace != Inplace.INPLACE_AB else a + c = torch.rand(c_shape, dtype=tensor_dtype).to(torch_device) if inplace == Inplace.OUT_OF_PLACE else (a if inplace == Inplace.INPLACE_A else b) + + for i in range(NUM_PRERUN if PROFILE else 1): + ans = subtract(a, b) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + _ = subtract(a, b) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f"pytorch time: {elapsed :6f}") + + a_tensor = to_tensor(a, lib) + b_tensor = to_tensor(b, lib) if inplace != Inplace.INPLACE_AB else a_tensor + c_tensor = to_tensor(c, lib) if inplace == Inplace.OUT_OF_PLACE else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor) + descriptor = infiniopSubDescriptor_t() + + check_error( + lib.infiniopCreateSubDescriptor( + handle, + ctypes.byref(descriptor), + c_tensor.descriptor, + a_tensor.descriptor, + b_tensor.descriptor, + ) + ) + + for i in range(NUM_PRERUN if PROFILE else 1): + check_error( + lib.infiniopSub( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + if PROFILE: + start_time = time.time() + for i in range(NUM_ITERATIONS): + check_error( + lib.infiniopSub( + descriptor, c_tensor.data, a_tensor.data, b_tensor.data, None + ) + ) + elapsed = (time.time() - start_time) / NUM_ITERATIONS + print(f" lib time: {elapsed :6f}") + + assert torch.allclose(c, ans, atol=0, rtol=1e-3) + check_error(lib.infiniopDestroySubDescriptor(descriptor)) + + +def test_cpu(lib, test_cases): + device = DeviceEnum.DEVICE_CPU + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cpu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_cuda(lib, test_cases): + device = DeviceEnum.DEVICE_CUDA + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "cuda", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +def test_bang(lib, test_cases): + import torch_mlu + + device = DeviceEnum.DEVICE_BANG + handle = create_handle(lib, device) + for c_shape, a_shape, b_shape, inplace in test_cases: + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float16, inplace=inplace) + test(lib, handle, "mlu", c_shape, a_shape, b_shape, tensor_dtype=torch.float32, inplace=inplace) + destroy_handle(lib, handle) + + +if __name__ == "__main__": + test_cases = [ + # c_shape, a_shape, b_shape, inplace + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 1), Inplace.OUT_OF_PLACE), + # ((32, 150, 51200), (32, 150, 51200), (32, 150, 51200), Inplace.OUT_OF_PLACE), + ((1, 3), (1, 3), (1, 3), Inplace.OUT_OF_PLACE), + ((), (), (), Inplace.OUT_OF_PLACE), + ((2, 4, 3), (2, 1, 3), (4, 3), Inplace.OUT_OF_PLACE), + ((2, 3, 4, 5), (2, 3, 4, 5), (5,), Inplace.OUT_OF_PLACE), + ((3, 2, 4, 5), (4, 5), (3, 2, 1, 1), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_A) if not PROFILE else ((32, 10, 100), (32, 10, 100), (32, 10, 100), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_B) if not PROFILE else ((32, 15, 510), (32, 15, 510), (32, 15, 
510), Inplace.OUT_OF_PLACE), + ((3, 20, 33), (3, 20, 33), (3, 20, 33), Inplace.INPLACE_AB) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 1), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ((32, 3, 112, 112), (32, 3, 112, 112), (32, 3, 112, 112), Inplace.OUT_OF_PLACE) if not PROFILE else ((32, 256, 112, 112), (32, 256, 112, 112), (32, 256, 112, 112), Inplace.OUT_OF_PLACE), + ] + args = get_args() + lib = open_lib() + lib.infiniopCreateSubDescriptor.restype = c_int32 + lib.infiniopCreateSubDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopSubDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopSub.restype = c_int32 + lib.infiniopSub.argtypes = [ + infiniopSubDescriptor_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroySubDescriptor.restype = c_int32 + lib.infiniopDestroySubDescriptor.argtypes = [ + infiniopSubDescriptor_t, + ] + + if args.cpu: + test_cpu(lib, test_cases) + if args.cuda: + test_cuda(lib, test_cases) + if args.bang: + test_bang(lib, test_cases) + if not (args.cpu or args.cuda or args.bang): + test_cpu(lib, test_cases) + print("\033[92mTest passed!\033[0m") diff --git a/src/ops/add/cpu/add_cpu.cc b/src/ops/add/cpu/add_cpu.cc deleted file mode 100644 index 649fa052..00000000 --- a/src/ops/add/cpu/add_cpu.cc +++ /dev/null @@ -1,101 +0,0 @@ -#include "add_cpu.h" -#include "../../../devices/cpu/common_cpu.h" -#include "../../utils.h" - -inline void incrementOne(uint64_t *indices, uint64_t const *shape, uint64_t ndim) { - for (int64_t i = ndim - 1; i >= 0; --i) { - if (++indices[i] != shape[i]) { - return; - } - indices[i] = 0; - } -} - -inline uint64_t compactToFlat(uint64_t const *indices, uint64_t const *strides, uint64_t ndim) { - return std::inner_product(indices, indices + ndim, strides, uint64_t(0)); } - -infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t, - AddCpuDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - uint64_t ndim = c->ndim; - if (!isValidBroadcastShape(a, b, c)) { - return STATUS_BAD_TENSOR_SHAPE; - } - if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) { - return STATUS_BAD_TENSOR_STRIDES; - } - if (c->dt != F16 && c->dt != F32) { - return STATUS_BAD_TENSOR_DTYPE; - } - if (c->dt != a->dt || c->dt != b->dt) { - return STATUS_BAD_TENSOR_DTYPE; - } - - uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies<uint64_t>()); - - // get the adjusted strides for a and b - uint64_t *a_strides = new uint64_t[ndim]; - uint64_t *b_strides = new uint64_t[ndim]; - for (size_t i = 0; i < ndim; ++i) { - a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim]; - b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ?
0 : b->strides[i + b->ndim - ndim];
-    }
-
-    uint64_t *c_indices = new uint64_t[ndim];
-    std::fill(c_indices, c_indices + ndim, 0);
-
-    *desc_ptr = new AddCpuDescriptor{
-        DevCpu,
-        c->dt,
-        ndim,
-        c_data_size,
-        c->shape,
-        a_strides,
-        b_strides,
-        c_indices,
-    };
-
-    return STATUS_SUCCESS;
-}
-
-infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc) {
-    delete[] desc->a_strides;
-    delete[] desc->b_strides;
-    delete[] desc->c_indices;
-    delete desc;
-    return STATUS_SUCCESS;
-}
-
-template <typename Tdata>
-infiniopStatus_t add_cpu(AddCpuDescriptor_t desc, void *c, void const *a, void const *b) {
-    auto a_ = reinterpret_cast<Tdata const *>(a);
-    auto b_ = reinterpret_cast<Tdata const *>(b);
-    auto c_ = reinterpret_cast<Tdata *>(c);
-    const auto &indices = desc->c_indices;
-
-    for (uint64_t i = 0; i < desc->c_data_size; ++i, incrementOne(indices, desc->c_shape, desc->ndim)) {
-        auto a_index = compactToFlat(indices, desc->a_strides, desc->ndim);
-        auto b_index = compactToFlat(indices, desc->b_strides, desc->ndim);
-        if constexpr (std::is_same<Tdata, uint16_t>::value) {
-            c_[i] = f32_to_f16(f16_to_f32(a_[a_index]) + f16_to_f32(b_[b_index]));
-        } else {
-            c_[i] = a_[a_index] + b_[b_index];
-        }
-    }
-    return STATUS_SUCCESS;
-}
-
-infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc,
-                        void *c, void const *a, void const *b,
-                        void *stream) {
-    if (desc->dtype == F16) {
-        return add_cpu<uint16_t>(desc, c, a, b);
-    }
-    if (desc->dtype == F32) {
-        return add_cpu<float>(desc, c, a, b);
-    }
-    return STATUS_BAD_TENSOR_DTYPE;
-}
diff --git a/src/ops/add/cpu/add_cpu.h b/src/ops/add/cpu/add_cpu.h
deleted file mode 100644
index 42e62435..00000000
--- a/src/ops/add/cpu/add_cpu.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __CPU_ADD_H__
-#define __CPU_ADD_H__
-
-#include "operators.h"
-#include <numeric>
-#include <type_traits>
-
-struct AddCpuDescriptor {
-    Device device;
-    DT dtype;
-    uint64_t ndim;
-    uint64_t c_data_size;
-    uint64_t const *c_shape;
-    uint64_t const *a_strides;
-    uint64_t const *b_strides;
-    uint64_t *c_indices;
-};
-
-typedef struct AddCpuDescriptor *AddCpuDescriptor_t;
-
-infiniopStatus_t cpuCreateAddDescriptor(infiniopHandle_t,
-                                        AddCpuDescriptor_t *,
-                                        infiniopTensorDescriptor_t c,
-                                        infiniopTensorDescriptor_t a,
-                                        infiniopTensorDescriptor_t b);
-
-infiniopStatus_t cpuAdd(AddCpuDescriptor_t desc,
-                        void *c, void const *a, void const *b,
-                        void *stream);
-
-infiniopStatus_t cpuDestroyAddDescriptor(AddCpuDescriptor_t desc);
-
-#endif
diff --git a/src/ops/add/cuda/add.cc b/src/ops/add/cuda/add.cc
deleted file mode 100644
index b010894f..00000000
--- a/src/ops/add/cuda/add.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-#include "add.cuh"
-#include "../../../devices/cuda/common_cuda.h"
-#include "../../utils.h"
-
-infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t handle,
-                                         AddCudaDescriptor_t *desc_ptr,
-                                         infiniopTensorDescriptor_t c,
-                                         infiniopTensorDescriptor_t a,
-                                         infiniopTensorDescriptor_t b) {
-    uint64_t ndim = c->ndim;
-    if (!isValidBroadcastShape(a, b, c)) {
-        return STATUS_BAD_TENSOR_SHAPE;
-    }
-    if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) {
-        return STATUS_BAD_TENSOR_STRIDES;
-    }
-    if (c->dt != F16 && c->dt != F32) {
-        return STATUS_BAD_TENSOR_DTYPE;
-    }
-    if (c->dt != a->dt || c->dt != b->dt) {
-        return STATUS_BAD_TENSOR_DTYPE;
-    }
-    bool broadcasted = false;
-    if (ndim != a->ndim || ndim != b->ndim) {
-        broadcasted = true;
-    } else {
-        for (uint64_t i = 0; i < ndim; ++i) {
-            if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) {
-                broadcasted = true;
-                break;
-            }
-        }
-    }
-
-    uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies<uint64_t>());
-
-    // get the adjusted strides for a and b
-    int64_t *a_strides = new int64_t[ndim];
-    int64_t *b_strides = new int64_t[ndim];
-    for (size_t i = 0; i < ndim; ++i) {
-        a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
-        b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
-    }
-
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, handle->device_id);
-
-    int64_t *a_strides_d, *b_strides_d, *c_strides_d;
-    checkCudaErrorWithCode(cudaMalloc(&a_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED);
-    checkCudaErrorWithCode(cudaMalloc(&b_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED);
-    checkCudaErrorWithCode(cudaMalloc(&c_strides_d, ndim * sizeof(int64_t)), STATUS_MEMORY_NOT_ALLOCATED);
-    checkCudaErrorWithCode(cudaMemcpy(a_strides_d, a_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
-    checkCudaErrorWithCode(cudaMemcpy(b_strides_d, b_strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
-    checkCudaErrorWithCode(cudaMemcpy(c_strides_d, c->strides, ndim * sizeof(int64_t), cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
-
-    *desc_ptr = new AddCudaDescriptor{
-        DevNvGpu,
-        c->dt,
-        handle->device_id,
-        ndim,
-        c_data_size,
-        static_cast<uint64_t>(prop.maxGridSize[0]),
-        a_strides_d,
-        b_strides_d,
-        c_strides_d,
-        broadcasted,
-    };
-
-    delete[] a_strides;
-    delete[] b_strides;
-
-    return STATUS_SUCCESS;
-}
-
-infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc) {
-    checkCudaErrorWithCode(cudaFree((void *) desc->a_strides), STATUS_EXECUTION_FAILED);
-    checkCudaErrorWithCode(cudaFree((void *) desc->b_strides), STATUS_EXECUTION_FAILED);
-    checkCudaErrorWithCode(cudaFree((void *) desc->c_strides), STATUS_EXECUTION_FAILED);
-    delete desc;
-    return STATUS_SUCCESS;
-}
diff --git a/src/ops/add/cuda/add.cu b/src/ops/add/cuda/add.cu
deleted file mode 100644
index 9d9aefcb..00000000
--- a/src/ops/add/cuda/add.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-#include "../../../devices/cuda/common_cuda.h"
-#include "../../utils.h"
-#include "add.cuh"
-
-/**
- * @brief A templated vector struct that supports element-wise addition on arrays.
- *
- * @tparam T - The access data type for elements in the vector.
- * @tparam TComp - The computation data type used for arithmetic operations.
- * @tparam N - The number of elements of type T in the vector for a single access.
- */
-template <typename T, typename TComp, size_t N>
-struct vecN {
-    T data[N];
-
-    __device__ __forceinline__ vecN operator+(const vecN &other) const {
-        vecN result;
-
-        for (int i = 0; i < N; ++i) {
-            if constexpr (std::is_same<T, TComp>::value) {
-                result.data[i] = data[i] + other.data[i];
-            } else {
-                constexpr static size_t pack_size = sizeof(T) / sizeof(TComp);
-                auto data_ = reinterpret_cast<vecN<TComp, TComp, pack_size> *>(result.data);
-                data_[i] = std::move(reinterpret_cast<vecN<TComp, TComp, pack_size> const *>(data)[i] +
-                                     reinterpret_cast<vecN<TComp, TComp, pack_size> const *>(other.data)[i]);
-            }
-        }
-
-        return result;
-    }
-
-    __device__ __forceinline__ const T &operator[](size_t i) const {
-        return data[i];
-    }
-};
-
-template <typename Tdata, typename TdataCompute>
-__global__ void add(
-    Tdata *c,
-    const Tdata *a,
-    const Tdata *b,
-    const int64_t *a_strides,
-    const int64_t *b_strides,
-    const int64_t *c_strides,
-    uint64_t data_size,
-    uint64_t ndim,
-    uint64_t offset,
-    bool broadcasted,
-    unsigned pack_size) {
-    uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
-
-    if (idx < data_size) {
-        if (broadcasted) {
-            idx *= pack_size;
-            auto a_ = reinterpret_cast<const TdataCompute *>(a);
-            auto b_ = reinterpret_cast<const TdataCompute *>(b);
-            auto c_ = reinterpret_cast<TdataCompute *>(c);
-#pragma unroll
-            for (size_t i = 0; i < pack_size; ++i) {
-                auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides);
-                auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides);
-                c_[idx + i] = a_[a_idx] + b_[b_idx];
-            }
-            return;
-        }
-        c[idx] = a[idx] + b[idx];
-    }
-}
-
-template <typename Tdata, typename TdataCompute>
-void _add_nv_gpu(AddCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) {
-    if (data_size == 0) {
-        return;
-    }
-    dim3 blockDims = dim3(std::min(static_cast<uint64_t>(256), data_size));
-    dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size));
-    uint64_t step = gridDims.x * blockDims.x;
-
-    cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);
-
-#pragma unroll
-    for (uint64_t i = 0; i < data_size; i += step) {
-        add<Tdata, TdataCompute><<<gridDims, blockDims, 0, cuda_stream>>>(
-            c, a, b, desc->a_strides, desc->b_strides, desc->c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size);
-    }
-}
-
-template <typename Tdata, typename TdataCompute>
-infiniopStatus_t add_nv_gpu(AddCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) {
-    const auto data_size = desc->c_data_size / pack_size;
-    const auto a_vec = reinterpret_cast<const Tdata *>(a);
-    const auto b_vec = reinterpret_cast<const Tdata *>(b);
-    const auto c_vec = reinterpret_cast<Tdata *>(c);
-    _add_nv_gpu<Tdata, TdataCompute>(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream);
-
-    const auto remainder = desc->c_data_size % pack_size;
-    const auto a_ = reinterpret_cast<const TdataCompute *>(a);
-    const auto b_ = reinterpret_cast<const TdataCompute *>(b);
-    const auto c_ = reinterpret_cast<TdataCompute *>(c);
-    _add_nv_gpu<TdataCompute, TdataCompute>(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream);
-    return STATUS_SUCCESS;
-}
-
-infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc,
-                         void *c, void const *a, void const *b,
-                         void *stream) {
-    checkCudaError(cudaSetDevice(desc->device_id));
-    if (desc->dtype == F16) {
-        return add_nv_gpu<vecN<half2, half, 4>, half>(desc, c, a, b, stream, 8);
-    }
-    if (desc->dtype == F32) {
-        return add_nv_gpu<vecN<float2, float, 2>, float>(desc, c, a, b, stream, 4);
-    }
-    return STATUS_BAD_TENSOR_DTYPE;
-}
diff --git a/src/ops/add/cuda/add.cuh b/src/ops/add/cuda/add.cuh
deleted file mode 100644
index 03a181eb..00000000
--- a/src/ops/add/cuda/add.cuh
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __CUDA_ADD_H__
-#define __CUDA_ADD_H__
-
-#include "../../../devices/cuda/common_cuda.h"
-#include "../../../devices/cuda/cuda_handle.h"
-#include "operators.h"
-#include <cstdint>
-#include <numeric>
-
-struct AddCudaDescriptor {
-    Device device;
-    DT 
dtype; - int device_id; - uint64_t ndim; - uint64_t c_data_size; - uint64_t max_grid_size; - int64_t const *a_strides; - int64_t const *b_strides; - int64_t const *c_strides; - bool broadcasted; -}; - -typedef struct AddCudaDescriptor *AddCudaDescriptor_t; - -infiniopStatus_t cudaCreateAddDescriptor(CudaHandle_t, - AddCudaDescriptor_t *, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b); - -infiniopStatus_t cudaAdd(AddCudaDescriptor_t desc, - void *c, void const *a, void const *b, - void *stream); - -infiniopStatus_t cudaDestroyAddDescriptor(AddCudaDescriptor_t desc); - -#endif diff --git a/src/ops/add/operator.cc b/src/ops/add/operator.cc index c2a30ea8..b2c51f70 100644 --- a/src/ops/add/operator.cc +++ b/src/ops/add/operator.cc @@ -1,73 +1,44 @@ +#include "../binary/binary.h" #include "../utils.h" -#include "operators.h" #include "ops/add/add.h" +#include "tensor/tensor_descriptor.h" -#ifdef ENABLE_CPU -#include "cpu/add_cpu.h" -#endif -#ifdef ENABLE_NV_GPU -#include "../../devices/cuda/cuda_handle.h" -#include "cuda/add.cuh" -#endif +struct _AddDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; -__C infiniopStatus_t infiniopCreateAddDescriptor( - infiniopHandle_t handle, - infiniopAddDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t c, - infiniopTensorDescriptor_t a, - infiniopTensorDescriptor_t b) { - switch (handle->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuCreateAddDescriptor(handle, (AddCpuDescriptor_t *) desc_ptr, c, a, b); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaCreateAddDescriptor((CudaHandle_t) handle, (AddCudaDescriptor_t *) desc_ptr, c, a, b); - } +typedef struct _AddDescriptor *_AddDescriptor_t; -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; -} +__C __export infiniopStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, + infiniopAddDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Add), STATUS_SUCCESS); -__C infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { - switch (desc->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuAdd((AddCpuDescriptor_t) desc, c, a, b, stream); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaAdd((AddCudaDescriptor_t) desc, c, a, b, stream); - } + *(_AddDescriptor_t *) desc_ptr = new _AddDescriptor{ + handle->device, + binary_desc, + }; -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; + return STATUS_SUCCESS; } -__C infiniopStatus_t infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) { - switch (desc->device) { -#ifdef ENABLE_CPU - case DevCpu: - return cpuDestroyAddDescriptor((AddCpuDescriptor_t) desc); -#endif -#ifdef ENABLE_NV_GPU - case DevNvGpu: { - return cudaDestroyAddDescriptor((AddCudaDescriptor_t) desc); - } +__C __export infiniopStatus_t infiniopAdd(infiniopAddDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_AddDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} -#endif -#ifdef ENABLE_CAMBRICON_MLU - // TODO -#endif - } - return STATUS_BAD_DEVICE; +__C __export infiniopStatus_t 
infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
+    CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_AddDescriptor_t) desc)->binary_desc), STATUS_SUCCESS);
+    delete desc;
+    return STATUS_SUCCESS;
 }
diff --git a/src/ops/binary/binary.h b/src/ops/binary/binary.h
new file mode 100644
index 00000000..1e953b51
--- /dev/null
+++ b/src/ops/binary/binary.h
@@ -0,0 +1,72 @@
+#ifndef BINARY_H
+#define BINARY_H
+
+#include "export.h"
+#include "operators.h"
+#include <cstddef>
+
+typedef struct BinaryDescriptor {
+    Device device;
+} BinaryDescriptor;
+
+typedef BinaryDescriptor *infiniopBinaryDescriptor_t;
+
+/**
+ * @brief Represents all the currently defined binary operations.
+ */
+struct BinaryMode {
+    /**
+     * Note: new operation types must be added **BEFORE** "Count"; "Count" must
+     * remain the last element. New arithmetic operations belong in the
+     * 'Arithmetic operations' section, i.e., before the 'Logical operations:' section.
+     */
+    enum Mode {
+        // Arithmetic operations:
+        Add,
+        Subtract,
+        Multiply,
+        Divide,
+        Pow,
+        Mod,
+        Max,
+        Min,
+        BitwiseAnd,
+        BitwiseOr,
+        BitwiseXor,
+
+        // Logical operations:
+        // **TODO: not currently supported** (requires a Boolean data type)
+        And,
+        Or,
+        Xor,
+        Less,
+        LessOrEqual,
+        Equal,
+        Greater,
+        GreaterOrEqual,
+
+        Count, ///< Number of binary operation types (marker for counting purposes).
+    };
+
+    // The total number of defined binary operations.
+    static const size_t numBinaryMode = Count;
+};
+
+__C infiniopStatus_t infiniopCreateBinaryDescriptor(infiniopHandle_t handle,
+                                                    infiniopBinaryDescriptor_t *desc_ptr,
+                                                    infiniopTensorDescriptor_t c,
+                                                    infiniopTensorDescriptor_t a,
+                                                    infiniopTensorDescriptor_t b,
+                                                    int mode);
+
+__C infiniopStatus_t infiniopBinary(infiniopBinaryDescriptor_t desc,
+                                    void *c,
+                                    void const *a,
+                                    void const *b,
+                                    void *stream);
+
+__C infiniopStatus_t infiniopDestroyBinaryDescriptor(infiniopBinaryDescriptor_t desc);
+
+#endif
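Every per-operator file in this patch is a thin shim over this one internal interface. As a minimal sketch of driving it directly (assuming a valid infiniopHandle_t named handle, tensor descriptors c, a, b, and device buffers c_data, a_data, b_data created elsewhere):

    infiniopBinaryDescriptor_t binary_desc;
    // Shape, dtype, and broadcast validation happens once, at descriptor creation.
    CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c, a, b, BinaryMode::Multiply), STATUS_SUCCESS);
    // Compute c = a * b elementwise; stream may be null for the CPU backend.
    CHECK_STATUS(infiniopBinary(binary_desc, c_data, a_data, b_data, stream), STATUS_SUCCESS);
    CHECK_STATUS(infiniopDestroyBinaryDescriptor(binary_desc), STATUS_SUCCESS);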
diff --git a/src/ops/binary/cpu/binary_cpu.cc b/src/ops/binary/cpu/binary_cpu.cc
new file mode 100644
index 00000000..6ee50655
--- /dev/null
+++ b/src/ops/binary/cpu/binary_cpu.cc
@@ -0,0 +1,176 @@
+#include "binary_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../utils.h"
+
+infiniopStatus_t cpuCreateBinaryDescriptor(infiniopHandle_t,
+                                           BinaryCpuDescriptor_t *desc_ptr,
+                                           infiniopTensorDescriptor_t c,
+                                           infiniopTensorDescriptor_t a,
+                                           infiniopTensorDescriptor_t b,
+                                           int mode) {
+    uint64_t ndim = c->ndim;
+    if (!isValidBroadcastShape(a, b, c)) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (c->dt != F16 && c->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (c->dt != a->dt || c->dt != b->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    bool broadcasted = false;
+    if (ndim != a->ndim || ndim != b->ndim) {
+        broadcasted = true;
+    } else {
+        for (uint64_t i = 0; i < ndim; ++i) {
+            if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) {
+                broadcasted = true;
+                break;
+            }
+        }
+    }
+    if (mode < 0 || mode >= BinaryMode::numBinaryMode) {
+        return STATUS_BAD_PARAM;
+    }
+    // bitwise operations are only valid for integral types
+    if (c->dt.exponent != 0 && (mode == BinaryMode::BitwiseAnd || mode == BinaryMode::BitwiseOr || mode == BinaryMode::BitwiseXor)) {
+        return STATUS_BAD_PARAM;
+    }
+
+    uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    int64_t *a_strides = nullptr;
+    int64_t *b_strides = nullptr;
+    int64_t *c_strides = nullptr;
+    uint64_t *c_shape = new uint64_t[ndim];
+    std::copy(c->shape, c->shape + ndim, c_shape);
+
+    if (broadcasted) {
+        // get the adjusted strides for a and b: a dimension that is missing from
+        // an input or broadcast (size 1) gets stride 0, so it never advances the offset
+        a_strides = new int64_t[ndim];
+        b_strides = new int64_t[ndim];
+        c_strides = new int64_t[ndim];
+        for (size_t i = 0; i < ndim; ++i) {
+            a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
+            b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
+            c_strides[i] = c->strides[i];
+        }
+    }
+
+    *desc_ptr = new BinaryCpuDescriptor{
+        DevCpu,
+        c->dt,
+        ndim,
+        c_data_size,
+        c_shape,
+        a_strides,
+        b_strides,
+        c_strides,
+        broadcasted,
+        mode,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuDestroyBinaryDescriptor(BinaryCpuDescriptor_t desc) {
+    delete[] desc->a_strides;
+    delete[] desc->b_strides;
+    delete[] desc->c_strides;
+    delete[] desc->c_shape;
+    delete desc;
+    return STATUS_SUCCESS;
+}
+
+// binary elementwise kernel that performs the actual binary computation
+template <typename Tdata>
+Tdata binary_kernel(const Tdata &a, const Tdata &b, int mode) {
+    switch (mode) {
+        // Arithmetic operations:
+        case BinaryMode::Add:
+            return a + b;
+        case BinaryMode::Subtract:
+            return a - b;
+        case BinaryMode::Multiply:
+            return a * b;
+        case BinaryMode::Divide:
+            return a / b;
+        case BinaryMode::Pow:
+            return std::pow(a, b);
+        case BinaryMode::Mod:
+            if constexpr (std::is_floating_point_v<Tdata>) {
+                return std::fmod(a, b);
+            } else {
+                return a % b;
+            }
+        case BinaryMode::Max:
+            if constexpr (std::is_floating_point_v<Tdata>) {
+                return std::fmax(a, b);
+            } else {
+                return std::max(a, b);
+            }
+        case BinaryMode::Min:
+            if constexpr (std::is_floating_point_v<Tdata>) {
+                return std::fmin(a, b);
+            } else {
+                return std::min(a, b);
+            }
+        case BinaryMode::BitwiseAnd:
+            if constexpr (std::is_integral_v<Tdata>) {
+                return a & b;
+            } else {
+                ASSERT(false);
+            }
+        case BinaryMode::BitwiseOr:
+            if constexpr (std::is_integral_v<Tdata>) {
+                return a | b;
+            } else {
+                ASSERT(false);
+            }
+        case BinaryMode::BitwiseXor:
+            if constexpr (std::is_integral_v<Tdata>) {
+                return a ^ b;
+            } else {
+                ASSERT(false);
+            }
+        // Logical operations:
+        default:
+            fprintf(stderr, "Unrecognized binary operation: %d\n", mode);
+            ASSERT(false);
+            return a;
+    }
+}
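The zero-stride trick in cpuCreateBinaryDescriptor is what makes broadcasting cheap at execution time. A self-contained sketch, equivalent in spirit to the getDstOffset helper used below (the real helper lives in the shared CPU/CUDA utilities; the shapes are hypothetical):

    #include <cstdint>

    // Decompose a flat output index against c's contiguous strides, then
    // re-accumulate the digits with the zero-padded input strides.
    inline uint64_t broadcastOffset(uint64_t flat, uint64_t ndim,
                                    const int64_t *c_strides, const int64_t *in_strides) {
        uint64_t offset = 0;
        for (uint64_t i = 0; i < ndim; ++i) {
            uint64_t s = static_cast<uint64_t>(c_strides[i]);
            offset += (flat / s) * static_cast<uint64_t>(in_strides[i]);
            flat %= s;
        }
        return offset;
    }

    // Example: c has shape (2, 3), so c_strides = {3, 1}; a has shape (3,) and
    // adjusted strides {0, 1}. Flat c index 4 is multi-index (1, 1), and
    // broadcastOffset(4, 2, {3, 1}, {0, 1}) == 1, so c[1][1] reads a[1].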
+
+template <typename Tdata>
+infiniopStatus_t binary_cpu(BinaryCpuDescriptor_t desc, void *c, void const *a, void const *b) {
+    auto a_ = reinterpret_cast<Tdata const *>(a);
+    auto b_ = reinterpret_cast<Tdata const *>(b);
+    auto c_ = reinterpret_cast<Tdata *>(c);
+
+#pragma omp parallel for
+    for (uint64_t i = 0; i < desc->c_data_size; ++i) {
+        auto a_index = desc->broadcasted ? getDstOffset(i, desc->ndim, desc->c_strides, desc->a_strides) : i;
+        auto b_index = desc->broadcasted ? getDstOffset(i, desc->ndim, desc->c_strides, desc->b_strides) : i;
+        if constexpr (std::is_same<Tdata, uint16_t>::value) {
+            // F16 is stored as uint16_t; widen to f32, compute, then narrow back.
+            c_[i] = f32_to_f16(binary_kernel(f16_to_f32(a_[a_index]), f16_to_f32(b_[b_index]), desc->mode));
+        } else {
+            c_[i] = binary_kernel(a_[a_index], b_[b_index], desc->mode);
+        }
+    }
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cpuBinary(BinaryCpuDescriptor_t desc,
+                           void *c, void const *a, void const *b,
+                           void *stream) {
+    if (desc->dtype == F16) {
+        return binary_cpu<uint16_t>(desc, c, a, b);
+    }
+    if (desc->dtype == F32) {
+        return binary_cpu<float>(desc, c, a, b);
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/binary/cpu/binary_cpu.h b/src/ops/binary/cpu/binary_cpu.h
new file mode 100644
index 00000000..b9502bd1
--- /dev/null
+++ b/src/ops/binary/cpu/binary_cpu.h
@@ -0,0 +1,37 @@
+#ifndef __CPU_BINARY_H__
+#define __CPU_BINARY_H__
+
+#include "../binary.h"
+#include "operators.h"
+#include <numeric>
+#include <type_traits>
+
+struct BinaryCpuDescriptor {
+    Device device;
+    DT dtype;
+    uint64_t ndim;
+    uint64_t c_data_size;
+    uint64_t const *c_shape;
+    int64_t const *a_strides;
+    int64_t const *b_strides;
+    int64_t const *c_strides;
+    bool broadcasted;
+    int mode;
+};
+
+typedef struct BinaryCpuDescriptor *BinaryCpuDescriptor_t;
+
+infiniopStatus_t cpuCreateBinaryDescriptor(infiniopHandle_t,
+                                           BinaryCpuDescriptor_t *,
+                                           infiniopTensorDescriptor_t c,
+                                           infiniopTensorDescriptor_t a,
+                                           infiniopTensorDescriptor_t b,
+                                           int mode);
+
+infiniopStatus_t cpuBinary(BinaryCpuDescriptor_t desc,
+                           void *c, void const *a, void const *b,
+                           void *stream);
+
+infiniopStatus_t cpuDestroyBinaryDescriptor(BinaryCpuDescriptor_t desc);
+
+#endif
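In the CUDA backend that follows, the three adjusted stride arrays are packed into a single device allocation of 3 * ndim int64 values rather than three separate ones; the kernel launcher later in this patch recovers them by offset (using the names from the code below):

    // Layout of strides_d on the device (one cudaMalloc of 3 * ndim * sizeof(int64_t)):
    // [ a_strides[0..ndim) | b_strides[0..ndim) | c_strides[0..ndim) ]
    const int64_t *a_strides = strides_d;             // begins at offset 0
    const int64_t *b_strides = strides_d + ndim;      // begins at offset ndim
    const int64_t *c_strides = strides_d + 2 * ndim;  // begins at offset 2 * ndim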
diff --git a/src/ops/binary/cuda/binary.cc b/src/ops/binary/cuda/binary.cc
new file mode 100644
index 00000000..9b91ea1f
--- /dev/null
+++ b/src/ops/binary/cuda/binary.cc
@@ -0,0 +1,82 @@
+#include "binary.cuh"
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+
+infiniopStatus_t cudaCreateBinaryDescriptor(CudaHandle_t handle,
+                                            BinaryCudaDescriptor_t *desc_ptr,
+                                            infiniopTensorDescriptor_t c,
+                                            infiniopTensorDescriptor_t a,
+                                            infiniopTensorDescriptor_t b,
+                                            int mode) {
+    uint64_t ndim = c->ndim;
+    if (!isValidBroadcastShape(a, b, c)) {
+        return STATUS_BAD_TENSOR_SHAPE;
+    }
+    if (!is_contiguous(a) || !is_contiguous(b) || !is_contiguous(c)) {
+        return STATUS_BAD_TENSOR_STRIDES;
+    }
+    if (c->dt != F16 && c->dt != F32) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    if (c->dt != a->dt || c->dt != b->dt) {
+        return STATUS_BAD_TENSOR_DTYPE;
+    }
+    bool broadcasted = false;
+    if (ndim != a->ndim || ndim != b->ndim) {
+        broadcasted = true;
+    } else {
+        for (uint64_t i = 0; i < ndim; ++i) {
+            if (c->shape[i] != a->shape[i] || c->shape[i] != b->shape[i]) {
+                broadcasted = true;
+                break;
+            }
+        }
+    }
+    if (mode < 0 || mode >= BinaryMode::numBinaryMode) {
+        return STATUS_BAD_PARAM;
+    }
+    // bitwise operations are only valid for integral types
+    if (c->dt.exponent != 0 && (mode == BinaryMode::BitwiseAnd || mode == BinaryMode::BitwiseOr || mode == BinaryMode::BitwiseXor)) {
+        return STATUS_BAD_PARAM;
+    }
+
+    uint64_t c_data_size = std::accumulate(c->shape, c->shape + c->ndim, 1ULL, std::multiplies<uint64_t>());
+
+    char *strides_d = nullptr;
+    size_t stride_arr_size = ndim * sizeof(int64_t);
+    if (broadcasted) {
+        // get the adjusted strides for a and b
+        int64_t a_strides[ndim];
+        int64_t b_strides[ndim];
+        for (size_t i = 0; i < ndim; ++i) {
+            a_strides[i] = (i < ndim - a->ndim || c->shape[i] != a->shape[i + a->ndim - ndim]) ? 0 : a->strides[i + a->ndim - ndim];
+            b_strides[i] = (i < ndim - b->ndim || c->shape[i] != b->shape[i + b->ndim - ndim]) ? 0 : b->strides[i + b->ndim - ndim];
+        }
+
+        // malloc and copy the strides to the device
+        checkCudaErrorWithCode(cudaMalloc(&strides_d, 3 * stride_arr_size), STATUS_MEMORY_NOT_ALLOCATED);
+        checkCudaErrorWithCode(cudaMemcpy(strides_d, a_strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
+        checkCudaErrorWithCode(cudaMemcpy(strides_d + stride_arr_size, b_strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
+        checkCudaErrorWithCode(cudaMemcpy(strides_d + 2 * stride_arr_size, c->strides, stride_arr_size, cudaMemcpyHostToDevice), STATUS_EXECUTION_FAILED);
+    }
+
+    *desc_ptr = new BinaryCudaDescriptor{
+        DevNvGpu,
+        c->dt,
+        handle->device_id,
+        ndim,
+        c_data_size,
+        static_cast<uint64_t>(handle->prop.maxGridSize[0]),
+        reinterpret_cast<int64_t const *>(strides_d),
+        broadcasted,
+        mode,
+    };
+
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cudaDestroyBinaryDescriptor(BinaryCudaDescriptor_t desc) {
+    checkCudaErrorWithCode(cudaFree((void *) desc->strides_d), STATUS_EXECUTION_FAILED);
+    delete desc;
+    return STATUS_SUCCESS;
+}
diff --git a/src/ops/binary/cuda/binary.cu b/src/ops/binary/cuda/binary.cu
new file mode 100644
index 00000000..68ea712a
--- /dev/null
+++ b/src/ops/binary/cuda/binary.cu
@@ -0,0 +1,329 @@
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../utils.h"
+#include "binary.cuh"
+#include <cmath>
+
+// Forward declaration so that vecN::compute can dispatch back into binary_op.
+template <typename Tdata>
+__device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int mode);
+
+/**
+ * @brief A templated vector type that supports binary elementwise operations.
+ *
+ * @tparam T - The access data type for elements in the vector.
+ * @tparam TComp - The computation data type used for arithmetic operations.
+ * @tparam N - The number of elements of type T in the vector for a single access.
+ */
+template <typename T, typename TComp, size_t N>
+struct vecN {
+    T data[N];
+    constexpr static size_t pack_size = sizeof(T) / sizeof(TComp);
+
+    __device__ __forceinline__ vecN compute(const vecN &other, BinaryMode::Mode mode) const {
+        vecN result;
+#pragma unroll
+        for (int i = 0; i < N; ++i) {
+            if constexpr (std::is_same_v<T, TComp>) {
+                result[i] = binary_op(data[i], other[i], mode);
+            } else {
+                auto &data_ = reinterpret_cast<vecN<TComp, TComp, pack_size> *>(result.data)[i];
+                const auto this_data = reinterpret_cast<const vecN<TComp, TComp, pack_size> *>(data)[i];
+                const auto other_data = reinterpret_cast<const vecN<TComp, TComp, pack_size> *>(other.data)[i];
+                data_ = binary_op(this_data, other_data, mode);
+            }
+        }
+        return result;
+    }
+
+    // Overloaded operators that call the compute method with the appropriate mode.
+    __device__ __forceinline__ vecN operator+(const vecN &other) const {
+        return compute(other, BinaryMode::Add);
+    }
+
+    __device__ __forceinline__ vecN operator-(const vecN &other) const {
+        return compute(other, BinaryMode::Subtract);
+    }
+
+    __device__ __forceinline__ vecN operator*(const vecN &other) const {
+        return compute(other, BinaryMode::Multiply);
+    }
+
+    __device__ __forceinline__ vecN operator/(const vecN &other) const {
+        return compute(other, BinaryMode::Divide);
+    }
+
+    __device__ __forceinline__ vecN operator&(const vecN &other) const {
+        return compute(other, BinaryMode::BitwiseAnd);
+    }
+
+    __device__ __forceinline__ vecN operator|(const vecN &other) const {
+        return compute(other, BinaryMode::BitwiseOr);
+    }
+
+    __device__ __forceinline__ vecN operator^(const vecN &other) const {
+        return compute(other, BinaryMode::BitwiseXor);
+    }
+
+    __device__ __forceinline__ vecN pow(const vecN &other) const {
+        return compute(other, BinaryMode::Pow);
+    }
+
+    __device__ __forceinline__ vecN mod(const vecN &other) const {
+        return compute(other, BinaryMode::Mod);
+    }
+
+    __device__ __forceinline__ vecN max(const vecN &other) const {
+        return compute(other, BinaryMode::Max);
+    }
+
+    __device__ __forceinline__ vecN min(const vecN &other) const {
+        return compute(other, BinaryMode::Min);
+    }
+
+    __device__ __forceinline__ const T &operator[](size_t i) const {
+        return data[i];
+    }
+
+    __device__ __forceinline__ T &operator[](size_t i) {
+        return data[i];
+    }
+};
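How much data one vecN access moves is fixed entirely by the sizeof arithmetic above; for instance, with an illustrative instantiation (cudaBinary below instantiates several variants):

    // vecN<half2, half, 4>: 4 accesses of half2, each covering
    // pack_size = sizeof(half2) / sizeof(half) = 2 computation elements,
    // i.e. 8 halves (16 bytes) per vectorized load/store.
    using PackedHalf = vecN<half2, half, 4>;
    static_assert(sizeof(PackedHalf) == 8 * sizeof(half), "one access spans 8 half elements");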
+
+template <typename Tdata>
+__device__ __forceinline__ Tdata binary_op(const Tdata &a, const Tdata &b, int mode) {
+    switch (mode) {
+        // Arithmetic operations:
+        case BinaryMode::Add:
+            if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hadd2_rn(a, b);
+            } else if constexpr (std::is_same_v<Tdata, float>) {
+                return __fadd_rn(a, b);
+            } else {
+                return a + b;
+            }
+        case BinaryMode::Subtract:
+            if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hsub2(a, b);
+            } else if constexpr (std::is_same_v<Tdata, float>) {
+                return __fsub_rn(a, b);
+            } else {
+                return a - b;
+            }
+        case BinaryMode::Multiply:
+            if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmul2(a, b);
+            } else if constexpr (std::is_same_v<Tdata, float>) {
+                return __fmul_rd(a, b);
+            } else {
+                return a * b;
+            }
+        case BinaryMode::Divide:
+            if constexpr (std::is_same_v<Tdata, half2>) {
+                return __h2div(a, b);
+            } else if constexpr (std::is_same_v<Tdata, float>) {
+                return __fdividef(a, b);
+            } else {
+                return a / b;
+            }
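+        // __powf is a fast-math approximation and may return NaN where the exact
+        // result is well-defined (e.g. a negative base with an integral exponent);
+        // the half path below therefore retries with std::pow when __powf yields NaN.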
+        case BinaryMode::Pow:
+            if constexpr (std::is_arithmetic_v<Tdata>) {
+                if constexpr (std::is_same_v<Tdata, float>) {
+                    return __powf(a, b);
+                } else {
+                    return std::pow(a, b);
+                }
+            } else if constexpr (std::is_same_v<Tdata, half>) {
+                float a_ = __half2float(a);
+                float b_ = __half2float(b);
+                float ans_f = __powf(a_, b_);
+                return __float2half(isnan(ans_f) ? std::pow(a_, b_) : ans_f);
+            } else if constexpr (!std::is_same_v<Tdata, half2>) {
+                return a.pow(b);
+            }
+            return a;
+        case BinaryMode::Mod:
+            if constexpr (std::is_floating_point_v<Tdata>) {
+                return std::fmod(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half>) {
+                return __float2half(std::fmod(__half2float(a), __half2float(b)));
+            } else if constexpr (std::is_arithmetic_v<Tdata>) {
+                return a % b;
+            } else if constexpr (!std::is_same_v<Tdata, half2>) {
+                return a.mod(b);
+            }
+            return a;
+        case BinaryMode::Max:
+            if constexpr (std::is_fundamental_v<Tdata>) {
+                if constexpr (std::is_floating_point_v<Tdata>) {
+                    return std::fmax(a, b);
+                } else {
+                    return std::max(a, b);
+                }
+            } else if constexpr (std::is_same_v<Tdata, half>) {
+                return __hmax(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmax2(a, b);
+            } else {
+                return a.max(b);
+            }
+        case BinaryMode::Min:
+            if constexpr (std::is_fundamental_v<Tdata>) {
+                if constexpr (std::is_floating_point_v<Tdata>) {
+                    return std::fmin(a, b);
+                } else {
+                    return std::min(a, b);
+                }
+            } else if constexpr (std::is_same_v<Tdata, half>) {
+                return __hmin(a, b);
+            } else if constexpr (std::is_same_v<Tdata, half2>) {
+                return __hmin2(a, b);
+            } else {
+                return a.min(b);
+            }
+        case BinaryMode::BitwiseAnd:
+            if constexpr (std::is_integral_v<Tdata>) {
+                return a & b;
+            } else {
+                printf("Error: Non-integral input to bitwise operator\n");
+                return a;
+            }
+        case BinaryMode::BitwiseOr:
+            if constexpr (std::is_integral_v<Tdata>) {
+                return a | b;
+            } else {
+                printf("Error: Non-integral input to bitwise operator\n");
+                return a;
+            }
+        case BinaryMode::BitwiseXor:
+            if constexpr (std::is_integral_v<Tdata>) {
+                return a ^ b;
+            } else {
+                printf("Error: Non-integral input to bitwise operator\n");
+                return a;
+            }
+        // Logical operations:
+        // TODO: not currently supported
+        default:
+            printf("Unrecognized binary operation\n");
+            return a;
+    }
+}
+
+template <typename Tdata, typename TdataCompute>
+__global__ void binary(
+    Tdata *c,
+    const Tdata *a,
+    const Tdata *b,
+    const int64_t *__restrict__ a_strides,
+    const int64_t *__restrict__ b_strides,
+    const int64_t *__restrict__ c_strides,
+    uint64_t data_size,
+    uint64_t ndim,
+    uint64_t offset,
+    bool broadcasted,
+    unsigned pack_size,
+    const int mode) {
+    uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
+
+    if (idx < data_size) {
+        if (broadcasted) {
+            idx *= pack_size;
+            auto a_ = reinterpret_cast<const TdataCompute *>(a);
+            auto b_ = reinterpret_cast<const TdataCompute *>(b);
+            auto c_ = reinterpret_cast<TdataCompute *>(c);
+#pragma unroll
+            for (size_t i = 0; i < pack_size; ++i) {
+                auto a_idx = getDstOffset(idx + i, ndim, c_strides, a_strides);
+                auto b_idx = getDstOffset(idx + i, ndim, c_strides, b_strides);
+                c_[idx + i] = binary_op(a_[a_idx], b_[b_idx], mode);
+            }
+            return;
+        }
+        Tdata a_data = a[idx];
+        Tdata b_data = b[idx];
+        c[idx] = binary_op(a_data, b_data, mode);
+    }
+}
+
+template <typename Tdata, typename TdataCompute, unsigned int BLOCK_SIZE>
+void _binary_nv_gpu(BinaryCudaDescriptor_t desc, Tdata *c, Tdata const *a, Tdata const *b, uint64_t data_size, uint64_t pack_size, uint64_t offset, void *stream) {
+    if (data_size == 0) {
+        return;
+    }
+    dim3 blockDims = dim3(std::min(static_cast<uint64_t>(BLOCK_SIZE), data_size));
+    dim3 gridDims = dim3(std::min(ROUND_UP_DIV(data_size, blockDims.x), desc->max_grid_size));
+    uint64_t step = gridDims.x * blockDims.x;
+
+    const int64_t *a_strides = nullptr;
+    const int64_t *b_strides = nullptr;
+    const int64_t *c_strides = nullptr;
+    if (desc->broadcasted) {
+        a_strides = desc->strides_d;
+        b_strides = desc->strides_d + desc->ndim;
+        c_strides = desc->strides_d + 2 * desc->ndim;
+    }
+    cudaStream_t cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    for (uint64_t i = 0; i < data_size; i += step) {
+        binary<Tdata, TdataCompute><<<gridDims, blockDims, 0, cuda_stream>>>(
+            c, a, b, a_strides, b_strides, c_strides, offset + data_size, desc->ndim, offset + i, desc->broadcasted, pack_size, desc->mode);
+    }
+}
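The pack/remainder split performed by binary_nv_gpu below is plain integer arithmetic; for example, with c_data_size = 1030 and pack_size = 8:

    // data_size = 1030 / 8 = 128 packed accesses, covering elements [0, 1024)
    // remainder = 1030 % 8 = 6 scalar elements, processed starting at offset 128 * 8 = 1024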
+
+template <typename Tdata, typename TdataCompute, unsigned int BLOCK_SIZE>
+infiniopStatus_t binary_nv_gpu(BinaryCudaDescriptor_t desc, void *c, void const *a, void const *b, void *stream, uint64_t pack_size) {
+    const auto data_size = desc->c_data_size / pack_size;
+    const auto a_vec = reinterpret_cast<const Tdata *>(a);
+    const auto b_vec = reinterpret_cast<const Tdata *>(b);
+    const auto c_vec = reinterpret_cast<Tdata *>(c);
+    _binary_nv_gpu<Tdata, TdataCompute, BLOCK_SIZE>(desc, c_vec, a_vec, b_vec, data_size, pack_size, 0, stream);
+
+    const auto remainder = desc->c_data_size % pack_size;
+    if (remainder > 0) {
+        const auto a_ = reinterpret_cast<const TdataCompute *>(a);
+        const auto b_ = reinterpret_cast<const TdataCompute *>(b);
+        const auto c_ = reinterpret_cast<TdataCompute *>(c);
+        _binary_nv_gpu<TdataCompute, TdataCompute, BLOCK_SIZE>(desc, c_, a_, b_, remainder, 1, data_size * pack_size, stream);
+    }
+    cudaDeviceSynchronize();
+    return STATUS_SUCCESS;
+}
+
+infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc,
+                            void *c, void const *a, void const *b,
+                            void *stream) {
+    checkCudaError(cudaSetDevice(desc->device_id));
+    if (desc->dtype == F16) {
+        switch (desc->mode) {
+            case BinaryMode::Add:
+            case BinaryMode::Subtract:
+            case BinaryMode::Multiply:
+            case BinaryMode::Divide:
+                return binary_nv_gpu<vecN<half2, half2, 4>, half, 256>(desc, c, a, b, stream, 8);
+            case BinaryMode::Pow:
+                return binary_nv_gpu<vecN<half2, half, 8>, half, 128>(desc, c, a, b, stream, 16);
+            case BinaryMode::Max:
+            case BinaryMode::Min:
+                return binary_nv_gpu<vecN<half2, half2, 4>, half, 256>(desc, c, a, b, stream, 8);
+            default:
+                return binary_nv_gpu<vecN<half2, half, 4>, half, 64>(desc, c, a, b, stream, 8);
+        }
+    }
+    if (desc->dtype == F32) {
+        switch (desc->mode) {
+            case BinaryMode::Add:
+            case BinaryMode::Subtract:
+            case BinaryMode::Multiply:
+            case BinaryMode::Divide:
+                return binary_nv_gpu<vecN<float, float, 4>, float, 256>(desc, c, a, b, stream, 4);
+            case BinaryMode::Pow:
+            case BinaryMode::Mod:
+                return binary_nv_gpu<vecN<float, float, 4>, float, 128>(desc, c, a, b, stream, 4);
+            case BinaryMode::Max:
+            case BinaryMode::Min:
+                return binary_nv_gpu<vecN<float, float, 4>, float, 64>(desc, c, a, b, stream, 4);
+            default:
+                return binary_nv_gpu<vecN<float, float, 4>, float, 256>(desc, c, a, b, stream, 4);
        }
+    }
+    return STATUS_BAD_TENSOR_DTYPE;
+}
diff --git a/src/ops/binary/cuda/binary.cuh b/src/ops/binary/cuda/binary.cuh
new file mode 100644
index 00000000..efcc8adc
--- /dev/null
+++ b/src/ops/binary/cuda/binary.cuh
@@ -0,0 +1,38 @@
+#ifndef __CUDA_BINARY_H__
+#define __CUDA_BINARY_H__
+
+#include "../../../devices/cuda/common_cuda.h"
+#include "../../../devices/cuda/cuda_handle.h"
+#include "../binary.h"
+#include "operators.h"
+#include <cstdint>
+#include <numeric>
+
+struct BinaryCudaDescriptor {
+    Device device;
+    DT dtype;
+    int device_id;
+    uint64_t ndim;
+    uint64_t c_data_size;
+    uint64_t max_grid_size;
+    int64_t const *strides_d;
+    bool broadcasted;
+    int mode;
+};
+
+typedef struct BinaryCudaDescriptor *BinaryCudaDescriptor_t;
+
+infiniopStatus_t cudaCreateBinaryDescriptor(CudaHandle_t,
+                                            BinaryCudaDescriptor_t *,
+                                            infiniopTensorDescriptor_t c,
+                                            infiniopTensorDescriptor_t a,
+                                            infiniopTensorDescriptor_t b,
+                                            int mode);
+
+infiniopStatus_t cudaBinary(BinaryCudaDescriptor_t desc,
+                            void *c, void const *a, void const *b,
+                            void *stream);
+
+infiniopStatus_t cudaDestroyBinaryDescriptor(BinaryCudaDescriptor_t desc);
+
+#endif
diff --git a/src/ops/binary/operator.cc b/src/ops/binary/operator.cc
new file mode 100644
index 00000000..5ad7b94f
--- /dev/null
+++ b/src/ops/binary/operator.cc
@@ -0,0 +1,74 @@
+#include "../utils.h"
+#include "binary.h"
+#include "operators.h"
+
+#ifdef ENABLE_CPU
+#include "cpu/binary_cpu.h"
+#endif
+#ifdef ENABLE_NV_GPU
+#include 
"../../devices/cuda/cuda_handle.h" +#include "cuda/binary.cuh" +#endif + +__C infiniopStatus_t infiniopCreateBinaryDescriptor( + infiniopHandle_t handle, + infiniopBinaryDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + int mode) { + switch (handle->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuCreateBinaryDescriptor(handle, (BinaryCpuDescriptor_t *) desc_ptr, c, a, b, mode); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaCreateBinaryDescriptor((CudaHandle_t) handle, (BinaryCudaDescriptor_t *) desc_ptr, c, a, b, mode); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopBinary(infiniopBinaryDescriptor_t desc, void *c, void const *a, void const *b, void *stream) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuBinary((BinaryCpuDescriptor_t) desc, c, a, b, stream); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaBinary((BinaryCudaDescriptor_t) desc, c, a, b, stream); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} + +__C infiniopStatus_t infiniopDestroyBinaryDescriptor(infiniopBinaryDescriptor_t desc) { + switch (desc->device) { +#ifdef ENABLE_CPU + case DevCpu: + return cpuDestroyBinaryDescriptor((BinaryCpuDescriptor_t) desc); +#endif +#ifdef ENABLE_NV_GPU + case DevNvGpu: { + return cudaDestroyBinaryDescriptor((BinaryCudaDescriptor_t) desc); + } + +#endif +#ifdef ENABLE_CAMBRICON_MLU + // TODO +#endif + } + return STATUS_BAD_DEVICE; +} diff --git a/src/ops/div/operator.cc b/src/ops/div/operator.cc new file mode 100644 index 00000000..794ffa05 --- /dev/null +++ b/src/ops/div/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/div/div.h" +#include "tensor/tensor_descriptor.h" + +struct _DivDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _DivDescriptor *_DivDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateDivDescriptor(infiniopHandle_t handle, + infiniopDivDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Divide), STATUS_SUCCESS); + + *(_DivDescriptor_t *) desc_ptr = new _DivDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDiv(infiniopDivDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_DivDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyDivDescriptor(infiniopDivDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_DivDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/max/operator.cc b/src/ops/max/operator.cc new file mode 100644 index 00000000..2e79eaa8 --- /dev/null +++ b/src/ops/max/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/max/max.h" +#include "tensor/tensor_descriptor.h" + +struct _MaxDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MaxDescriptor 
*_MaxDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMaxDescriptor(infiniopHandle_t handle, + infiniopMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Max), STATUS_SUCCESS); + + *(_MaxDescriptor_t *) desc_ptr = new _MaxDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMax(infiniopMaxDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MaxDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMaxDescriptor(infiniopMaxDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MaxDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/min/operator.cc b/src/ops/min/operator.cc new file mode 100644 index 00000000..f6fd0924 --- /dev/null +++ b/src/ops/min/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/min/min.h" +#include "tensor/tensor_descriptor.h" + +struct _MinDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MinDescriptor *_MinDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMinDescriptor(infiniopHandle_t handle, + infiniopMinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Min), STATUS_SUCCESS); + + *(_MinDescriptor_t *) desc_ptr = new _MinDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMin(infiniopMinDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MinDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMinDescriptor(infiniopMinDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MinDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mod/operator.cc b/src/ops/mod/operator.cc new file mode 100644 index 00000000..3b92d729 --- /dev/null +++ b/src/ops/mod/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/mod/mod.h" +#include "tensor/tensor_descriptor.h" + +struct _ModDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _ModDescriptor *_ModDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateModDescriptor(infiniopHandle_t handle, + infiniopModDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Mod), STATUS_SUCCESS); + + *(_ModDescriptor_t *) desc_ptr = new _ModDescriptor{ + handle->device, + binary_desc, + 
}; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMod(infiniopModDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_ModDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyModDescriptor(infiniopModDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_ModDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/mul/operator.cc b/src/ops/mul/operator.cc new file mode 100644 index 00000000..6c115c37 --- /dev/null +++ b/src/ops/mul/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/mul/mul.h" +#include "tensor/tensor_descriptor.h" + +struct _MulDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _MulDescriptor *_MulDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle, + infiniopMulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Multiply), STATUS_SUCCESS); + + *(_MulDescriptor_t *) desc_ptr = new _MulDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopMul(infiniopMulDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_MulDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_MulDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/pow/operator.cc b/src/ops/pow/operator.cc new file mode 100644 index 00000000..4a47f3e3 --- /dev/null +++ b/src/ops/pow/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/pow/pow.h" +#include "tensor/tensor_descriptor.h" + +struct _PowDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _PowDescriptor *_PowDescriptor_t; + +__C __export infiniopStatus_t infiniopCreatePowDescriptor(infiniopHandle_t handle, + infiniopPowDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Pow), STATUS_SUCCESS); + + *(_PowDescriptor_t *) desc_ptr = new _PowDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopPow(infiniopPowDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_PowDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroyPowDescriptor(infiniopPowDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_PowDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + 
delete desc; + return STATUS_SUCCESS; +} diff --git a/src/ops/sub/operator.cc b/src/ops/sub/operator.cc new file mode 100644 index 00000000..0943548d --- /dev/null +++ b/src/ops/sub/operator.cc @@ -0,0 +1,44 @@ +#include "../binary/binary.h" +#include "../utils.h" +#include "ops/sub/sub.h" +#include "tensor/tensor_descriptor.h" + +struct _SubDescriptor { + Device device; + infiniopBinaryDescriptor_t binary_desc; +}; + +typedef struct _SubDescriptor *_SubDescriptor_t; + +__C __export infiniopStatus_t infiniopCreateSubDescriptor(infiniopHandle_t handle, + infiniopSubDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc) { + // binary desc + infiniopBinaryDescriptor_t binary_desc; + CHECK_STATUS(infiniopCreateBinaryDescriptor(handle, &binary_desc, c_desc, a_desc, b_desc, BinaryMode::Subtract), STATUS_SUCCESS); + + *(_SubDescriptor_t *) desc_ptr = new _SubDescriptor{ + handle->device, + binary_desc, + }; + + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopSub(infiniopSubDescriptor_t desc, + void *c, + void const *a, + void const *b, + void *stream) { + auto _desc = (_SubDescriptor_t) desc; + CHECK_STATUS(infiniopBinary(_desc->binary_desc, c, a, b, stream), STATUS_SUCCESS); + return STATUS_SUCCESS; +} + +__C __export infiniopStatus_t infiniopDestroySubDescriptor(infiniopSubDescriptor_t desc) { + CHECK_STATUS(infiniopDestroyBinaryDescriptor(((_SubDescriptor_t) desc)->binary_desc), STATUS_SUCCESS); + delete desc; + return STATUS_SUCCESS; +}
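Taken together, every new public operator follows the same three-call pattern; a minimal end-to-end sketch for Sub (error checking elided; handle, the tensor descriptors c_desc/a_desc/b_desc, and the device buffers c_data/a_data/b_data are assumed to have been created elsewhere):

    infiniopSubDescriptor_t desc;
    infiniopCreateSubDescriptor(handle, &desc, c_desc, a_desc, b_desc);
    // Computes c = a - b elementwise, with the same broadcasting rules the test above exercises.
    infiniopSub(desc, c_data, a_data, b_data, stream);
    infiniopDestroySubDescriptor(desc);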