diff --git a/include/infiniop.h b/include/infiniop.h
index d51b8d92e..30a07e4b4 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -4,18 +4,27 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
 #include "infiniop/ops/attention.h"
+#include "infiniop/ops/cast.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
+#include "infiniop/ops/cos.h"
+#include "infiniop/ops/exp.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/hardswish.h"
+#include "infiniop/ops/leakyrelu.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/relu.h"
 #include "infiniop/ops/rms_norm.h"
 #include "infiniop/ops/rope.h"
+#include "infiniop/ops/sigmoid_backward.h"
+#include "infiniop/ops/sin.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
+#include "infiniop/ops/tanh.h"
+#include "infiniop/ops/where.h"
 #include "infiniop/tensor_descriptor.h"

 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h
new file mode 100644
index 000000000..81d771efe
--- /dev/null
+++ b/include/infiniop/ops/cast.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_CAST_API_H__
+#define __INFINIOP_CAST_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCastDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle,
+                                                         infiniopCastDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h
new file mode 100644
index 000000000..aeb551e77
--- /dev/null
+++ b/include/infiniop/ops/cos.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_COS_API_H__
+#define __INFINIOP_COS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCosDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle,
+                                                        infiniopCosDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *output,
+                                        const void *input,
+                                        void *stream);
+
+__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h
new file mode 100644
index 000000000..624bc5363
--- /dev/null
+++ b/include/infiniop/ops/exp.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_EXP_API_H__
+#define __INFINIOP_EXP_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopExpDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle,
+                                                        infiniopExpDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t
infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..8d655fe82 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/leakyrelu.h b/include/infiniop/ops/leakyrelu.h new file mode 100644 index 000000000..adc46d1c6 --- /dev/null +++ b/include/infiniop/ops/leakyrelu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LEAKYRELU_API_H__ +#define __INFINIOP_LEAKYRELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyreluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLeakyreluDescriptor(infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); + +__C __export infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..abab0cde7 --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..640deccc0 
--- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..742dba860 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..713db102f --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..02fd65c63 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,15 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "exp.py", + "sin.py", + "cos.py", + "leakyrelu.py", + "tanh.py", + "sigmoid_backward.py", + "hardswish.py", + "cast.py", + "where.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..c3a120703 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) 
DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(cos) +DECLARE_INFINIOP_TEST(leakyrelu) +DECLARE_INFINIOP_TEST(tanh) +DECLARE_INFINIOP_TEST(sigmoid_backward) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(cast) +DECLARE_INFINIOP_TEST(where) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +39,28 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward) \ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(cast) \ + REGISTER_INFINIOP_TEST(where) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp new file mode 100644 index 000000000..6547bc25a --- /dev/null +++ b/src/infiniop-test/src/ops/cast.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cast { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCastDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + 
size_t workspace_size; + CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCast( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cast diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp new file mode 100644 index 000000000..52de283af --- /dev/null +++ b/src/infiniop-test/src/ops/cos.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cos { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCosDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateCosDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCosWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return 
TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCos(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCos( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cos diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp new file mode 100644 index 000000000..070f8ef6b --- /dev/null +++ b/src/infiniop-test/src/ops/exp.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::exp { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopExpDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopExp(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, 
_attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopExp( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..0ccf4f52a --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::hardswish { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopHardswishDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopHardswish( + op_desc, 
workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp new file mode 100644 index 000000000..b7d9eb89c --- /dev/null +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -0,0 +1,123 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::leakyrelu { +struct Test::Attributes { + float negative_slope; + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (attributes.find("negative_slope") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->negative_slope = *reinterpret_cast(attributes["negative_slope"].data()); + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLeakyreluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateLeakyreluDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + _attributes->negative_slope), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetLeakyreluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + CHECK_OR(infiniopLeakyrelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + 
[=]() { + infiniopLeakyrelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"negative_slope"}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- negative_slope=" << _attributes->negative_slope << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp new file mode 100644 index 000000000..434dbf598 --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sigmoid_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSigmoidBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + 
+ try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSigmoidBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..e1406e588 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sin { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSinDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSin(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return 
TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSin( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..6f966de09 --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTanhDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTanh( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return 
TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..fea9cba92 --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr condition; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("condition") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->condition = tensors["condition"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->a->ggml_type(); + if (elemType == GGML_TYPE_I8) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopWhereDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto condition = _attributes->condition->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc(), + condition->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size, + c->data(), + 
a->data(), + b->data(), + condition->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopWhere( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "condition", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- condition: " << _attributes->condition->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::where diff --git a/src/infiniop/ops/cast/cast.h b/src/infiniop/ops/cast/cast.h new file mode 100644 index 000000000..5e66997cc --- /dev/null +++ b/src/infiniop/ops/cast/cast.h @@ -0,0 +1,48 @@ +#ifndef __CAST_H__ +#define __CAST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cast::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + CastInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + CastInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __CAST_H__ diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc new file mode 100644 index 000000000..36d2e9e28 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -0,0 +1,144 @@ +#include "cast_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::cast::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + + *desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline void cpu_cast_impl_incremental( + void *output, const void *input, const 
op::cast::CastInfo &info) {
+
+    const size_t ndim = info.shape.size();
+    const size_t n = info.n;
+
+    auto out_base = reinterpret_cast<Tout *>(output);
+    auto in_base = reinterpret_cast<const Tin *>(input);
+
+    const std::vector<size_t> &shape = info.shape;
+    const std::vector<ptrdiff_t> &in_stride = info.in_stride;
+    const std::vector<ptrdiff_t> &out_stride = info.out_stride;
+
+    if (n == 0) {
+        return;
+    }
+
+    std::vector<size_t> idx(ndim, 0);
+    ptrdiff_t in_off = 0;
+    ptrdiff_t out_off = 0;
+
+    for (size_t it = 0; it < n; ++it) {
+        const Tin *in_elem = in_base + in_off;
+        Tout *out_elem = out_base + out_off;
+        *out_elem = utils::cast<Tout>(*in_elem);
+
+        for (int d = static_cast<int>(ndim) - 1; d >= 0; --d) {
+            idx[d] += 1;
+            if (in_stride[d] != 0) {
+                in_off += in_stride[d];
+            }
+            if (out_stride[d] != 0) {
+                out_off += out_stride[d];
+            }
+
+            if (idx[d] < shape[d]) {
+                break;
+            } else {
+                idx[d] = 0;
+                if (in_stride[d] != 0) {
+                    in_off -= static_cast<ptrdiff_t>(shape[d]) * in_stride[d];
+                }
+                if (out_stride[d] != 0) {
+                    out_off -= static_cast<ptrdiff_t>(shape[d]) * out_stride[d];
+                }
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    if (output == const_cast<void *>(input)) {
+        return INFINI_STATUS_BAD_PARAM; // or INFINI_STATUS_INPLACE_NOT_SUPPORTED
+    }
+
+#define CASE_OUT(DT_OUT, TOUT)                                               \
+    case DT_OUT: {                                                           \
+        switch (_info.dt_in) {                                               \
+        case INFINI_DTYPE_I32:                                               \
+            cpu_cast_impl_incremental<TOUT, int32_t>(output, input, _info);  \
+            break;                                                           \
+        case INFINI_DTYPE_I64:                                               \
+            cpu_cast_impl_incremental<TOUT, int64_t>(output, input, _info);  \
+            break;                                                           \
+        case INFINI_DTYPE_U32:                                               \
+            cpu_cast_impl_incremental<TOUT, uint32_t>(output, input, _info); \
+            break;                                                           \
+        case INFINI_DTYPE_U64:                                               \
+            cpu_cast_impl_incremental<TOUT, uint64_t>(output, input, _info); \
+            break;                                                           \
+        case INFINI_DTYPE_F16:                                               \
+            cpu_cast_impl_incremental<TOUT, fp16_t>(output, input, _info);   \
+            break;                                                           \
+        case INFINI_DTYPE_F32:                                               \
+            cpu_cast_impl_incremental<TOUT, float>(output, input, _info);    \
+            break;                                                           \
+        case INFINI_DTYPE_F64:                                               \
+            cpu_cast_impl_incremental<TOUT, double>(output, input, _info);   \
+            break;                                                           \
+        default:                                                             \
+            return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;                  \
+        }                                                                    \
+        break;                                                               \
+    }
+
+    switch (_info.dt_out) {
+        CASE_OUT(INFINI_DTYPE_I32, int32_t);
+        CASE_OUT(INFINI_DTYPE_I64, int64_t);
+        CASE_OUT(INFINI_DTYPE_U32, uint32_t);
+        CASE_OUT(INFINI_DTYPE_U64, uint64_t);
+        CASE_OUT(INFINI_DTYPE_F16, fp16_t);
+        CASE_OUT(INFINI_DTYPE_F32, float);
+        CASE_OUT(INFINI_DTYPE_F64, double);
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CASE_OUT
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::cast::cpu
diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h
new file mode 100644
index 000000000..ca929a694
--- /dev/null
+++ b/src/infiniop/ops/cast/cpu/cast_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __CAST_CPU_H__
+#define __CAST_CPU_H__
+
+#include "../cast.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __CAST_CPU_H__
diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh
new file mode 100644
index 000000000..3736442a3
--- /dev/null
+++ b/src/infiniop/ops/cast/cuda/kernel.cuh
@@ -0,0 +1,79 @@
+#ifndef __CAST_CUDA_KERNEL_CUH__
+#define __CAST_CUDA_KERNEL_CUH__
+
+#include <cuda_fp16.h>
+#include <cstddef>
+#include <type_traits>
+
+template <typename Tout, typename Tin>
+__device__ __forceinline__ Tout device_cast(const Tin &v) {
+    if constexpr (std::is_same_v<Tout, half>) {
+        float f;
+        if constexpr (std::is_same_v<Tin, half>) {
+            f = __half2float(v);
+        } else {
+            f = static_cast<float>(v);
+        }
+        return __float2half_rn(f);
+    } else if constexpr (std::is_same_v<Tout, float>) {
+        if constexpr (std::is_same_v<Tin, half>) {
+            return __half2float(v);
+        } else {
+            return static_cast<float>(v);
+        }
+    } else if constexpr (std::is_same_v<Tout, double>) {
+        if constexpr (std::is_same_v<Tin, half>) {
+            return static_cast<double>(__half2float(v));
+        } else {
+            return static_cast<double>(v);
+        }
+    } else { // integer outputs
+        // convert via double/float then to integer (truncate)
+        if constexpr (std::is_same_v<Tin, half>) {
+            float f = __half2float(v);
+            return static_cast<Tout>(f);
+        } else {
+            return static_cast<Tout>(v);
+        }
+    }
+}
+
+template <typename ToutDev, typename TinDev>
+__global__ void cast_kernel(
+    ToutDev *__restrict__ out,
+    const TinDev *__restrict__ in,
+    size_t n,
+    const size_t *__restrict__ shape,
+    const size_t *__restrict__ div,
+    const long long *__restrict__ in_stride,
+    const long long *__restrict__ out_stride,
+    int ndim) {
+
+    size_t gid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    size_t grid_stride = static_cast<size_t>(blockDim.x) * gridDim.x;
+
+    for (size_t linear = gid; linear < n; linear += grid_stride) {
+        unsigned long long rem = linear;
+        long long in_off = 0;
+        long long out_off = 0;
+        for (int d = 0; d < ndim; ++d) {
+            unsigned long long idx_d = 0;
+            size_t divisor = div[d];
+            if (divisor != 0) {
+                idx_d = rem / divisor;
+                rem = rem % divisor;
+            } else {
+                idx_d = 0;
+            }
+            if (in_stride[d] != 0) {
+                in_off += static_cast<long long>(idx_d) * in_stride[d];
+            }
+            if (out_stride[d] != 0) {
+                out_off += static_cast<long long>(idx_d) * out_stride[d];
+            }
+        }
+        out[static_cast<size_t>(out_off)] = device_cast<ToutDev, TinDev>(in[static_cast<size_t>(in_off)]);
+    }
+}
+
+#endif // __CAST_CUDA_KERNEL_CUH__
diff --git a/src/infiniop/ops/cast/info.h b/src/infiniop/ops/cast/info.h
new file mode 100644
index 000000000..8f85f6da8
--- /dev/null
+++ b/src/infiniop/ops/cast/info.h
@@ -0,0 +1,60 @@
+#ifndef __CAST_INFO_H__
+#define __CAST_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::cast {
+
+class CastInfo {
+    CastInfo() = default;
+
+public:
+    infiniDtype_t dt_in;
+    infiniDtype_t dt_out;
+    std::vector<size_t> shape;
+    std::vector<ptrdiff_t> in_stride;
+    std::vector<ptrdiff_t> out_stride;
+    size_t n;
+
+    static utils::Result<CastInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc) {
+
+        auto dt_out = out_desc->dtype();
+        auto dt_in = in_desc->dtype();
+
+        CHECK_DTYPE(dt_in,
+                    INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                    INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                    INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+        CHECK_DTYPE(dt_out,
+                    INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                    INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                    INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+        CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE);
+        for (size_t i = 0; i < out_desc->ndim(); ++i) {
+            CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE);
+        }
+
+        size_t n = 1;
+        for (size_t i = 0; i < in_desc->ndim(); ++i) {
+            n *= static_cast<size_t>(in_desc->dim(i));
+        }
+
+        return utils::Result<CastInfo>(CastInfo{
+            dt_in,
+            dt_out,
+            out_desc->shape(),
+            in_desc->strides(),
+            out_desc->strides(),
+            n,
+        });
+    }
+};
+
+} // namespace op::cast
+
+#endif // __CAST_INFO_H__
diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h
new file mode 100644
index 000000000..5ba92911e
--- /dev/null
+++ b/src/infiniop/ops/cast/metax/cast_metax.h
@@ -0,0 +1,8 @@
+#ifndef __CAST_METAX_API_H__
+#define __CAST_METAX_API_H__
+
+#include "../cast.h"
+
+DESCRIPTOR(metax)
+
+#endif // __CAST_METAX_API_H__
diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca
new file mode 100644
index 000000000..4b2103da3
--- /dev/null
+++ b/src/infiniop/ops/cast/metax/cast_metax.maca
@@
-0,0 +1,201 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../cast.h" +#include "cast_metax.h" +#include "../info.h" + +namespace op::cast::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevTout = typename MapHcType::Type; + using DevTin = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + + err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / 
bs); + if (grid <= 0) grid = 1; + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::metax diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 000000000..8e7eea473 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,239 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../cast.h" +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "cast_nvidia.cuh" +#include +#include +#include +#include + +namespace op::cast::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + 
int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevTout = typename MapCudaType::Type; + using DevTin = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) { + bs = 256; + } + } + } else { + if (bs > 256) { + bs = 256; + } + } + + if (bs <= 0) { + bs = 256; + } + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) { + grid = 1; + } + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + goto cleanup; + } + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) { + goto cleanup; + } + + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + +#define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + 
cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::nvidia diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..032e1fb2e --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CAST_CUDA_API_H__ +#define __CAST_CUDA_API_H__ + +#include "../cast.h" + +DESCRIPTOR(nvidia) + +#endif // __CAST_CUDA_API_H__ diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..12d26953b --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + 
size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..f5d27ec49 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,52 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..af324eb80 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -0,0 +1,21 @@ +#ifndef __COS_CPU_H__ +#define __COS_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(cos, cpu) + +namespace op::cos::cpu { +typedef struct CosOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::cos(input); + } +} CosOp; +} // namespace op::cos::cpu + +#endif // __COS_CPU_H__ diff --git 
a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh new file mode 100644 index 000000000..5db7ee8f6 --- /dev/null +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -0,0 +1,48 @@ +#ifndef __COS_CUDA_H__ +#define __COS_CUDA_H__ + +#include +#include +#include + +namespace op::cos::cuda { +typedef struct CosOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + auto cos_f32 = [] __device__(float x) { + double xd = static_cast(x); + double yd = std::cos(xd); + return static_cast(yd); + }; + + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + cos_f32(vf.x), + cos_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = cos_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(cos_f32(xf)); + } else if constexpr (std::is_same_v) { + return cos_f32(input); + } else if constexpr (std::is_same_v) { + return std::cos(input); + } else { + return std::cos(input); + } + } +} CosOp; +} // namespace op::cos::cuda + +#endif // __COS_CUDA_H__ diff --git a/src/infiniop/ops/cos/metax/cos_metax.h b/src/infiniop/ops/cos/metax/cos_metax.h new file mode 100644 index 000000000..a98fa3211 --- /dev/null +++ b/src/infiniop/ops/cos/metax/cos_metax.h @@ -0,0 +1,8 @@ +#ifndef __COS_METAX_API_H__ +#define __COS_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(cos, metax) + +#endif // __COS_METAX_API_H__ diff --git a/src/infiniop/ops/cos/metax/cos_metax.maca b/src/infiniop/ops/cos/metax/cos_metax.maca new file mode 100644 index 000000000..144db47ef --- /dev/null +++ b/src/infiniop/ops/cos/metax/cos_metax.maca @@ -0,0 +1,60 @@ +#include "cos_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cos::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..433363c91 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..f6c350dd6 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COS_CUDA_API_H__ +#define __COS_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_CUDA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..71a5f807c --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + 
return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..58a6d0f2d --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + 
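// [Editor's sketch, not part of the patch] Descriptor::calculate below only
// dispatches on the dtype validated in create(); the element loop itself lives in
// the shared CPU elementwise backend, which applies the ExpOp functor declared in
// exp_cpu.h. For a contiguous F32 tensor this conceptually reduces to:
//
//     // hypothetical, simplified view of the elementwise CPU path
//     void exp_f32(float *out, const float *in, size_t n) {
//         op::exp::cpu::ExpOp op;        // unary functor, num_inputs == 1
//         for (size_t i = 0; i < n; ++i) {
//             out[i] = op(in[i]);        // std::exp(in[i])
//         }
//     }
//
// The real backend additionally walks strides and handles reduced-precision
// (F16/BF16) element access; those details live in elementwise_cpu.h, not here.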
+infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..867c7afa5 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,21 @@ +#ifndef __EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::exp(input); + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..12446f31a --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + +#include +#include +#include + +namespace op::exp::cuda { +typedef struct ExpOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..fb10faf9b --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 +1,8 @@ +#ifndef __EXP_METAX_API_H__ +#define __EXP_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // __EXP_METAX_API_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..c71703c6d --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,60 @@ +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector 
input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..3bdf2eb45 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh 
b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..ee1dc6768 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..e137be8a0 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..d5b369bce --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..e53b94357 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..9e279c2ef --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..e8ba19fc1 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc new file mode 100644 index 000000000..c10a44cb5 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc @@ -0,0 +1,114 @@ +#include "leakyrelu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::leakyrelu::cpu { + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + + 
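// [Editor's note] The CPU backend keeps no device-handle state and needs no
// scratch memory, so the descriptor constructed below carries a zero workspace
// size and a null Opaque pointer. The operator computed later is the usual
// LeakyReLU,
//     f(x) = x                    for x >= 0
//     f(x) = negative_slope * x   for x <  0
// with the slope captured inside LeakyReLUInfo at creation time.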
*desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { return _min_workspace_size; } + +template +static inline void cpu_leakyrelu_impl_incremental( + void *output, const void *input, const op::leakyrelu::LeakyReLUInfo &info) { + + const size_t ndim = info.shape.size(); + const size_t n = info.n; + + if (n == 0) { + return; + } + + auto out_base = reinterpret_cast(output); + auto in_base = reinterpret_cast(input); + + const std::vector &shape = info.shape; + const std::vector &in_stride = info.in_stride; + const std::vector &out_stride = info.out_stride; + + std::vector idx(ndim, 0); + ptrdiff_t in_off = 0; + ptrdiff_t out_off = 0; + + for (size_t it = 0; it < n; ++it) { + const T *in_elem = in_base + in_off; + T *out_elem = out_base + out_off; + + float v = utils::cast(*in_elem); + float outv = v >= 0.0f ? v : v * info.negative_slope; + *out_elem = utils::cast(outv); + for (int d = static_cast(ndim) - 1; d >= 0; --d) { + idx[d] += 1; + if (in_stride[d] != 0) { + in_off += in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += out_stride[d]; + } + + if (idx[d] < shape[d]) { + break; + } else { + idx[d] = 0; + if (in_stride[d] != 0) { + in_off -= static_cast(shape[d]) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off -= static_cast(shape[d]) * out_stride[d]; + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_BF16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F32: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F64: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leakyrelu::cpu diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h new file mode 100644 index 000000000..e58ca1409 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h @@ -0,0 +1,7 @@ +#ifndef __LEAKYRELU_CPU_H__ +#define __LEAKYRELU_CPU_H__ +#include "../leakyrelu.h" + +DESCRIPTOR(cpu) + +#endif // __LEAKYRELU_CPU_H__ diff --git a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh new file mode 100644 index 000000000..afca17002 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh @@ -0,0 +1,73 @@ +#ifndef __LEAKYRELU_CUDA_KERNEL_CUH__ +#define __LEAKYRELU_CUDA_KERNEL_CUH__ + +#include +#include +#include +#include + +template +__device__ __forceinline__ float to_float_for_leaky(const DevT &v) { + if constexpr (std::is_same_v) { + return __half2float(v); + } else if constexpr (std::is_same_v) { + return __bfloat162float(v); + } else { + return static_cast(v); + } +} + +template +__device__ __forceinline__ DevT from_float_for_leaky(float f) { + if constexpr (std::is_same_v) { + return __float2half_rn(f); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(f); + } else { + return static_cast(f); + } +} + +template +__global__ void leakyrelu_kernel( + DevT *__restrict__ out, + const DevT *__restrict__ in, + size_t n, + float negative_slope, + const size_t *__restrict__ shape, + const size_t *__restrict__ div, + const long 
long *__restrict__ in_stride, + const long long *__restrict__ out_stride, + int ndim) { + + size_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t grid_stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear = gid; linear < n; linear += grid_stride) { + unsigned long long rem = linear; + long long in_off = 0; + long long out_off = 0; + for (int d = 0; d < ndim; ++d) { + unsigned long long idx_d = 0; + size_t divisor = div[d]; + if (divisor != 0) { + idx_d = rem / divisor; + rem = rem % divisor; + } else { + idx_d = 0; + } + if (in_stride[d] != 0) { + in_off += static_cast(idx_d) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += static_cast(idx_d) * out_stride[d]; + } + } + + float v = to_float_for_leaky(in[static_cast(in_off)]); + float outv = v >= 0.0f ? v : v * negative_slope; + out[static_cast(out_off)] = from_float_for_leaky(outv); + } +} + +#endif // __LEAKYRELU_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/leakyrelu/info.h b/src/infiniop/ops/leakyrelu/info.h new file mode 100644 index 000000000..1f074d85a --- /dev/null +++ b/src/infiniop/ops/leakyrelu/info.h @@ -0,0 +1,53 @@ +#ifndef __LEAKYRELU_INFO_H__ +#define __LEAKYRELU_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::leakyrelu { + +class LeakyReLUInfo { + LeakyReLUInfo() = default; + +public: + infiniDtype_t dt_in; + std::vector shape; + std::vector in_stride; + std::vector out_stride; + size_t n; + float negative_slope; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto dt_raw = in_desc->dtype(); + infiniDtype_t dt_in = dt_raw; + + CHECK_DTYPE(dt_in, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + for (size_t i = 0; i < out_desc->ndim(); ++i) { + CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t n = 1; + for (size_t i = 0; i < in_desc->ndim(); ++i) { + n *= static_cast(in_desc->dim(i)); + } + + return utils::Result(LeakyReLUInfo{ + dt_in, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), + n, + negative_slope}); + } +}; + +} // namespace op::leakyrelu + +#endif // __LEAKYRELU_INFO_H__ diff --git a/src/infiniop/ops/leakyrelu/leakyrelu.h b/src/infiniop/ops/leakyrelu/leakyrelu.h new file mode 100644 index 000000000..a6a01a85b --- /dev/null +++ b/src/infiniop/ops/leakyrelu/leakyrelu.h @@ -0,0 +1,49 @@ +#ifndef __LEAKYRELU_H__ +#define __LEAKYRELU_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::leakyrelu::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + LeakyReLUInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + LeakyReLUInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc, \ + float negative_slope); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void 
*output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __LEAKYRELU_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h new file mode 100644 index 000000000..15cdccc61 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_METAX_API_H__ +#define __LEAKYRELU_METAX_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(metax) + +#endif // __LEAKYRELU_METAX_API_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca new file mode 100644 index 000000000..871c3f663 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca @@ -0,0 +1,174 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../leakyrelu.h" +#include "leakyrelu_metax.h" +#include "../info.h" + +namespace op::leakyrelu::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; +#if defined(__HC_BF16_TYPES_EXIST__) || defined(__HC_ARCH__) +template <> struct MapHcType { using Type = __nv_bfloat16; }; +#endif + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevT = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + 
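// [Editor's note] The shape/divisor/stride arrays are staged with hcMemcpyAsync
// on the caller's stream, the kernel is launched on that same stream, and the
// stream is synchronized before the four temporary device buffers are freed, so
// the kernel can never read released metadata. Any failure jumps to the shared
// `cleanup:` label, which releases whatever was allocated so far (this assumes
// hcFree tolerates null pointers, as cudaFree does).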
err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::metax diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu new file mode 100644 index 000000000..9b65bc421 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu @@ -0,0 +1,215 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "../leakyrelu.h" +#include "leakyrelu_nvidia.cuh" +#include +#include +#include +#include + +namespace op::leakyrelu::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; +#if defined(__CUDA_BF16_TYPES_EXIST__) || defined(__CUDA_ARCH__) +template <> +struct MapCudaType { + using Type = __nv_bfloat16; +}; +#endif + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = 
info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevT = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) { + bs = 256; + } + } + } else { + if (bs > 256) { + bs = 256; + } + } + + if (bs <= 0) { + bs = 256; + } + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) { + grid = 1; + } + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + goto cleanup; + } + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) { + goto cleanup; + } + + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return 
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::nvidia diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh new file mode 100644 index 000000000..fb891a6c9 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_CUDA_API_H__ +#define __LEAKYRELU_CUDA_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(nvidia) + +#endif // __LEAKYRELU_CUDA_API_H__ diff --git a/src/infiniop/ops/leakyrelu/operator.cc b/src/infiniop/ops/leakyrelu/operator.cc new file mode 100644 index 000000000..3f78a4916 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/operator.cc @@ -0,0 +1,164 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leakyrelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leakyrelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leakyrelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leakyrelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyreluDescriptor( + infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + float negative_slope) { + +#define CREATE_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return op::leakyrelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + negative_slope) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CREATE_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CREATE_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CREATE_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
GET(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *workspace, size_t workspace_size, + void *y, const void *x, void *stream) { + +#define CALC_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALC_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALC_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALC_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CALC_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CALC_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CALC_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALC_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CALC_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc) { + +#define DESTROY_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + DESTROY_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + DESTROY_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + DESTROY_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DESTROY_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef DESTROY_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..ea3d5e63c --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case 
INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::cpu diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h new file mode 100644 index 000000000..32537ef17 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -0,0 +1,30 @@ +#ifndef __SIGMOID_BACKWARD_CPU_H__ +#define __SIGMOID_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu) + +namespace op::sigmoid_backward::cpu { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &x, const T &grad_out) const { + using ComputeT = std::conditional_t || std::is_same_v, + float, T>; + ComputeT xv = utils::cast(x); + ComputeT gov = utils::cast(grad_out); + + // sigmoid(x) = 1 / (1 + exp(-x)) + ComputeT s = static_cast(1) / (static_cast(1) + std::exp(-xv)); + + // grad_input = grad_output * s * (1 - s) + ComputeT gin = gov * s * (static_cast(1) - s); + + return utils::cast(gin); + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cpu + +#endif // __SIGMOID_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh new file mode 100644 index 000000000..42c850004 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -0,0 +1,62 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_H__ +#define __SIGMOID_BACKWARD_CUDA_H__ + +#include +#include +#include +#include + +namespace op::sigmoid_backward::cuda { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &x, const T &grad_out) const { + if constexpr (std::is_same_v) { + float2 xf = __half22float2(x); + float2 gf = __half22float2(grad_out); + float2 sf; + sf.x = 1.0f / (1.0f + __expf(-xf.x)); + sf.y = 1.0f / (1.0f + __expf(-xf.y)); + float2 gr; + gr.x = gf.x * sf.x * (1.0f - sf.x); + gr.y = gf.y * sf.y * (1.0f - sf.y); + return __float22half2_rn(gr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + float gf = __half2float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2half_rn(gr); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + float g0 = __bfloat162float(__low2bfloat16(grad_out)); + float g1 = __bfloat162float(__high2bfloat16(grad_out)); + float s0 = 1.0f / (1.0f + __expf(-f0)); + float s1 = 1.0f / (1.0f + __expf(-f1)); + float r0 = g0 * s0 * (1.0f - s0); + float r1 = g1 * s1 * (1.0f - s1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float gf = __bfloat162float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2bfloat16_rn(gr); + } else if constexpr (std::is_same_v) { + float s = 1.0f / (1.0f + __expf(-x)); + return grad_out * s * (1.0f - s); + } else if constexpr (std::is_same_v) { + double s = 1.0 / (1.0 + std::exp(-x)); + return grad_out * s * (1.0 - s); + } else { + auto s = static_cast(1) / (static_cast(1) + 
std::exp(-static_cast(x))); + return static_cast(static_cast(grad_out) * s * (1.0f - s)); + } + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cuda + +#endif // __SIGMOID_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..fa1708559 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_METAX_API_H__ +#define __SIGMOID_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif // __SIGMOID_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..ed99ac65d --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,62 @@ +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::metax diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..e7e604af4 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + 
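+// Computes grad_input = grad_output * sigmoid(x) * (1 - sigmoid(x)), dispatching cuda::SigmoidBackwardOp through the shared elementwise CUDA framework.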
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::nvidia diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..822f870fe --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_API_H__ +#define __SIGMOID_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif // __SIGMOID_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 index 000000000..40a279f4b --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + 
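+ // CPU reference backend; the cases below are compiled only when their ENABLE_*_API flags are set.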
CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..88ba6cdd6 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,52 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + 
size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..80e406f98 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,21 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::sin(input); + } +} SinOp; +} // namespace op::sin::cpu + +#endif // __SIN_CPU_H__ diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..30641366c --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include +#include +#include + +namespace op::sin::cuda { +typedef struct SinOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(sinf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__sinf(inputf)); + } else if constexpr (std::is_same_v) { + return sinf(input); + } else if constexpr (std::is_same_v) { + return std::sin(input); + } else { + return std::sin(input); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..5b272d4d9 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIN_METAX_API_H__ +#define __SIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // __SIN_METAX_API_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..5ea69e139 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,60 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto 
dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..6fbf952bc --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..31f5b48ef --- 
/dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..978561a04 --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..23a92ed65 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..73fd7c1b6 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,21 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::tanh(input); + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // __TANH_CPU_H__ diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..62979a20e --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef __TANH_CUDA_H__ +#define __TANH_CUDA_H__ + +#include +#include +#include + +namespace op::tanh::cuda { +typedef struct TanhOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = tanh_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = tanh_f32_func(f0); + float r1 = tanh_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = 
tanh_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return tanh_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::tanh(input); + } else { + return std::tanh(input); + } + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // __TANH_CUDA_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..8432a7f0d --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANH_METAX_API_H__ +#define __TANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // __TANH_METAX_API_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..0a01554c4 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,60 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..a2c36551c --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = 
input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..cb37b2528 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..d34d97df6 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..de7e86e3e --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,84 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return 
_device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..3d86cb4f7 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,19 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? a : b; + } +} WhereOp; +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..8eb5c762b --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? 
a : b; + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..43bb1a945 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,8 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..b648cfbcc --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,92 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + 
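+ // Integer and boolean element types reuse the same WhereOp; the template argument only selects the element width.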
return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::metax diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..860089bd2 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,91 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "where_nvidia.cuh" + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return 
_device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..c168364a8 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..d69b1d4e1 --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
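/* forwards the (c, a, b, condition) pointers to the per-device Descriptor::calculate selected when the descriptor was created */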
infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..87b572741 --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map +) +import itertools + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +_TOLERANCE_MAP = { + ("float", "float"): {"atol": 1e-3, "rtol": 1e-3}, + ("int", "float"): {"atol": 1.0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + + +def _is_float_dtype(inf_dt): + return inf_dt in _FLOAT_DTYPES + + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def reference_cast_torch(output_tensor, input_tensor): + converted = input_tensor.to(dtype=output_tensor.dtype, device=output_tensor.device).clone() + output_tensor.copy_(converted) + + +def make_integer_torch_tensor(shape, inf_dt, device): + 
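# Integer inputs are drawn with torch.randint; unsigned InfiniOp dtypes go through to_torch_dtype(..., compatability_mode=True), presumably because native uint32/uint64 tensors are not available in every torch build. +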
use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + + +def test( + handle, + device, + shape, + in_stride, + out_stride, + dtype_pair, + sync=None, +): + in_dt, out_dt = dtype_pair + + if not is_supported_dt(in_dt) or not is_supported_dt(out_dt): + print(f"Skipping test for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} because dtype not supported on this platform") + return + + try: + if _is_integer_dtype(in_dt): + in_torch = make_integer_torch_tensor(shape, in_dt, device) + input = TestTensor.from_torch(in_torch, in_dt, device) + else: + input = TestTensor(shape, in_stride, in_dt, device, mode="random") + + output = TestTensor(shape, out_stride, out_dt, device, mode="zeros") + + if output.is_broadcast(): + return + + print(f"Testing Cast on {InfiniDeviceNames[device]} shape={shape} in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} in_stride={in_stride} out_stride={out_stride}") + + reference_cast_torch(output.actual_tensor(), input.torch_tensor()) + + expected = output.actual_tensor().clone() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetCastWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cast() + + actual = output.actual_tensor() + + if _is_integer_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("int", "float")] + atol, rtol = tol["atol"], tol["rtol"] + elif _is_float_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("float", "float")] + atol, rtol = tol["atol"], tol["rtol"] + else: + atol, rtol = 0, 0 + + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + assert torch.allclose(actual, expected, atol=atol, rtol=rtol), \ + f"Mismatch for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} shape={shape}" + + if PROFILE: + profile_operation("PyTorch", lambda: reference_cast_torch(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + except RuntimeError as e: + if "not implemented for 'UInt32'" in str(e) or "not implemented for 'UInt64'" in str(e): + #print(f"Skipping unsupported operation: {e}") + return False + else: + raise + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = 
args.num_iterations + + integer_pairs = itertools.product(_INTEGER_DTYPES, _INTEGER_DTYPES) + float_pairs = itertools.product(_FLOAT_DTYPES, _FLOAT_DTYPES) + int_to_float_pairs = itertools.product(_INTEGER_DTYPES, _FLOAT_DTYPES) + + all_pairs = list(set(itertools.chain(integer_pairs, float_pairs, int_to_float_pairs))) + + supported_pairs = [] + skipped_pairs = [] + for pair in all_pairs: + in_dt, out_dt = pair + if is_supported_dt(in_dt) and is_supported_dt(out_dt): + supported_pairs.append(pair) + else: + skipped_pairs.append(pair) + + print(f"Supported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in supported_pairs]}") + if skipped_pairs: + print(f"Warning: skipping unsupported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in skipped_pairs]}") + + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported_pairs) + + print("\033[92mAll cast tests passed!\033[0m") + + +if __name__ == "__main__": + main() diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..d1d94db3a --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(output, input): + output.copy_(torch.cos(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cos(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = 
infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + 
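# the default is only a placeholder; the harness passes InfiniDtype values from _TENSOR_DTYPES +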
sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + 
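# every shape/stride case is run once per Inplace mode +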
test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/leakyrelu.py b/test/infiniop/leakyrelu.py new file mode 100644 index 000000000..76562ddf0 --- /dev/null +++ b/test/infiniop/leakyrelu.py @@ -0,0 +1,168 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leakyrelu(output, input, negative_slope): + output.copy_(torch.where(input >= 0, input, input * negative_slope)) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + negative_slope = 0.1 + print( + f"Testing Leakyrelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} negative_slope:{negative_slope} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLeakyreluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + negative_slope + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyreluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_leakyrelu(): + check_error( + LIBINFINIOP.infiniopLeakyrelu( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None + ) + ) + + lib_leakyrelu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_leakyrelu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLeakyreluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure 
testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..86cee0424 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,290 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def leakyrelu_(lib): + lib.infiniopCreateLeakyreluDescriptor.restype = c_int32 + lib.infiniopCreateLeakyreluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + + lib.infiniopGetLeakyreluWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyreluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLeakyrelu.restype = c_int32 + lib.infiniopLeakyrelu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeakyreluDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyreluDescriptor.argtypes = [ + 
infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + 
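# ctypes signatures mirror the C API in src/infiniop/ops/where/operator.cc: infiniopWhere passes workspace, c, a, b, condition, and the stream as raw pointers +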
lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..813791aa8 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,184 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, grad_output_stride, grad_input_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + INPLACE_GRAD_OUTPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, + Inplace.INPLACE_GRAD_OUTPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward(grad_input, input_tensor, grad_output): + sigmoid_input = torch.sigmoid(input_tensor) + grad_input.copy_(grad_output * sigmoid_input * (1 - sigmoid_input)) + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input_tensor = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, grad_output_stride, dtype, device) + + if inplace == Inplace.INPLACE_INPUT: + if input_stride != grad_input_stride: + return + grad_input = input_tensor + elif inplace == Inplace.INPLACE_GRAD_OUTPUT: + if grad_input_stride != grad_output_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + 
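# grad_input views flagged as broadcast are skipped here, as in the other elementwise tests +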
return + + print( + f"Testing SigmoidBackward on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input_tensor.descriptor, + grad_output.descriptor, + ) + ) + + for tensor in [input_tensor, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input_tensor.data(), + grad_output.data(), + None, + ) + ) + + lib_sigmoid_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()), + device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..613257e9c --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + 
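# Sin is checked in F16, F32, and BF16; the tolerances below match the other unary elementwise tests. +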
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + output.copy_(torch.sin(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..dc6ec46e8 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, 
output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + output.copy_(torch.tanh(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..c940d4f05 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,288 @@ 
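+# Tests infiniopWhere against torch.where across float and integer dtypes, strided layouts, and in-place variants.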
+import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map, +) +from enum import Enum, auto + +# ====================================================================== +# Configuration (Internal Use Only) +# Now each test case tuple is: (shape, a_stride, b_stride, cond_stride, c_stride) +# ====================================================================== +_TEST_CASES_ = [ + ((13, 4), None, None, None, None), + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_COND = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.INPLACE_COND, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TENSOR_DTYPES = _INTEGER_DTYPES + _FLOAT_DTYPES + +_TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + +def where_ref(c, a, b, cond): + cond_bool = cond.torch_tensor().to(torch.bool) + c.torch_tensor().copy_(torch.where(cond_bool, 
a.torch_tensor(), b.torch_tensor())) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + cond_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + inf_dt = dtype + + if not is_supported_dt(inf_dt): + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on this platform") + return + + try: + if _is_integer_dtype(inf_dt): + a_torch = make_integer_torch_tensor(shape, inf_dt, device) + b_torch = make_integer_torch_tensor(shape, inf_dt, device) + a = TestTensor.from_torch(a_torch, inf_dt, device) + b = TestTensor.from_torch(b_torch, inf_dt, device) + else: + a = TestTensor(shape, a_stride, inf_dt, device, mode="random") + b = TestTensor(shape, b_stride, inf_dt, device, mode="random") + except RuntimeError as e: + msg = str(e) + if "not implemented for 'UInt32'" in msg or "not implemented for 'UInt64'" in msg or "check_uniform_bounds" in msg: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} because platform torch can't build random tensor: {e}") + return + else: + raise + + dev = torch_device_map[device] + if _is_integer_dtype(inf_dt): + cond_torch = torch.randint(0, 2, size=shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + else: + cond_bool = (torch.rand(shape, device=dev) > 0.5) + cond_torch = cond_bool.to(dtype=to_torch_dtype(inf_dt, compatability_mode=False)) + + cond = TestTensor.from_torch(cond_torch, inf_dt, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + elif inplace == Inplace.INPLACE_COND: + if c_stride != cond_stride: + return + c = cond + else: + if _is_integer_dtype(inf_dt): + dev = torch_device_map[device] + c_torch = torch.zeros(shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + c = TestTensor.from_torch(c_torch, inf_dt, device) + else: + c = TestTensor(shape, c_stride, inf_dt, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} " + f"shape:{shape} a_stride:{a_stride} b_stride:{b_stride} cond_stride:{cond_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[inf_dt]} inplace:{inplace}" + ) + + where_ref(c, a, b, cond) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + try: + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + cond.descriptor, + ) + ) + except Exception as e: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on {InfiniDeviceNames[device]}: CreateWhereDescriptor failed: {e}") + return + + for tensor in [a, b, c, cond]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + cond.data(), + None, + ) + ) + + lib_where() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, inf_dt) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: where_ref(c, a, b, cond), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: 
lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + supported = [dt for dt in _TENSOR_DTYPES if is_supported_dt(dt)] + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported) + + print("\033[92mTest passed!\033[0m") + + +if __name__ == "__main__": + main()
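+# As with the other operator tests, devices and options come from get_args()/get_test_devices(), and the library result must match the torch.where reference within the per-dtype tolerances above.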