diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..f7ad9e02c 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,7 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/reduce_mean.h" +#include "infiniop/ops/reduce_max.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/reduce_max.h b/include/infiniop/ops/reduce_max.h new file mode 100644 index 000000000..42a3dd62d --- /dev/null +++ b/include/infiniop/ops/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MAX_API_H__ +#define __INFINIOP_REDUCE_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reduce_mean.h b/include/infiniop/ops/reduce_mean.h new file mode 100644 index 000000000..5efd8b227 --- /dev/null +++ b/include/infiniop/ops/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MEAN_API_H__ +#define __INFINIOP_REDUCE_MEAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..89df98708 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,8 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "reduce_mean.py", + "reduce_max.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..aee602aac 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,8 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(reduce_mean) +DECLARE_INFINIOP_TEST(reduce_max) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +45,8 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(reduce_mean) \ + REGISTER_INFINIOP_TEST(reduce_max) \ } namespace infiniop_test { @@ -64,4 +68,4 @@ bool check_names( } // 
namespace infiniop_test -#endif +#endif \ No newline at end of file diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index e2dd45f9f..277061029 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector<std::shared_ptr<Result>> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol); + double rtol, double atol, bool equal_nan = false); // Run a single test read from a GGUF file std::shared_ptr<Result> runTest( @@ -55,10 +55,11 @@ infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id); + size_t test_id, + bool equal_nan = false); // Check if two tensors are close within given tolerance -void allClose(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected, double rtol = 1e-3, double atol = 1e-3); +void allClose(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); // Check if two tensors are equal void allEqual(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected); @@ -85,13 +86,14 @@ class Test { namespace infiniop_test::name { \ class Test : public infiniop_test::base::Test { \ double _rtol, _atol; \ + bool _equal_nan; \ \ public: \ static std::string op_name() { return #name; } \ static std::shared_ptr<base::Test> build( \ std::unordered_map<std::string, std::vector<uint8_t>> attributes, \ std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, \ - double, double); \ + double, double, bool); \ \ static std::vector<std::string> attribute_names(); \ static std::vector<std::string> tensor_names(); \ @@ -109,7 +111,8 @@ class Test { struct Attributes; \ Attributes *_attributes; \ Test() = delete; \ - Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \ + Test(double rtol, double atol, bool equal_nan = false) \ + : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ }; \ } @@ -117,7 +120,7 @@ namespace infiniop_test { using BuilderFunc = std::function<std::shared_ptr<base::Test>( std::unordered_map<std::string, std::vector<uint8_t>>, std::unordered_map<std::string, std::shared_ptr<Tensor>>, - double, double)>; + double, double, bool)>; // Testcase Registry // Each testcase should provide a formatted builder, attribute names, and tensor names diff --git a/src/infiniop-test/src/gguf.cpp b/src/infiniop-test/src/gguf.cpp index a4b200033..aee5b39a8 100644 --- a/src/infiniop-test/src/gguf.cpp +++ b/src/infiniop-test/src/gguf.cpp @@ -53,7 +53,9 @@ GGUFFileReader::GGUFFileReader(const std::string &filepath) { try { _file = std::make_shared<FileMapping>(filepath); } catch (const std::exception &e) { - throw e; + // Log which file failed, then rethrow the original exception: swallowing it + // here would let the constructor continue with no mapping and crash below. + std::cerr << "Error: " << e.what() << std::endl; + throw; } _data = _file->ptr(); _cursor = reinterpret_cast<const uint8_t *>(_data); diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 4863c8172..6805bd7f8 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -1,8 +1,8 @@ #include "gguf.hpp" #include "test.hpp" +#include <cstring> #include <iostream> #include <string> - struct ParsedArgs { std::string file_path; // Mandatory argument: test.gguf file path infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU @@ -11,12 +11,13 @@ struct ParsedArgs { int iterations = 0; // Default to 0 if not given double atol = 0.001; // Default absolute tolerance double rtol = 0.001; // Default relative tolerance + bool equal_nan = false; // Default to false: NaNs compare as unequal }; void printUsage() { std::cout << "Usage:" << std::endl << std::endl; - std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <warmups>] [--run <iterations>] [--atol <atol>] [--rtol <rtol>]" << std::endl + std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <warmups>] [--run <iterations>] [--atol <atol>] [--rtol <rtol>] [--equal-nan <bool>]" << std::endl << std::endl; std::cout << " <test_file>" << std::endl; std::cout << " Path to the test gguf file" << std::endl @@ -36,6 +37,9 @@ void printUsage() { std::cout << " --rtol <rtol>" << std::endl; std::cout << " (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl << std::endl; + std::cout << " --equal-nan <bool>" << std::endl; + std::cout << " (Optional) If true, two NaNs will be considered equal. Default to false" << std::endl + << std::endl; exit(-1); } @@ -91,6 +95,9 @@ ParsedArgs parseArgs(int argc, char *argv[]) { else if (arg == "--rtol" && i + 1 < argc) { args.rtol = std::stod(argv[++i]); } + else if (arg == "--equal-nan" && i + 1 < argc) { + args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0); + } else { printUsage(); } @@ -119,7 +128,7 @@ int main(int argc, char *argv[]) { reader, (infiniDevice_t)args.device_type, args.device_id, args.warmups, args.iterations, - args.rtol, args.atol); + args.rtol, args.atol, args.equal_nan); std::cout << "=====================================" << std::endl; for (auto result : results) {
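Usage note: the new flag rides along with the existing tolerance options; with --equal-nan true, positions where both the actual and the expected tensor hold NaN count as matches. A hypothetical invocation (the gguf file name and device flag are illustrative, not part of this patch):

    infiniop-test reduce_max.gguf --nvidia:0 --equal-nan true --atol 0.001 --rtol 0.001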
diff --git a/src/infiniop-test/src/ops/add.cpp b/src/infiniop-test/src/ops/add.cpp index 27f69d687..e90290d55 100644 --- a/src/infiniop-test/src/ops/add.cpp +++ b/src/infiniop-test/src/ops/add.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/causal_softmax.cpp b/src/infiniop-test/src/ops/causal_softmax.cpp index 29612960a..97c65ef8c 100644 --- a/src/infiniop-test/src/ops/causal_softmax.cpp +++ b/src/infiniop-test/src/ops/causal_softmax.cpp @@ -14,8 +14,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("y") == tensors.end() @@ -53,7 +53,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -92,7 +92,7 @@ std::string Test::toString() const { oss << "- y: " << _attributes->y->info() << std::endl; oss << "- ans: " << _attributes->ans->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/clip.cpp b/src/infiniop-test/src/ops/clip.cpp index 82a0e9b10..a01c18a4d 100644 --- a/src/infiniop-test/src/ops/clip.cpp +++ b/src/infiniop-test/src/ops/clip.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("min_val") == tensors.end() @@ -64,7 +64,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -109,7 +109,7 @@ std::string Test::toString() const { oss << "- max_val: " << _attributes->max_val->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/gemm.cpp b/src/infiniop-test/src/ops/gemm.cpp index 37c8ed6fe..664288d73 100644 --- a/src/infiniop-test/src/ops/gemm.cpp +++ b/src/infiniop-test/src/ops/gemm.cpp @@ -18,8 +18,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); } @@ -65,7 +65,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -120,7 +120,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/mul.cpp b/src/infiniop-test/src/ops/mul.cpp index 8ebfc426b..cb0b639bf 100644 --- a/src/infiniop-test/src/ops/mul.cpp +++ b/src/infiniop-test/src/ops/mul.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/random_sample.cpp b/src/infiniop-test/src/ops/random_sample.cpp index a11e0f446..75ee07b44 100644 --- a/src/infiniop-test/src/ops/random_sample.cpp +++ b/src/infiniop-test/src/ops/random_sample.cpp @@ -20,8 +20,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); } @@ -70,7 +70,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(result, _attributes->ans, _rtol, _atol); + allClose(result, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- data: " << _attributes->data->info() << std::endl; oss << "- result: " << _attributes->result->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rearrange.cpp b/src/infiniop-test/src/ops/rearrange.cpp index 9fbf6f2cb..bdf162ce2 100644 --- a/src/infiniop-test/src/ops/rearrange.cpp +++ b/src/infiniop-test/src/ops/rearrange.cpp @@ -12,9 +12,9 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); } diff --git a/src/infiniop-test/src/ops/reduce_max.cpp b/src/infiniop-test/src/ops/reduce_max.cpp new file mode 100644 index 000000000..37045ec78 --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_max.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::reduce_max { +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; + size_t dim; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("output") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast<size_t *>(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + + return test; +} + +std::shared_ptr<infiniop_test::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopReduceMaxDescriptor_t op_desc; + CHECK_OR(infiniopCreateReduceMaxDescriptor(handle, &op_desc, + _attributes->output->desc(), + _attributes->input->desc(), + _attributes->dim), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMax descriptor")); + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + size_t workspace_size; + CHECK_OR(infiniopGetReduceMaxWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopReduceMax(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMax execution failed")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReduceMax(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { + return {"dim"}; +} + +std::vector<std::string> Test::tensor_names() { + return {"input", "ans", "output"}; +} + +std::vector<std::string> Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- dim=" << _attributes->dim << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::reduce_max diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp new file mode 100644 index 000000000..9de22d49d --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_mean.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::reduce_mean { +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; + size_t dim; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("output") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast<size_t *>(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + + return test; +} + +std::shared_ptr<infiniop_test::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopReduceMeanDescriptor_t op_desc; + CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc, + _attributes->output->desc(), + _attributes->input->desc(), + _attributes->dim), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor")); + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + size_t workspace_size; + CHECK_OR(infiniopGetReduceMeanWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopReduceMean(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReduceMean(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { + return {"dim"}; +} + +std::vector<std::string> Test::tensor_names() { + return {"input", "ans", "output"}; +} + +std::vector<std::string> Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- dim=" << _attributes->dim << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::reduce_mean diff --git a/src/infiniop-test/src/ops/rms_norm.cpp b/src/infiniop-test/src/ops/rms_norm.cpp index 8359a4536..786ce8470 100644 --- a/src/infiniop-test/src/ops/rms_norm.cpp +++ b/src/infiniop-test/src/ops/rms_norm.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (attributes.find("epsilon") == attributes.end() @@ -72,7 +72,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- w: " << _attributes->w->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rope.cpp b/src/infiniop-test/src/ops/rope.cpp index 636f565af..94010a122 100644 --- a/src/infiniop-test/src/ops/rope.cpp +++ b/src/infiniop-test/src/ops/rope.cpp @@ -17,8 +17,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("y") == tensors.end() @@ -77,7 +77,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -121,7 +121,7 @@ std::string Test::toString() const { oss << "- sin_table: " << _attributes->sin_table->info() << std::endl; oss << "- cos_table: " << _attributes->cos_table->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/sub.cpp b/src/infiniop-test/src/ops/sub.cpp index 6bb1fd1eb..bb3adc350 100644 --- a/src/infiniop-test/src/ops/sub.cpp +++ b/src/infiniop-test/src/ops/sub.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/swiglu.cpp b/src/infiniop-test/src/ops/swiglu.cpp index 96b75efc5..f86dfadc6 100644 --- a/src/infiniop-test/src/ops/swiglu.cpp +++ b/src/infiniop-test/src/ops/swiglu.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() @@ -54,7 +54,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr), return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -93,7 +93,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); }
diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp index e312ac5f5..0cbfe067a 100644 --- a/src/infiniop-test/src/test.cpp +++ b/src/infiniop-test/src/test.cpp @@ -49,7 +49,7 @@ std::string Result::toString() const { std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto count_meta = meta.find("test_count"); if (count_meta == meta.end()) { @@ -60,7 +60,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read auto results = std::vector<std::shared_ptr<Result>>(count); try { for (size_t i = 0; i < count; i++) { - results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i); + results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i, equal_nan); } } catch (const std::exception &e) { std::cerr << "Error: " << e.what() << std::endl; @@ -72,7 +72,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol, size_t test_id) { + double rtol, double atol, size_t test_id, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto tensor_info = gguf_reader.getTensorInfoMap(); auto name_meta = meta.find("test." + std::to_string(test_id) + ".op_name"); @@ -107,7 +107,7 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader, } std::shared_ptr<base::Test> test; try { - test = builder.build(attrs, tensors, rtol, atol); + test = builder.build(attrs, tensors, rtol, atol, equal_nan); } catch (const std::exception &e) { return TEST_INIT_FAILED(op_name + "\n" + e.what()); } @@ -141,7 +141,7 @@ void incrementOffset(ptrdiff_t &offset_1, const std::vector<ptrdiff_t> &strides_ } } -void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol) { +void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol, bool equal_nan) { auto actual = actual_->to(INFINI_DEVICE_CPU); auto expected = expected_->to(INFINI_DEVICE_CPU); auto shape = actual->shape(); @@ -158,12 +158,22 @@ for (size_t i = 0; i < total; i++) { double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type()); double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type()); - if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) { - if (num_failed == 0) { - first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + "."; + if (std::isnan(a_) || std::isnan(e_)) { + // NaNs never satisfy the tolerance check; with equal_nan they pass + // only when actual and expected are both NaN. Record the message + // before incrementing, otherwise it is never set for NaN failures. + if (!equal_nan || (std::isnan(a_) != std::isnan(e_))) { + if (num_failed == 0) { + first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + "."; + } + num_failed++; + } + } else if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) { + if (num_failed == 0) { + first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + "."; + } + num_failed++; } - num_failed++; } + incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()), expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()), counter, shape);
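For reference, the pass/fail rule implemented in allClose above reduces to the scalar predicate below. This standalone sketch restates it outside the tensor iteration; the helper name is illustrative, not part of the patch:

    #include <cassert>
    #include <cmath>
    #include <limits>

    // A pair passes if it is within atol OR within rtol of the larger magnitude;
    // NaNs pass only when equal_nan is set and both sides are NaN.
    static bool pairClose(double a, double e, double rtol, double atol, bool equal_nan) {
        if (std::isnan(a) || std::isnan(e)) {
            return equal_nan && std::isnan(a) && std::isnan(e);
        }
        double diff = std::fabs(a - e);
        return diff <= atol || diff <= rtol * std::fmax(std::fabs(a), std::fabs(e));
    }

    int main() {
        double nan = std::numeric_limits<double>::quiet_NaN();
        assert(pairClose(1.0, 1.0005, 1e-3, 1e-3, false)); // within tolerance
        assert(!pairClose(nan, nan, 1e-3, 1e-3, false));   // NaN != NaN by default
        assert(pairClose(nan, nan, 1e-3, 1e-3, true));     // --equal-nan true
        assert(!pairClose(1.0, nan, 1e-3, 1e-3, true));    // one-sided NaN still fails
    }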
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc new file mode 100644 index 000000000..55a340226 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_max_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_max::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <typename T> +infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) { + const size_t cols = info->shape[3]; // reduced (innermost) dimension + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + // MSVC only ships OpenMP 2.0: loop indices must be signed and collapse() + // is unavailable, so only the outer loop is parallelized here. + const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]); + const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]); + const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(max); + } else { + *output_ptr = max; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(max); + } else { + *output_ptr = max; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_max(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_max(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_max(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_max::cpu diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h new file mode 100644 index 000000000..bf6be9b74 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MAX_CPU_H__ +#define __REDUCE_MAX_CPU_H__ +#include "../reduce_max.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h new file mode 100644 index 000000000..99bb5be19 --- /dev/null +++ b/src/infiniop/ops/reduce_max/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MAX_INFO_H__ +#define __REDUCE_MAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include <vector> + +namespace op::reduce_max { + +class ReduceMaxInfo { + ReduceMaxInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector<size_t> shape; + std::vector<ptrdiff_t> y_strides; + std::vector<ptrdiff_t> x_strides; + + static utils::Result<ReduceMaxInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector<size_t> shape = {1, 1, 1, 1}; + std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0}; + std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0}; + return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{ dtype, shape, y_strides, x_strides}); + } else { + std::vector<size_t> shape = x_desc->shape(); + std::vector<ptrdiff_t> y_strides = y_desc->strides(); + std::vector<ptrdiff_t> x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{ dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_max + +#endif // __REDUCE_MAX_INFO_H__
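The create() above normalizes every tensor to a 4-D view whose last axis is the reduced one: the reduced dimension is swapped to the back, then leading size-1 axes (with stride 0) are prepended. A standalone sketch of that normalization on a sample shape, with illustrative values (not taken from the patch):

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // A contiguous 3-D tensor [2, 3, 4], reduced over dim = 1.
        std::vector<size_t> shape = {2, 3, 4};
        std::vector<ptrdiff_t> strides = {12, 4, 1};
        size_t dim = 1;

        if (dim != shape.size() - 1) { // move the reduced axis to the back
            std::swap(shape[dim], shape[shape.size() - 1]);
            std::swap(strides[dim], strides[shape.size() - 1]);
        }
        while (shape.size() < 4) { // pad to 4-D with dummy leading axes
            shape.insert(shape.begin(), 1);
            strides.insert(strides.begin(), 0);
        }
        for (size_t s : shape) std::cout << s << ' ';   // prints: 1 2 4 3
        std::cout << '\n';
        for (ptrdiff_t s : strides) std::cout << s << ' '; // prints: 0 12 1 4
        std::cout << '\n';
    }

The kernels then only ever see a [batch, channels, rows, cols] problem where cols is the reduction axis, which is why both CPU loops and the GPU grid below are written for exactly four dimensions.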
diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h new file mode 100644 index 000000000..735bc8da4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_METAX_H__ +#define __REDUCE_MAX_METAX_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca new file mode 100644 index 000000000..ab120fea5 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_max_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +INFINIOP_METAX_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::metax { + +struct Descriptor::Opaque { + std::shared_ptr<device::metax::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax<BLOCK_SIZE, half, float><<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax<BLOCK_SIZE, __hpcc_bfloat16, float><<<grid, BLOCK_SIZE, 0, stream>>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax<BLOCK_SIZE, float, float><<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::metax diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh new file mode 100644 index 000000000..fec2f9341 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh @@ -0,0 +1,21 @@ +#ifndef __REDUCE_MAX_KERNEL_CUH__ +#define __REDUCE_MAX_KERNEL_CUH__ + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +__device__ void ReduceMaxKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + // One block per (batch, channel, row): blockIdx selects the row to reduce. + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Block-reduce the max of the row; thread 0 writes the result + Tcompute max_0 = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata, Tcompute>(x, width, x_stride_w); + if (threadIdx.x == 0) { + *y = max_0; + } +} + +#endif // __REDUCE_MAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu new file mode 100644 index 000000000..f64f596f4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_max_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +INFINIOP_CUDA_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr<device::nvidia::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax<BLOCK_SIZE, half, float><<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax<BLOCK_SIZE, __nv_bfloat16, float><<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax<BLOCK_SIZE, float, float><<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::nvidia diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh new file mode 100644 index 000000000..388738c27 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_NVIDIA_H__ +#define __REDUCE_MAX_NVIDIA_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(nvidia) + +#endif
diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc new file mode 100644 index 000000000..fc8a976b6 --- /dev/null +++ b/src/infiniop/ops/reduce_max/operator.cc @@ -0,0 +1,181 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_max_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_max_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ case CASE: \ return op::reduce_max::NAMESPACE::Descriptor::create( \ handle, \ reinterpret_cast<op::reduce_max::NAMESPACE::Descriptor **>(desc_ptr), \ output_desc, \ input_desc, \ dim); + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast<op::reduce_max::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast<const op::reduce_max::NAMESPACE::Descriptor *>(desc)->calculate( \ workspace, workspace_size, output, input, stream); + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast<op::reduce_max::NAMESPACE::Descriptor *>(desc); \ return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h new file mode 100644 index 000000000..6ef8630f2 --- /dev/null +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ \ namespace op::reduce_max::NAMESPACE { \ class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ ReduceMaxInfo _info; \ size_t _workspace_size; \ \ Descriptor( \ Opaque *opaque, \ ReduceMaxInfo info, \ size_t workspace_size, \ infiniDevice_t device_type, \ int device_id) \ : InfiniopDescriptor{device_type, device_id}, \ _opaque(opaque), \ _info(info), \ _workspace_size(workspace_size) {} \ \ public: \ ~Descriptor(); \ \ size_t workspaceSize() const { return _workspace_size; } \ \ static infiniStatus_t create( \ infiniopHandle_t handle, \ Descriptor **desc_ptr, \ infiniopTensorDescriptor_t output_desc, \ infiniopTensorDescriptor_t input_desc, \ size_t dim); \ \ infiniStatus_t calculate( \ void *workspace, size_t workspace_size, \ void *output, \ const void *input, \ void *stream) const; \ }; \ }

#endif // REDUCE_MAX_H \ No newline at end of file
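Taken together, the four exported entry points follow the library's usual create/query/run/destroy flow. A minimal host-side sketch using only the functions declared in this patch; the handle, tensor descriptors, device buffers, and include paths are assumed to be set up elsewhere, and error handling is abbreviated:

    #include <infiniop.h>
    #include <infinirt.h>

    infiniStatus_t runReduceMax(infiniopHandle_t handle,
                                infiniopTensorDescriptor_t out_desc, void *out,
                                infiniopTensorDescriptor_t in_desc, const void *in,
                                size_t dim, void *stream) {
        infiniopReduceMaxDescriptor_t desc;
        infiniStatus_t status = infiniopCreateReduceMaxDescriptor(handle, &desc, out_desc, in_desc, dim);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        size_t workspace_size = 0;
        infiniopGetReduceMaxWorkspaceSize(desc, &workspace_size);

        void *workspace = nullptr;
        if (workspace_size > 0) {
            infinirtMalloc(&workspace, workspace_size); // workspace is 0 for all current backends
        }

        status = infiniopReduceMax(desc, workspace, workspace_size, out, in, stream);

        if (workspace != nullptr) {
            infinirtFree(workspace);
        }
        infiniopDestroyReduceMaxDescriptor(desc);
        return status;
    }

The ReduceMean API below is symmetric: only the operator name in each call changes.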
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc new file mode 100644 index 000000000..7853afdbb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_mean_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_mean::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <typename T> +infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) { + const size_t cols = info->shape[3]; // reduced (innermost) dimension + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + // MSVC only ships OpenMP 2.0: loop indices must be signed and collapse() + // is unavailable, so only the outer loop is parallelized here. + const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]); + const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]); + const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(mean); + } else { + *output_ptr = mean; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(mean); + } else { + *output_ptr = mean; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_mean(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_mean(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_mean(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_mean::cpu diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h new file
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h new file mode 100644 index 000000000..f67601c62 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MEAN_CPU_H__ +#define __REDUCE_MEAN_CPU_H__ +#include "../reduce_mean.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h new file mode 100644 index 000000000..6c11e07d3 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MEAN_INFO_H__ +#define __REDUCE_MEAN_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include <vector> + +namespace op::reduce_mean { + +class ReduceMeanInfo { + ReduceMeanInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector<size_t> shape; + std::vector<ptrdiff_t> y_strides; + std::vector<ptrdiff_t> x_strides; + + static utils::Result<ReduceMeanInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector<size_t> shape = {1, 1, 1, 1}; + std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0}; + std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0}; + return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } else { + std::vector<size_t> shape = x_desc->shape(); + std::vector<ptrdiff_t> y_strides = y_desc->strides(); + std::vector<ptrdiff_t> x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_mean + +#endif // __REDUCE_MEAN_INFO_H__
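To make the canonicalization in `ReduceMeanInfo::create` concrete: the reduced dimension is swapped to the last position, then shape and strides are left-padded to rank 4, so every backend only ever reduces the innermost canonical axis. A small standalone sketch of the same transform (hypothetical helper mirroring the logic above):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Mirrors ReduceMeanInfo::create: reduced dim moved last, rank padded to 4.
    static void canonicalize(std::vector<std::size_t> &shape,
                             std::vector<std::ptrdiff_t> &strides,
                             std::size_t dim) {
        if (dim != shape.size() - 1) {
            std::swap(shape[dim], shape[shape.size() - 1]);
            std::swap(strides[dim], strides[shape.size() - 1]);
        }
        while (shape.size() < 4) { // pad leading size-1 dims with stride 0
            shape.insert(shape.begin(), 1);
            strides.insert(strides.begin(), 0);
        }
    }

    // Example: shape (2048, 16), strides (16, 1), dim = 0 becomes
    // shape (1, 1, 16, 2048), strides (0, 0, 1, 16): the kernel then
    // reduces 2048 elements at stride 16 for each of the 16 output rows.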
diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h new file mode 100644 index 000000000..a105724d6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_METAX_H__ +#define __REDUCE_MEAN_METAX_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca new file mode 100644 index 000000000..0eaf1c9cb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_mean_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata> +INFINIOP_METAX_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel<BLOCK_SIZE, Tdata, float>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::metax { + +struct Descriptor::Opaque { + std::shared_ptr<device::metax::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean<BLOCK_SIZE, half> + <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean<BLOCK_SIZE, __hpcc_bfloat16> + <<<grid, BLOCK_SIZE, 0, stream>>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean<BLOCK_SIZE, float> + <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::metax
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh new file mode 100644 index 000000000..e70748605 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __REDUCE_MEAN_KERNEL_CUH__ +#define __REDUCE_MEAN_KERNEL_CUH__ + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +__device__ void ReduceMeanKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] block-wide sum of this row's `width` elements (strided) + Tcompute sum_0 = op::common_cuda::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(x, width, x_stride_w); + // cub::BlockReduce returns the valid aggregate only on thread 0, + // so a single thread writes the mean for this (batch, channel, row). + if (threadIdx.x == 0) { + *y = sum_0 / width; + } +} + +#endif // __REDUCE_MEAN_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu new file mode 100644 index 000000000..bfc26e23d --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_mean_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata> +INFINIOP_CUDA_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel<BLOCK_SIZE, Tdata, float>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr<device::nvidia::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean<BLOCK_SIZE, half> + <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean<BLOCK_SIZE, __nv_bfloat16> + <<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean<BLOCK_SIZE, float> + <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +
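The `calculate` implementations below select a compile-time `BLOCK_SIZE` from the device's runtime thread limit, because `cub::BlockReduce` needs the block size as a template parameter. Stripped to its essence, the dispatch pattern looks like this (hypothetical names, two block sizes only):

    #include <cstddef>

    constexpr unsigned int BLOCK_512 = 512;
    constexpr unsigned int BLOCK_1024 = 1024;

    template <unsigned int BLOCK_SIZE>
    int launch() {
        // A real backend would launch
        // Kernel<BLOCK_SIZE><<<grid, BLOCK_SIZE, 0, stream>>>(...) here.
        return static_cast<int>(BLOCK_SIZE);
    }

    // The runtime device limit picks the compile-time specialization.
    int dispatch(std::size_t max_threads_per_block) {
        if (max_threads_per_block == BLOCK_1024) {
            return launch<BLOCK_1024>();
        }
        return launch<BLOCK_512>();
    }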
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::nvidia diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh new file mode 100644 index 000000000..be16b4491 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_NVIDIA_H__ +#define __REDUCE_MEAN_NVIDIA_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc new file mode 100644 index 000000000..3696f95d2 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -0,0 +1,182 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_mean.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_mean_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_mean_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_mean_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_mean_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_mean::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + x_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMeanDescriptor((BangHandle_t)handle, (ReduceMeanBangDescriptor_t *)desc_ptr, y_desc); + // // return cnnlCreateReduceMeanDescriptor((BangHandle_t) handle, (ReduceMeanCnnlDescriptor_t *) desc_ptr, y_desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMeanDescriptor((MusaHandle_t)handle, (ReduceMeanMusaDescriptor_t *)desc_ptr, y_desc); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \
+ *size = reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMeanWorkspaceSize((ReduceMeanBangDescriptor_t)desc, size); + // // return cnnlGetReduceMeanWorkspaceSize((ReduceMeanCnnlDescriptor_t) desc, size); + // } + + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMeanWorkspaceSize((ReduceMeanMusaDescriptor_t)desc, size); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor *>(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMean((ReduceMeanBangDescriptor_t)desc, workspace, workspace_size, y, x, stream); + // // return cnnlReduceMean((ReduceMeanCnnlDescriptor_t) desc, workspace, workspace_size, y, x, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMean((ReduceMeanMusaDescriptor_t)desc, workspace, workspace_size, y, x, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMeanDescriptor((ReduceMeanBangDescriptor_t)desc); + // // return cnnlDestroyReduceMeanDescriptor((ReduceMeanCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMeanDescriptor((ReduceMeanMusaDescriptor_t)desc); + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +}
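Since all four entry points repeat this switch/macro shape, one expansion may help when reading them: inside `infiniopReduceMean`, `CALCULATE(INFINI_DEVICE_CPU, cpu)` expands to roughly the following (a sketch with the namespace spelled out):

    case INFINI_DEVICE_CPU:
        return reinterpret_cast<op::reduce_mean::cpu::Descriptor *>(desc)
            ->calculate(workspace, workspace_size, y, x, stream);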
diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h new file mode 100644 index 000000000..bf2e2dda0 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_mean::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMeanInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMeanInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MEAN_H diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index a1d2c2501..cd98d0936 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -61,6 +61,39 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); } +// Sum(x) on non-contiguous data of length count +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tcompute s = 0; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + s += Tcompute(data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Sum(s); +} + +// Max(x) on non-contiguous data of length count +template <unsigned int BLOCK_SIZE, typename Tdata> +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tdata max_ = data_ptr[0]; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + max_ = cub::Max()(max_, data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); +} } // namespace op::common_cuda::reduce_op #endif diff --git a/src/infinirt/infinirt_impl.h b/src/infinirt/infinirt_impl.h index 0d6f8cf05..4c41a1198 100644 --- a/src/infinirt/infinirt_impl.h +++ b/src/infinirt/infinirt_impl.h @@ -30,7 +30,6 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) IMPL; #define INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API(, ) -#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, \ - {*count = 0; return INFINI_STATUS_SUCCESS; }) +#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, {*count = 0; return INFINI_STATUS_SUCCESS; }) #endif // __INFINIRT_IMPL_H__ diff --git a/src/utils.h b/src/utils.h index f4e63be25..e721f05a6 100644 --- a/src/utils.h +++ b/src/utils.h @@ -108,4 +108,4 @@ inline size_t align(size_t size, size_t alignment) { } // namespace utils -#endif +#endif \ No newline at end of file diff --git a/src/utils/check.h b/src/utils/check.h index 7f4a2bdd9..76823aa40 100644 --- a/src/utils/check.h +++ b/src/utils/check.h @@ -59,4 +59,22 @@ #define CHECK_SAME_STRIDES(FIRST, ...)
CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__) +#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ + do { \ + if (INPUT_SHAPE.empty()) { \ + if (!EXPECTED_SHAPE.empty()) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + break; \ + } \ + if (DIM >= INPUT_SHAPE.size()) { \ + return INFINI_STATUS_BAD_PARAM; \ + } \ + std::vector<size_t> reduced_shape = INPUT_SHAPE; \ + reduced_shape[DIM] = 1; \ + if (reduced_shape != EXPECTED_SHAPE) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + } while (0) + #endif // INFINIUTILS_CHECK_H diff --git a/test/infiniop-test/test_generate/__init__.py b/test/infiniop-test/test_generate/__init__.py index a61f63f7c..8db1e6755 100644 --- a/test/infiniop-test/test_generate/__init__.py +++ b/test/infiniop-test/test_generate/__init__.py @@ -1 +1,8 @@ -from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .infiniop_test import ( + InfiniopTestCase, + InfiniopTestWriter, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) diff --git a/test/infiniop-test/test_generate/testcases/add.py b/test/infiniop-test/test_generate/testcases/add.py index b04ba2042..052ef18a7 100644 --- a/test/infiniop-test/test_generate/testcases/add.py +++ b/test/infiniop-test/test_generate/testcases/add.py @@ -4,7 +4,14 @@ from typing import List from numpy.lib.stride_tricks import as_strided -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def add( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("add") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) @@ -114,7 +129,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() - \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/causal_softmax.py b/test/infiniop-test/test_generate/testcases/causal_softmax.py index 74c3efcf0..037701865 100644 ---
a/test/infiniop-test/test_generate/testcases/causal_softmax.py +++ b/test/infiniop-test/test_generate/testcases/causal_softmax.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def causal_softmax(x): @@ -37,8 +43,8 @@ def __init__( super().__init__("causal_softmax") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y @@ -49,10 +55,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -102,6 +116,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_y, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/clip.py b/test/infiniop-test/test_generate/testcases/clip.py index f08a59929..786153197 100644 --- a/test/infiniop-test/test_generate/testcases/clip.py +++ b/test/infiniop-test/test_generate/testcases/clip.py @@ -2,7 +2,13 @@ import gguf from typing import List, Optional, Tuple -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def clip( @@ -35,7 +41,7 @@ def random_tensor(shape, dtype): Returns: Random tensor with the specified shape and dtype """ - return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0) + return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0 class ClipTestCase(InfiniopTestCase): @@ -52,7 +58,7 @@ def __init__( max_val: np.ndarray, max_stride: Optional[List[int]], y: np.ndarray, - y_shape: Optional[List[int]], + y_shape: Optional[List[int]], y_stride: Optional[List[int]], ): super().__init__("clip") @@ -63,7 +69,7 @@ def __init__( self.max_val = max_val self.max_stride = max_stride self.y = y - self.y_shape=y_shape + self.y_shape = y_shape self.y_stride = y_stride def write_test(self, test_writer: "InfiniopTestWriter"): @@ -71,57 +77,64 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Add strides as arrays if they exist if self.x_stride is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride) + ) if self.min_stride is not None: - test_writer.add_array(test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride)) + test_writer.add_array( + test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride) + ) if self.max_stride is not None: - test_writer.add_array(test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride)) + test_writer.add_array( + test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride) + ) if self.y_shape is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_stride if self.y_stride is not None else contiguous_gguf_strides(self.y_shape)) + gguf_strides( + *( + self.y_stride + if self.y_stride is not None + else contiguous_gguf_strides(self.y_shape) + ) + ), ) # Add tensors to the test test_writer.add_tensor( - test_writer.gguf_key("x"), - self.x, - raw_dtype=np_dtype_to_ggml(self.x.dtype) + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) ) test_writer.add_tensor( test_writer.gguf_key("min_val"), self.min_val, - raw_dtype=np_dtype_to_ggml(self.min_val.dtype) + raw_dtype=np_dtype_to_ggml(self.min_val.dtype), ) test_writer.add_tensor( test_writer.gguf_key("max_val"), self.max_val, - raw_dtype=np_dtype_to_ggml(self.max_val.dtype) + raw_dtype=np_dtype_to_ggml(self.max_val.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("y"), - self.y, - raw_dtype=np_dtype_to_ggml(self.y.dtype) + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) ) # Calculate the expected result ans = clip( self.x.astype(np.float64), self.min_val.astype(np.float64), - self.max_val.astype(np.float64) + self.max_val.astype(np.float64), ) # Add the expected result to the test test_writer.add_tensor( - test_writer.gguf_key("ans"), - ans, - raw_dtype=gguf.GGMLQuantizationType.F64 + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("clip.gguf") @@ -130,23 +143,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Test case shapes shapes = [ - (10,), # 1D tensor - (5, 10), # 2D tensor - (2, 3, 4), # 3D tensor - (7, 13), # Prime dimensions - (1, 1), # Minimum shape - (100, 100), # Large shape - (16, 16, 16), # Large 3D + (10,), # 1D tensor + (5, 10), 
# 2D tensor + (2, 3, 4), # 3D tensor + (7, 13), # Prime dimensions + (1, 1), # Minimum shape + (100, 100), # Large shape + (16, 16, 16), # Large 3D ] # Test case min/max values min_max_values = [ - (-1.0, 1.0), # Standard range - (0.0, 2.0), # Positive range - (-2.0, 0.0), # Negative range - (-1000.0, 1000.0), # Large range - (-0.001, 0.001), # Small range - (0.0, 0.0), # min=max + (-1.0, 1.0), # Standard range + (0.0, 2.0), # Positive range + (-2.0, 0.0), # Negative range + (-1000.0, 1000.0), # Large range + (-0.001, 0.001), # Small range + (0.0, 0.0), # min=max ] # Data types to test @@ -171,7 +184,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=None, y=y, y_shape=shape, - y_stride=None + y_stride=None, ) ) @@ -199,7 +212,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=row_stride + y_stride=row_stride, ) ) @@ -219,7 +232,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=col_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) @@ -239,7 +252,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) diff --git a/test/infiniop-test/test_generate/testcases/mul.py b/test/infiniop-test/test_generate/testcases/mul.py index 00c427bcb..ad4f6b806 100644 --- a/test/infiniop-test/test_generate/testcases/mul.py +++ b/test/infiniop-test/test_generate/testcases/mul.py @@ -2,30 +2,36 @@ import numpy as np import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) -def mul( - a: np.ndarray, - b: np.ndarray -): + +def mul(a: np.ndarray, b: np.ndarray): return np.multiply(a, b) + def random_tensor(shape, dtype): rate = 1e-3 var = 0.5 * rate # values fall in [-5e-4, 5e-4] return rate * np.random.rand(*shape).astype(dtype) - var + class MulTestCase(InfiniopTestCase): def __init__( self, a: np.ndarray, - shape_a: List[int] | None, + shape_a: List[int] | None, stride_a: List[int] | None, b: np.ndarray, - shape_b: List[int] | None, + shape_b: List[int] | None, stride_b: List[int] | None, c: np.ndarray, - shape_c: List[int] | None, + shape_c: List[int] | None, stride_c: List[int] | None, ): super().__init__("mul") @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), )
test_writer.add_tensor( @@ -68,7 +83,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) a_fp64 = self.a.astype(np.float64) b_fp64 = self.b.astype(np.float64) - + ans_fp64 = np.multiply(a_fp64, b_fp64) ans = mul(self.a, self.b) test_writer.add_tensor( @@ -80,7 +95,8 @@ def write_test(self, test_writer: "InfiniopTestWriter"): raw_dtype=np_dtype_to_ggml(ans_fp64.dtype), ) -if __name__ == '__main__': + +if __name__ == "__main__": test_writer = InfiniopTestWriter("mul.gguf") test_cases = [] @@ -96,16 +112,15 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)), ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None), ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)), - ] + ] _TENSOR_DTYPES_ = [np.float32, np.float16] - + for dtype in _TENSOR_DTYPES_: for shape, stride_a, stride_b, stride_c in _TEST_CASES_: a = random_tensor(shape, dtype) b = random_tensor(shape, dtype) c = np.empty(tuple(0 for _ in shape), dtype=dtype) - test_cases.append( MulTestCase( a=a, @@ -118,7 +133,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) - ) - + ) + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/rearrange.py b/test/infiniop-test/test_generate/testcases/rearrange.py index 9617a1fc0..3d3a0e73b 100644 --- a/test/infiniop-test/test_generate/testcases/rearrange.py +++ b/test/infiniop-test/test_generate/testcases/rearrange.py @@ -1,14 +1,21 @@ import torch from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + def row_major_strides(shape): """Generate the row-major strides of a tensor - + Args: shape: tensor shape - + Returns: list of row-major strides """ @@ -19,12 +26,13 @@ def row_major_strides(shape): strides.insert(0, stride) return strides + def column_major_strides(shape): """Generate the column-major strides of a tensor - + Args: shape: tensor shape - + Returns: list of column-major strides """ @@ -35,6 +43,7 @@ def column_major_strides(shape): strides.append(stride) return strides + def rearrange_using_torch(src: torch.Tensor, dst_strides: List[int]) -> torch.Tensor: """ Compute the reference rearrange result using torch @@ -66,27 +75,35 @@ def __init__( self.shape = shape self.src_strides = src_strides self.dst_strides = dst_strides - + def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) - + # write shape info if self.shape is not None: test_writer.add_array(test_writer.gguf_key("src.shape"), self.shape) test_writer.add_array(test_writer.gguf_key("dst.shape"), self.shape) - + # write strides info if self.src_strides is not None: - test_writer.add_array(test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides)) + test_writer.add_array( + test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides) + ) test_writer.add_array( test_writer.gguf_key("dst.strides"), - gguf_strides(*self.dst_strides if self.dst_strides is not None else contiguous_gguf_strides(self.shape)) + gguf_strides( + *( + self.dst_strides + if self.dst_strides is not None + else contiguous_gguf_strides(self.shape) + ) + ), ) - + # convert the torch tensors to numpy for writing to file src_numpy = self.src.detach().cpu().numpy() dst_numpy = self.dst.detach().cpu().numpy() - + # write tensor data test_writer.add_tensor( test_writer.gguf_key("src"), @@ -98,9 +115,13 @@ def write_test(self, test_writer: "InfiniopTestWriter"): dst_numpy, raw_dtype=np_dtype_to_ggml(dst_numpy.dtype), ) - + # compute and write the answer -
dst_strides_for_ans = self.dst_strides if self.dst_strides is not None else list(contiguous_gguf_strides(self.shape)) + dst_strides_for_ans = ( + self.dst_strides + if self.dst_strides is not None + else list(contiguous_gguf_strides(self.shape)) + ) ans_torch = rearrange_using_torch(self.src, dst_strides_for_ans) ans_numpy = ans_torch.detach().cpu().numpy() test_writer.add_tensor( @@ -109,6 +130,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): raw_dtype=np_dtype_to_ggml(src_numpy.dtype), ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("rearrange.gguf") test_cases = [] @@ -117,12 +139,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # (shape, src_stride, dst_stride) ((100, 100), (1, 100), (100, 1)), ((4, 4), (1, 4), (4, 1)), - ((4, 6, 64), (64, 4*64, 1), (6*64, 64, 1)), + ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)), ((2000, 2000), (1, 2000), (2000, 1)), ((2001, 2001), (1, 2001), (2001, 1)), ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)), - ((3, 4, 7, 53, 9), row_major_strides((3, 4, 7, 53, 9)), column_major_strides((3, 4, 7, 53, 9))), - ((3, 4, 50, 50, 5, 7), row_major_strides((3, 4, 50, 50, 5, 7)), column_major_strides((3, 4, 50, 50, 5, 7))), + ( + (3, 4, 7, 53, 9), + row_major_strides((3, 4, 7, 53, 9)), + column_major_strides((3, 4, 7, 53, 9)), + ), + ( + (3, 4, 50, 50, 5, 7), + row_major_strides((3, 4, 50, 50, 5, 7)), + column_major_strides((3, 4, 50, 50, 5, 7)), + ), ] _TENSOR_DTYPES_ = [torch.float32, torch.float16] @@ -132,7 +162,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): src = torch.rand(*shape, dtype=dtype) # create the destination tensor with the correct shape dst = torch.empty(shape, dtype=dtype) - + test_case = RearrangeTestCase( src=src, dst=dst, @@ -140,7 +170,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): src_strides=src_strides, dst_strides=dst_strides, ) - test_cases.append(test_case) + test_cases.append(test_case) test_writer.add_tests(test_cases) - test_writer.save() + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py new file mode 100644 index 000000000..868b8b3c4 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/reduce_max.py @@ -0,0 +1,124 @@ +import numpy as np +import gguf +from typing import List +from enum import Enum, auto + +from ..
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + + +def reduce_max(x, dim): + if isinstance(x, np.float64): + return x + return x.max(axis=dim, keepdims=True) + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +class ReduceMaxTestCase(InfiniopTestCase): + def __init__( + self, + x: np.ndarray, + y: np.ndarray, + shape_x: List[int] | None, + shape_y: List[int] | None, + stride_x: List[int] | None, + stride_y: List[int] | None, + dim: int = 0, + ): + super().__init__("reduce_max") + self.x = x + self.y = y + self.shape_x = shape_x + self.shape_y = shape_y + self.stride_x = stride_x + self.stride_y = stride_y + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim) + if self.shape_x is not None: + test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x) + if self.shape_y is not None: + test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y) + if self.stride_x is not None: + test_writer.add_array( + test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x) + ) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), + ) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("input"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), + self.y, + raw_dtype=np_dtype_to_ggml(self.y.dtype), + ) + ans = reduce_max(self.x.astype(np.float64), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("reduce_max.gguf") + test_cases = [] + # ============================================================================== + # Configuration + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), + ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), + ] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_: + x = random_tensor(shape_x, dtype) + y = np.empty(tuple(0 for _ in shape_y), dtype=dtype) + test_case = ReduceMaxTestCase( + x, + y, + shape_x, + shape_y, + stride_x, + stride_y, + dim, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py new file mode 100644 index 000000000..a452a8a89 --- /dev/null +++
b/test/infiniop-test/test_generate/testcases/reduce_mean.py @@ -0,0 +1,124 @@ +import numpy as np +import gguf +from typing import List +from enum import Enum, auto + +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + + +def reduce_mean(x, dim): + if isinstance(x, np.float64): + return x + return x.mean(axis=dim, keepdims=True) + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +class ReduceMeanTestCase(InfiniopTestCase): + def __init__( + self, + x: np.ndarray, + y: np.ndarray, + shape_x: List[int] | None, + shape_y: List[int] | None, + stride_x: List[int] | None, + stride_y: List[int] | None, + dim: int = 0, + ): + super().__init__("reduce_mean") + self.x = x + self.y = y + self.shape_x = shape_x + self.shape_y = shape_y + self.stride_x = stride_x + self.stride_y = stride_y + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim) + if self.shape_x is not None: + test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x) + if self.shape_y is not None: + test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y) + if self.stride_x is not None: + test_writer.add_array( + test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x) + ) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), + ) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("input"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), + self.y, + raw_dtype=np_dtype_to_ggml(self.y.dtype), + ) + ans = reduce_mean(self.x.astype(np.float64), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("reduce_mean.gguf") + test_cases = [] + # ============================================================================== + # Configuration + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), + ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), + ] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_: + x = random_tensor(shape_x, dtype) + y = np.empty(tuple(0 for _ in shape_y), dtype=dtype) + test_case = ReduceMeanTestCase( + x, + y, + shape_x, + shape_y, + stride_x, + stride_y, + dim, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git
a/test/infiniop-test/test_generate/testcases/rms_norm.py b/test/infiniop-test/test_generate/testcases/rms_norm.py index 681ebafc4..9332c090a 100644 --- a/test/infiniop-test/test_generate/testcases/rms_norm.py +++ b/test/infiniop-test/test_generate/testcases/rms_norm.py @@ -1,11 +1,19 @@ import numpy as np from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray: """ Compute the rms_norm reference result with numpy @@ -16,13 +24,14 @@ def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray: Returns: output tensor with the same shape as input """ - squared = x ** 2 + squared = x**2 mean = np.mean(squared, axis=-1, keepdims=True) rms = np.sqrt(mean + epsilon) - + normalized = x / rms return normalized * w + class RMSNormTestCase(InfiniopTestCase): def __init__( self, @@ -40,9 +49,9 @@ def __init__( self.y = y self.shape = shape self.epsilon = epsilon - self.x_strides=x_strides - self.y_strides=y_strides - + self.x_strides = x_strides + self.y_strides = y_strides + def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon) @@ -50,10 +59,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape) test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape) if self.x_strides is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape)) + gguf_strides( + *( + self.y_strides + if self.y_strides is not None + else contiguous_gguf_strides(self.shape) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -70,13 +87,16 @@ def write_test(self, test_writer: "InfiniopTestWriter"): self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype), ) - ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon) + ans = rms_norm( + self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon + ) test_writer.add_tensor( test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(np.float64), ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("rms_norm.gguf") test_cases = [] @@ -112,9 +132,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): shape=shape, x_strides=x_strides, y_strides=y_strides, - epsilon=epsilon + epsilon=epsilon, ) - test_cases.append(test_case) + test_cases.append(test_case) test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/rope.py b/test/infiniop-test/test_generate/testcases/rope.py index 85d9685dd..27f5a06db 100644 --- a/test/infiniop-test/test_generate/testcases/rope.py +++ b/test/infiniop-test/test_generate/testcases/rope.py @@ -4,11 +4,17 @@ from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from ..
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def rotary_embedding(t, sin, cos): - dh = t.shape[2] + dh = t.shape[2] assert dh % 2 == 0, "Embedding dimension must be even." t_even = t[..., 0::2] # [seq_len, n_head, dh // 2] @@ -30,7 +36,9 @@ def rotary_embedding(t, sin, cos): def sin_cos_table(pos, dim, theta, dtype): assert dim % 2 == 0, "Embedding dimension must be even." - freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim)) + freqs = 1.0 / ( + theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim) + ) angles = np.outer(pos, freqs) @@ -79,19 +87,33 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_tensor( - test_writer.gguf_key("pos_ids"), self.pos_ids, raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype) + test_writer.gguf_key("pos_ids"), + self.pos_ids, + raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("sin_table"), self.sin_table, raw_dtype=np_dtype_to_ggml(self.sin_table.dtype) + test_writer.gguf_key("sin_table"), + self.sin_table, + raw_dtype=np_dtype_to_ggml(self.sin_table.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("cos_table"), self.cos_table, raw_dtype=np_dtype_to_ggml(self.cos_table.dtype) + test_writer.gguf_key("cos_table"), + self.cos_table, + raw_dtype=np_dtype_to_ggml(self.cos_table.dtype), ) ans = rotary_embedding( self.x.astype(np.float64), @@ -103,8 +125,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) - - if __name__ == "__main__": # ============================================================================== # Configuration (Internal Use Only) @@ -130,7 +150,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): x = np.random.rand(*shape).astype(dtype) y = np.empty(tuple(0 for _ in shape), dtype=dtype) pos_ids = np.arange(0, x.shape[0], dtype=np.int32) - sin_table, cos_table = sin_cos_table(pos_ids, x.shape[2], theta=1e5, dtype=dtype) + sin_table, cos_table = sin_cos_table( + pos_ids, x.shape[2], theta=1e5, dtype=dtype + ) test_case = RoPETestCase( y=y, x=x, diff --git a/test/infiniop-test/test_generate/testcases/swiglu.py b/test/infiniop-test/test_generate/testcases/swiglu.py index cb692b613..aa3450fed 100644 --- a/test/infiniop-test/test_generate/testcases/swiglu.py +++ b/test/infiniop-test/test_generate/testcases/swiglu.py @@ -2,7 +2,14 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def swiglu( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("swiglu") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -47,14 +52,24 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_b is not None: test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) if self.shape_c is not None: - test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..72bf31f48 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -454,6 +454,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -489,3 +490,69 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def reduce_mean_(lib): + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reduce_max_(lib): + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopOperatorDescriptor_t, + 
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32
+    lib.infiniopDestroyReduceMaxDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py
new file mode 100644
index 000000000..00c86f76a
--- /dev/null
+++ b/test/infiniop/reduce_max.py
@@ -0,0 +1,154 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # y_shape, x_shape, y_stride, x_stride, dim
+    ((), (), None, None, 0),
+    ((1,), (32,), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+]
+
+# x types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TEST_CASES = _TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_max(x, dim):
+    return x.max(dim=dim, keepdim=True)[0]
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Max on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_max(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMaxDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMaxWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_max():
+        check_error(
+            LIBINFINIOP.infiniopReduceMax(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_max()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_max(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_max(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMaxDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py
new file mode 100644
index 000000000..c097cd296
--- /dev/null
+++ b/test/infiniop/reduce_mean.py
@@ -0,0 +1,154 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # y_shape, x_shape, y_stride, x_stride, dim
+    ((), (), None, None, 0),
+    ((1,), (32,), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+]
+
+# x types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TEST_CASES = _TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_mean(x, dim):
+    return x.mean(dim=dim, keepdim=True)
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Mean on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_mean(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMeanDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMeanWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_mean():
+        check_error(
+            LIBINFINIOP.infiniopReduceMean(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_mean()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_mean(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_mean(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMeanDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
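
A minimal sketch (not part of the patch) of the reduction semantics the new tests verify: both operators reduce along a single axis `dim` and keep that axis with size 1, matching the `keepdim=True` torch references in the test files. The NumPy helper names below are illustrative only.

    import numpy as np

    def reduce_max_ref(x: np.ndarray, dim: int) -> np.ndarray:
        # mirrors x.max(dim=dim, keepdim=True)[0] in test/infiniop/reduce_max.py
        return np.max(x, axis=dim, keepdims=True)

    def reduce_mean_ref(x: np.ndarray, dim: int) -> np.ndarray:
        # mirrors x.mean(dim=dim, keepdim=True) in test/infiniop/reduce_mean.py
        return np.mean(x, axis=dim, keepdims=True)

    x = np.random.rand(16, 2048).astype(np.float32)
    assert reduce_max_ref(x, 1).shape == (16, 1)     # the ((16, 1), (16, 2048), ..., 1) case
    assert reduce_mean_ref(x, 0).shape == (1, 2048)  # the ((1, 2048), (16, 2048), ..., 0) case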