diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..f7ad9e02c 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,7 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/reduce_mean.h" +#include "infiniop/ops/reduce_max.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/reduce_max.h b/include/infiniop/ops/reduce_max.h new file mode 100644 index 000000000..42a3dd62d --- /dev/null +++ b/include/infiniop/ops/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MAX_API_H__ +#define __INFINIOP_REDUCE_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/reduce_mean.h b/include/infiniop/ops/reduce_mean.h new file mode 100644 index 000000000..5efd8b227 --- /dev/null +++ b/include/infiniop/ops/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MEAN_API_H__ +#define __INFINIOP_REDUCE_MEAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..89df98708 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,8 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "reduce_mean.py", + "reduce_max.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..aee602aac 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,8 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(reduce_mean) +DECLARE_INFINIOP_TEST(reduce_max) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +45,8 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(reduce_mean) \ + REGISTER_INFINIOP_TEST(reduce_max) \ } namespace infiniop_test { @@ -64,4 +68,4 @@ bool check_names( } // 
namespace infiniop_test -#endif +#endif \ No newline at end of file diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index e2dd45f9f..277061029 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector<std::shared_ptr<Result>> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol); + double rtol, double atol, bool equal_nan = false); // Run a single test read from a GGUF file std::shared_ptr<Result> runTest( @@ -55,10 +55,11 @@ infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id); + size_t test_id, + bool equal_nan = false); // Check if two tensors are close within given tolerance -void allClose(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected, double rtol = 1e-3, double atol = 1e-3); +void allClose(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); // Check if two tensors are equal void allEqual(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected); @@ -85,13 +86,14 @@ class Test { namespace infiniop_test::name { \ class Test : public infiniop_test::base::Test { \ double _rtol, _atol; \ + bool _equal_nan; \ \ public: \ static std::string op_name() { return #name; } \ static std::shared_ptr<base::Test> build( \ std::unordered_map<std::string, std::vector<uint8_t>> attributes, \ std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, \ - double, double); \ + double, double, bool); \ \ static std::vector<std::string> attribute_names(); \ static std::vector<std::string> tensor_names(); \ @@ -109,7 +111,8 @@ class Test { struct Attributes; \ Attributes *_attributes; \ Test() = delete; \ - Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \ + Test(double rtol, double atol, bool equal_nan = false) \ + : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ }; \ } @@ -117,7 +120,7 @@ namespace infiniop_test { using BuilderFunc = std::function<std::shared_ptr<base::Test>( std::unordered_map<std::string, std::vector<uint8_t>>, std::unordered_map<std::string, std::shared_ptr<Tensor>>, - double, double)>; + double, double, bool)>; // Testcase Registry // Each testcase should provide a formatted builder, attribute names, and tensor names diff --git a/src/infiniop-test/src/gguf.cpp b/src/infiniop-test/src/gguf.cpp index a4b200033..aee5b39a8 100644 --- a/src/infiniop-test/src/gguf.cpp +++ b/src/infiniop-test/src/gguf.cpp @@ -53,7 +53,9 @@ GGUFFileReader::GGUFFileReader(const std::string &filepath) { try { _file = std::make_shared<FileMapping>(filepath); } catch (const std::exception &e) { - throw e; + // Log which file failed, then rethrow the original exception: swallowing it + // here would let the constructor continue with no mapping and crash below. + std::cerr << "Error: " << e.what() << std::endl; + throw; } _data = _file->ptr(); _cursor = reinterpret_cast<const uint8_t *>(_data); diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 4863c8172..6805bd7f8 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -1,8 +1,8 @@ #include "gguf.hpp" #include "test.hpp" +#include <cstring> #include <iostream> #include <string> - struct ParsedArgs { std::string file_path; // Mandatory argument: test.gguf file path infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU @@ -11,12 +11,13 @@ struct ParsedArgs { int iterations = 0; // Default to 0 if not given double atol = 0.001; // Default absolute tolerance double rtol = 0.001; // Default relative tolerance + bool equal_nan = false; // Default to false: NaNs compare as unequal }; void printUsage() { std::cout << "Usage:" << std::endl << std::endl; - std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <warmups>] [--run <iterations>] [--atol <atol>] [--rtol <rtol>]" << std::endl + std::cout << "infiniop-test <test_file> [--<device>[:id]] [--warmup <warmups>] [--run <iterations>] [--atol <atol>] [--rtol <rtol>] [--equal-nan <bool>]" << std::endl << std::endl; std::cout << " <test_file>" << std::endl; std::cout << " Path to the test gguf file" << std::endl @@ -36,6 +37,9 @@ void printUsage() { std::cout << " --rtol <rtol>" << std::endl; std::cout << " (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl << std::endl; + std::cout << " --equal-nan <bool>" << std::endl; + std::cout << " (Optional) If true, two NaNs will be considered equal. Default to false" << std::endl + << std::endl; exit(-1); } @@ -91,6 +95,9 @@ ParsedArgs parseArgs(int argc, char *argv[]) { else if (arg == "--rtol" && i + 1 < argc) { args.rtol = std::stod(argv[++i]); } + else if (arg == "--equal-nan" && i + 1 < argc) { + args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0); + } else { printUsage(); } @@ -119,7 +128,7 @@ int main(int argc, char *argv[]) { reader, (infiniDevice_t)args.device_type, args.device_id, args.warmups, args.iterations, - args.rtol, args.atol); + args.rtol, args.atol, args.equal_nan); std::cout << "=====================================" << std::endl; for (auto result : results) {
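Usage note: the new flag rides along with the existing tolerance options; with --equal-nan true, positions where both the actual and the expected tensor hold NaN count as matches. A hypothetical invocation (the gguf file name and device flag are illustrative, not part of this patch):

    infiniop-test reduce_max.gguf --nvidia:0 --equal-nan true --atol 0.001 --rtol 0.001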
diff --git a/src/infiniop-test/src/ops/add.cpp b/src/infiniop-test/src/ops/add.cpp index 27f69d687..e90290d55 100644 --- a/src/infiniop-test/src/ops/add.cpp +++ b/src/infiniop-test/src/ops/add.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/causal_softmax.cpp b/src/infiniop-test/src/ops/causal_softmax.cpp index 29612960a..97c65ef8c 100644 --- a/src/infiniop-test/src/ops/causal_softmax.cpp +++ b/src/infiniop-test/src/ops/causal_softmax.cpp @@ -14,8 +14,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("y") == tensors.end() @@ -53,7 +53,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -92,7 +92,7 @@ std::string Test::toString() const { oss << "- y: " << _attributes->y->info() << std::endl; oss << "- ans: " << _attributes->ans->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/clip.cpp b/src/infiniop-test/src/ops/clip.cpp index 82a0e9b10..a01c18a4d 100644 --- a/src/infiniop-test/src/ops/clip.cpp +++ b/src/infiniop-test/src/ops/clip.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("min_val") == tensors.end() @@ -64,7 +64,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -109,7 +109,7 @@ std::string Test::toString() const { oss << "- max_val: " << _attributes->max_val->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/gemm.cpp b/src/infiniop-test/src/ops/gemm.cpp index 37c8ed6fe..664288d73 100644 --- a/src/infiniop-test/src/ops/gemm.cpp +++ b/src/infiniop-test/src/ops/gemm.cpp @@ -18,8 +18,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); } @@ -65,7 +65,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -120,7 +120,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/mul.cpp b/src/infiniop-test/src/ops/mul.cpp index 8ebfc426b..cb0b639bf 100644 --- a/src/infiniop-test/src/ops/mul.cpp +++ b/src/infiniop-test/src/ops/mul.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/random_sample.cpp b/src/infiniop-test/src/ops/random_sample.cpp index a11e0f446..75ee07b44 100644 --- a/src/infiniop-test/src/ops/random_sample.cpp +++ b/src/infiniop-test/src/ops/random_sample.cpp @@ -20,8 +20,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); } @@ -70,7 +70,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(result, _attributes->ans, _rtol, _atol); + allClose(result, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- data: " << _attributes->data->info() << std::endl; oss << "- result: " << _attributes->result->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rearrange.cpp b/src/infiniop-test/src/ops/rearrange.cpp index 9fbf6f2cb..bdf162ce2 100644 --- a/src/infiniop-test/src/ops/rearrange.cpp +++ b/src/infiniop-test/src/ops/rearrange.cpp @@ -12,9 +12,9 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); } diff --git a/src/infiniop-test/src/ops/reduce_max.cpp b/src/infiniop-test/src/ops/reduce_max.cpp new file mode 100644 index 000000000..37045ec78 --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_max.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::reduce_max { +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; + size_t dim; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("output") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast<size_t *>(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + + return test; +} + +std::shared_ptr<infiniop_test::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopReduceMaxDescriptor_t op_desc; + CHECK_OR(infiniopCreateReduceMaxDescriptor(handle, &op_desc, + _attributes->output->desc(), + _attributes->input->desc(), + _attributes->dim), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMax descriptor")); + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + size_t workspace_size; + CHECK_OR(infiniopGetReduceMaxWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopReduceMax(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMax execution failed")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReduceMax(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { + return {"dim"}; +} + +std::vector<std::string> Test::tensor_names() { + return {"input", "ans", "output"}; +} + +std::vector<std::string> Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- dim=" << _attributes->dim << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::reduce_max diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp new file mode 100644 index 000000000..9de22d49d --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_mean.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include <infinirt.h> +#include <iomanip> +#include <sstream> + +namespace infiniop_test::reduce_mean { +struct Test::Attributes { + std::shared_ptr<Tensor> input; + std::shared_ptr<Tensor> output; + std::shared_ptr<Tensor> ans; + size_t dim; +}; + +std::shared_ptr<base::Test> Test::build( + std::unordered_map<std::string, std::vector<uint8_t>> attributes, + std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("output") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast<size_t *>(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + + return test; +} + +std::shared_ptr<infiniop_test::Result> Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopReduceMeanDescriptor_t op_desc; + CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc, + _attributes->output->desc(), + _attributes->input->desc(), + _attributes->dim), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor")); + + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + size_t workspace_size; + CHECK_OR(infiniopGetReduceMeanWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopReduceMean(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed")); + + try { + allClose(output, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReduceMean(op_desc, + workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector<std::string> Test::attribute_names() { + return {"dim"}; +} + +std::vector<std::string> Test::tensor_names() { + return {"input", "ans", "output"}; +} + +std::vector<std::string> Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- dim=" << _attributes->dim << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::reduce_mean diff --git a/src/infiniop-test/src/ops/rms_norm.cpp b/src/infiniop-test/src/ops/rms_norm.cpp index 8359a4536..786ce8470 100644 --- a/src/infiniop-test/src/ops/rms_norm.cpp +++ b/src/infiniop-test/src/ops/rms_norm.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (attributes.find("epsilon") == attributes.end() @@ -72,7 +72,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- w: " << _attributes->w->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rope.cpp b/src/infiniop-test/src/ops/rope.cpp index 636f565af..94010a122 100644 --- a/src/infiniop-test/src/ops/rope.cpp +++ b/src/infiniop-test/src/ops/rope.cpp @@ -17,8 +17,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("y") == tensors.end() @@ -77,7 +77,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -121,7 +121,7 @@ std::string Test::toString() const { oss << "- sin_table: " << _attributes->sin_table->info() << std::endl; oss << "- cos_table: " << _attributes->cos_table->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/sub.cpp b/src/infiniop-test/src/ops/sub.cpp index 6bb1fd1eb..bb3adc350 100644 --- a/src/infiniop-test/src/ops/sub.cpp +++ b/src/infiniop-test/src/ops/sub.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/swiglu.cpp b/src/infiniop-test/src/ops/swiglu.cpp index 96b75efc5..f86dfadc6 100644 --- a/src/infiniop-test/src/ops/swiglu.cpp +++ b/src/infiniop-test/src/ops/swiglu.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr<base::Test> Test::build( std::unordered_map<std::string, std::vector<uint8_t>> attributes, std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, - double rtol, double atol) { - auto test = std::shared_ptr<Test>(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr<Test>(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() @@ -54,7 +54,7 @@ std::shared_ptr<infiniop_test::Result> Test::run( CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr), return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -93,7 +93,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); }
diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp index e312ac5f5..0cbfe067a 100644 --- a/src/infiniop-test/src/test.cpp +++ b/src/infiniop-test/src/test.cpp @@ -49,7 +49,7 @@ std::string Result::toString() const { std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto count_meta = meta.find("test_count"); if (count_meta == meta.end()) { @@ -60,7 +60,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read auto results = std::vector<std::shared_ptr<Result>>(count); try { for (size_t i = 0; i < count; i++) { - results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i); + results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i, equal_nan); } } catch (const std::exception &e) { std::cerr << "Error: " << e.what() << std::endl; @@ -72,7 +72,7 @@ std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_read std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol, size_t test_id) { + double rtol, double atol, size_t test_id, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto tensor_info = gguf_reader.getTensorInfoMap(); auto name_meta = meta.find("test." + std::to_string(test_id) + ".op_name"); @@ -107,7 +107,7 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader, } std::shared_ptr<base::Test> test; try { - test = builder.build(attrs, tensors, rtol, atol); + test = builder.build(attrs, tensors, rtol, atol, equal_nan); } catch (const std::exception &e) { return TEST_INIT_FAILED(op_name + "\n" + e.what()); } @@ -141,7 +141,7 @@ void incrementOffset(ptrdiff_t &offset_1, const std::vector<ptrdiff_t> &strides_ } } -void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol) { +void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol, bool equal_nan) { auto actual = actual_->to(INFINI_DEVICE_CPU); auto expected = expected_->to(INFINI_DEVICE_CPU); auto shape = actual->shape(); @@ -158,12 +158,22 @@ for (size_t i = 0; i < total; i++) { double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type()); double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type()); - if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) { - if (num_failed == 0) { - first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + "."; + if (std::isnan(a_) || std::isnan(e_)) { + // NaNs never satisfy the tolerance check; with equal_nan they pass + // only when actual and expected are both NaN. Record the message + // before incrementing, otherwise it is never set for NaN failures. + if (!equal_nan || (std::isnan(a_) != std::isnan(e_))) { + if (num_failed == 0) { + first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + "."; + } + num_failed++; + } + } else if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) { + if (num_failed == 0) { + first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + "."; + } + num_failed++; } - num_failed++; } + incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()), expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()), counter, shape);
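For reference, the pass/fail rule implemented in allClose above reduces to the scalar predicate below. This standalone sketch restates it outside the tensor iteration; the helper name is illustrative, not part of the patch:

    #include <cassert>
    #include <cmath>
    #include <limits>

    // A pair passes if it is within atol OR within rtol of the larger magnitude;
    // NaNs pass only when equal_nan is set and both sides are NaN.
    static bool pairClose(double a, double e, double rtol, double atol, bool equal_nan) {
        if (std::isnan(a) || std::isnan(e)) {
            return equal_nan && std::isnan(a) && std::isnan(e);
        }
        double diff = std::fabs(a - e);
        return diff <= atol || diff <= rtol * std::fmax(std::fabs(a), std::fabs(e));
    }

    int main() {
        double nan = std::numeric_limits<double>::quiet_NaN();
        assert(pairClose(1.0, 1.0005, 1e-3, 1e-3, false)); // within tolerance
        assert(!pairClose(nan, nan, 1e-3, 1e-3, false));   // NaN != NaN by default
        assert(pairClose(nan, nan, 1e-3, 1e-3, true));     // --equal-nan true
        assert(!pairClose(1.0, nan, 1e-3, 1e-3, true));    // one-sided NaN still fails
    }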
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc new file mode 100644 index 000000000..55a340226 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_max_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_max::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <typename T> +infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) { + const size_t cols = info->shape[3]; // reduced (innermost) dimension + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + // MSVC only ships OpenMP 2.0: loop indices must be signed and collapse() + // is unavailable, so only the outer loop is parallelized here. + const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]); + const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]); + const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(max); + } else { + *output_ptr = max; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride); + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(max); + } else { + *output_ptr = max; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_max(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_max(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_max(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_max::cpu diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h new file mode 100644 index 000000000..bf6be9b74 --- /dev/null +++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MAX_CPU_H__ +#define __REDUCE_MAX_CPU_H__ +#include "../reduce_max.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h new file mode 100644 index 000000000..99bb5be19 --- /dev/null +++ b/src/infiniop/ops/reduce_max/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MAX_INFO_H__ +#define __REDUCE_MAX_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include <vector> + +namespace op::reduce_max { + +class ReduceMaxInfo { + ReduceMaxInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector<size_t> shape; + std::vector<ptrdiff_t> y_strides; + std::vector<ptrdiff_t> x_strides; + + static utils::Result<ReduceMaxInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector<size_t> shape = {1, 1, 1, 1}; + std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0}; + std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0}; + return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{ dtype, shape, y_strides, x_strides}); + } else { + std::vector<size_t> shape = x_desc->shape(); + std::vector<ptrdiff_t> y_strides = y_desc->strides(); + std::vector<ptrdiff_t> x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{ dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_max + +#endif // __REDUCE_MAX_INFO_H__
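The create() above normalizes every tensor to a 4-D view whose last axis is the reduced one: the reduced dimension is swapped to the back, then leading size-1 axes (with stride 0) are prepended. A standalone sketch of that normalization on a sample shape, with illustrative values (not taken from the patch):

    #include <cstddef>
    #include <iostream>
    #include <utility>
    #include <vector>

    int main() {
        // A contiguous 3-D tensor [2, 3, 4], reduced over dim = 1.
        std::vector<size_t> shape = {2, 3, 4};
        std::vector<ptrdiff_t> strides = {12, 4, 1};
        size_t dim = 1;

        if (dim != shape.size() - 1) { // move the reduced axis to the back
            std::swap(shape[dim], shape[shape.size() - 1]);
            std::swap(strides[dim], strides[shape.size() - 1]);
        }
        while (shape.size() < 4) { // pad to 4-D with dummy leading axes
            shape.insert(shape.begin(), 1);
            strides.insert(strides.begin(), 0);
        }
        for (size_t s : shape) std::cout << s << ' ';   // prints: 1 2 4 3
        std::cout << '\n';
        for (ptrdiff_t s : strides) std::cout << s << ' '; // prints: 0 12 1 4
        std::cout << '\n';
    }

The kernels then only ever see a [batch, channels, rows, cols] problem where cols is the reduction axis, which is why both CPU loops and the GPU grid below are written for exactly four dimensions.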
diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h new file mode 100644 index 000000000..735bc8da4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_METAX_H__ +#define __REDUCE_MAX_METAX_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca new file mode 100644 index 000000000..ab120fea5 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_max_metax.h" + +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +INFINIOP_METAX_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::metax { + +struct Descriptor::Opaque { + std::shared_ptr<device::metax::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax<BLOCK_SIZE, half, float><<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax<BLOCK_SIZE, __hpcc_bfloat16, float><<<grid, BLOCK_SIZE, 0, stream>>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax<BLOCK_SIZE, float, float><<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::metax diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh new file mode 100644 index 000000000..fec2f9341 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh @@ -0,0 +1,21 @@ +#ifndef __REDUCE_MAX_KERNEL_CUH__ +#define __REDUCE_MAX_KERNEL_CUH__ + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +__device__ void ReduceMaxKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + // One block per (batch, channel, row): blockIdx selects the row to reduce. + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Block-reduce the max of the row; thread 0 writes the result + Tcompute max_0 = op::common_cuda::reduce_op::max<BLOCK_SIZE, Tdata, Tcompute>(x, width, x_stride_w); + if (threadIdx.x == 0) { + *y = max_0; + } +} + +#endif // __REDUCE_MAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu new file mode 100644 index 000000000..f64f596f4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_max_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +INFINIOP_CUDA_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel<BLOCK_SIZE, Tdata, Tcompute>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr<device::nvidia::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax<BLOCK_SIZE, half, float><<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax<BLOCK_SIZE, __nv_bfloat16, float><<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax<BLOCK_SIZE, float, float><<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, batch_size, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::nvidia diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh new file mode 100644 index 000000000..388738c27 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_NVIDIA_H__ +#define __REDUCE_MAX_NVIDIA_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(nvidia) + +#endif
diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc new file mode 100644 index 000000000..fc8a976b6 --- /dev/null +++ b/src/infiniop/ops/reduce_max/operator.cc @@ -0,0 +1,181 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_max.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_max_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_max_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_max_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ case CASE: \ return op::reduce_max::NAMESPACE::Descriptor::create( \ handle, \ reinterpret_cast<op::reduce_max::NAMESPACE::Descriptor **>(desc_ptr), \ output_desc, \ input_desc, \ dim); + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ case CASE: \ *size = reinterpret_cast<op::reduce_max::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast<const op::reduce_max::NAMESPACE::Descriptor *>(desc)->calculate( \ workspace, workspace_size, output, input, stream); + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ case CASE: \ delete reinterpret_cast<op::reduce_max::NAMESPACE::Descriptor *>(desc); \ return INFINI_STATUS_SUCCESS; + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h new file mode 100644 index 000000000..6ef8630f2 --- /dev/null +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ \ namespace op::reduce_max::NAMESPACE { \ class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ ReduceMaxInfo _info; \ size_t _workspace_size; \ \ Descriptor( \ Opaque *opaque, \ ReduceMaxInfo info, \ size_t workspace_size, \ infiniDevice_t device_type, \ int device_id) \ : InfiniopDescriptor{device_type, device_id}, \ _opaque(opaque), \ _info(info), \ _workspace_size(workspace_size) {} \ \ public: \ ~Descriptor(); \ \ size_t workspaceSize() const { return _workspace_size; } \ \ static infiniStatus_t create( \ infiniopHandle_t handle, \ Descriptor **desc_ptr, \ infiniopTensorDescriptor_t output_desc, \ infiniopTensorDescriptor_t input_desc, \ size_t dim); \ \ infiniStatus_t calculate( \ void *workspace, size_t workspace_size, \ void *output, \ const void *input, \ void *stream) const; \ }; \ }

#endif // REDUCE_MAX_H \ No newline at end of file
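Taken together, the four exported entry points follow the library's usual create/query/run/destroy flow. A minimal host-side sketch using only the functions declared in this patch; the handle, tensor descriptors, device buffers, and include paths are assumed to be set up elsewhere, and error handling is abbreviated:

    #include <infiniop.h>
    #include <infinirt.h>

    infiniStatus_t runReduceMax(infiniopHandle_t handle,
                                infiniopTensorDescriptor_t out_desc, void *out,
                                infiniopTensorDescriptor_t in_desc, const void *in,
                                size_t dim, void *stream) {
        infiniopReduceMaxDescriptor_t desc;
        infiniStatus_t status = infiniopCreateReduceMaxDescriptor(handle, &desc, out_desc, in_desc, dim);
        if (status != INFINI_STATUS_SUCCESS) {
            return status;
        }

        size_t workspace_size = 0;
        infiniopGetReduceMaxWorkspaceSize(desc, &workspace_size);

        void *workspace = nullptr;
        if (workspace_size > 0) {
            infinirtMalloc(&workspace, workspace_size); // workspace is 0 for all current backends
        }

        status = infiniopReduceMax(desc, workspace, workspace_size, out, in, stream);

        if (workspace != nullptr) {
            infinirtFree(workspace);
        }
        infiniopDestroyReduceMaxDescriptor(desc);
        return status;
    }

The ReduceMean API below is symmetric: only the operator name in each call changes.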
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc new file mode 100644 index 000000000..7853afdbb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -0,0 +1,102 @@ +#include "reduce_mean_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_mean::cpu { + +Descriptor::~Descriptor() {} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto result = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(result); + *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <typename T> +infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) { + const size_t cols = info->shape[3]; // reduced (innermost) dimension + const ptrdiff_t y_batch_stride = info->y_strides[0]; + const ptrdiff_t y_channel_stride = info->y_strides[1]; + const ptrdiff_t y_row_stride = info->y_strides[2]; + const ptrdiff_t x_batch_stride = info->x_strides[0]; + const ptrdiff_t x_channel_stride = info->x_strides[1]; + const ptrdiff_t x_row_stride = info->x_strides[2]; + const ptrdiff_t x_col_stride = info->x_strides[3]; +#ifdef _WIN32 + // MSVC only ships OpenMP 2.0: loop indices must be signed and collapse() + // is unavailable, so only the outer loop is parallelized here. + const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]); + const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]); + const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]); +#pragma omp parallel for + for (ptrdiff_t batch = 0; batch < batch_size; ++batch) { + for (ptrdiff_t channel = 0; channel < channels; ++channel) { + for (ptrdiff_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(mean); + } else { + *output_ptr = mean; + } + } + } + } +#else + const size_t batch_size = info->shape[0]; + const size_t channels = info->shape[1]; + const size_t rows = info->shape[2]; +#pragma omp parallel for collapse(3) + for (size_t batch = 0; batch < batch_size; ++batch) { + for (size_t channel = 0; channel < channels; ++channel) { + for (size_t row = 0; row < rows; ++row) { + const T *input_start = x + batch * x_batch_stride + channel * x_channel_stride + row * x_row_stride; + T *output_ptr = y + batch * y_batch_stride + channel * y_channel_stride + row * y_row_stride; + + float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols; + + if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) { + *output_ptr = utils::cast<T>(mean); + } else { + *output_ptr = mean; + } + } + } + } +#endif //_WIN32 + return INFINI_STATUS_SUCCESS; +} +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) const { + + if (_info.dtype == INFINI_DTYPE_F16) { + CHECK_STATUS(reduce_mean(&_info, (fp16_t *)y, (const fp16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CHECK_STATUS(reduce_mean(&_info, (bf16_t *)y, (const bf16_t *)x)); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CHECK_STATUS(reduce_mean(&_info, (float *)y, (const float *)x)); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::reduce_mean::cpu diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h new file
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h new file mode 100644 index 000000000..f67601c62 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h @@ -0,0 +1,7 @@ +#ifndef __REDUCE_MEAN_CPU_H__ +#define __REDUCE_MEAN_CPU_H__ +#include "../reduce_mean.h" + +DESCRIPTOR(cpu) + +#endif diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h new file mode 100644 index 000000000..6c11e07d3 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/info.h @@ -0,0 +1,62 @@ +#ifndef __REDUCE_MEAN_INFO_H__ +#define __REDUCE_MEAN_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include <vector> + +namespace op::reduce_mean { + +class ReduceMeanInfo { + ReduceMeanInfo() = default; + +public: + infiniDtype_t dtype; + + std::vector<size_t> shape; + std::vector<ptrdiff_t> y_strides; + std::vector<ptrdiff_t> x_strides; + + static utils::Result<ReduceMeanInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) { + auto dtype = y_desc->dtype(); + if (dtype != x_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32); + + size_t ndim = y_desc->ndim(); + if (x_desc->ndim() != ndim) { + CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape()); + if (ndim > 4) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } else if (ndim == 0) { + std::vector<size_t> shape = {1, 1, 1, 1}; + std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0}; + std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0}; + return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } else { + std::vector<size_t> shape = x_desc->shape(); + std::vector<ptrdiff_t> y_strides = y_desc->strides(); + std::vector<ptrdiff_t> x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)) { + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4) { + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_mean + +#endif // __REDUCE_MEAN_INFO_H__
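To make the canonicalization in `ReduceMeanInfo::create` concrete: the reduced dimension is swapped to the last position, then shape and strides are left-padded to rank 4, so every backend only ever reduces the innermost canonical axis. A small standalone sketch of the same transform (hypothetical helper mirroring the logic above):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Mirrors ReduceMeanInfo::create: reduced dim moved last, rank padded to 4.
    static void canonicalize(std::vector<std::size_t> &shape,
                             std::vector<std::ptrdiff_t> &strides,
                             std::size_t dim) {
        if (dim != shape.size() - 1) {
            std::swap(shape[dim], shape[shape.size() - 1]);
            std::swap(strides[dim], strides[shape.size() - 1]);
        }
        while (shape.size() < 4) { // pad leading size-1 dims with stride 0
            shape.insert(shape.begin(), 1);
            strides.insert(strides.begin(), 0);
        }
    }

    // Example: shape (2048, 16), strides (16, 1), dim = 0 becomes
    // shape (1, 1, 16, 2048), strides (0, 0, 1, 16): the kernel then
    // reduces 2048 elements at stride 16 for each of the 16 output rows.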
diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h new file mode 100644 index 000000000..a105724d6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_METAX_H__ +#define __REDUCE_MEAN_METAX_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca new file mode 100644 index 000000000..0eaf1c9cb --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_mean_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata> +INFINIOP_METAX_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel<BLOCK_SIZE, Tdata, float>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::metax { + +struct Descriptor::Opaque { + std::shared_ptr<device::metax::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::metax::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean<BLOCK_SIZE, half> + <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean<BLOCK_SIZE, __hpcc_bfloat16> + <<<grid, BLOCK_SIZE, 0, stream>>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean<BLOCK_SIZE, float> + <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_1024>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<METAX_BLOCK_SIZE_512>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::metax
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh new file mode 100644 index 000000000..e70748605 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh @@ -0,0 +1,26 @@ +#ifndef __REDUCE_MEAN_KERNEL_CUH__ +#define __REDUCE_MEAN_KERNEL_CUH__ + +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +__device__ void ReduceMeanKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] block-wide sum of this row's `width` elements (strided) + Tcompute sum_0 = op::common_cuda::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(x, width, x_stride_w); + // cub::BlockReduce returns the valid aggregate only on thread 0, + // so a single thread writes the mean for this (batch, channel, row). + if (threadIdx.x == 0) { + *y = sum_0 / width; + } +} + +#endif // __REDUCE_MEAN_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu new file mode 100644 index 000000000..bfc26e23d --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -0,0 +1,101 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_mean_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template <unsigned int BLOCK_SIZE, typename Tdata> +INFINIOP_CUDA_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel<BLOCK_SIZE, Tdata, float>(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr<device::nvidia::Handle::Internal> internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template <unsigned int BLOCK_SIZE> +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean<BLOCK_SIZE, half> + <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean<BLOCK_SIZE, __nv_bfloat16> + <<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean<BLOCK_SIZE, float> + <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +
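The `calculate` implementations below select a compile-time `BLOCK_SIZE` from the device's runtime thread limit, because `cub::BlockReduce` needs the block size as a template parameter. Stripped to its essence, the dispatch pattern looks like this (hypothetical names, two block sizes only):

    #include <cstddef>

    constexpr unsigned int BLOCK_512 = 512;
    constexpr unsigned int BLOCK_1024 = 1024;

    template <unsigned int BLOCK_SIZE>
    int launch() {
        // A real backend would launch
        // Kernel<BLOCK_SIZE><<<grid, BLOCK_SIZE, 0, stream>>>(...) here.
        return static_cast<int>(BLOCK_SIZE);
    }

    // The runtime device limit picks the compile-time specialization.
    int dispatch(std::size_t max_threads_per_block) {
        if (max_threads_per_block == BLOCK_1024) {
            return launch<BLOCK_1024>();
        }
        return launch<BLOCK_512>();
    }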
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::nvidia diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh new file mode 100644 index 000000000..be16b4491 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_NVIDIA_H__ +#define __REDUCE_MEAN_NVIDIA_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc new file mode 100644 index 000000000..3696f95d2 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -0,0 +1,182 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_mean.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_mean_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/reduce_mean_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_mean_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_mean_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_mean::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor **>(desc_ptr), \ + y_desc, \ + x_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMeanDescriptor((BangHandle_t)handle, (ReduceMeanBangDescriptor_t *)desc_ptr, y_desc); + // // return cnnlCreateReduceMeanDescriptor((BangHandle_t) handle, (ReduceMeanCnnlDescriptor_t *) desc_ptr, y_desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMeanDescriptor((MusaHandle_t)handle, (ReduceMeanMusaDescriptor_t *)desc_ptr, y_desc); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \
+ *size = reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMeanWorkspaceSize((ReduceMeanBangDescriptor_t)desc, size); + // // return cnnlGetReduceMeanWorkspaceSize((ReduceMeanCnnlDescriptor_t) desc, size); + // } + + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMeanWorkspaceSize((ReduceMeanMusaDescriptor_t)desc, size); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor *>(desc)->calculate( \ + workspace, workspace_size, y, x, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMean((ReduceMeanBangDescriptor_t)desc, workspace, workspace_size, y, x, stream); + // // return cnnlReduceMean((ReduceMeanCnnlDescriptor_t) desc, workspace, workspace_size, y, x, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMean((ReduceMeanMusaDescriptor_t)desc, workspace, workspace_size, y, x, stream); + // } + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast<op::reduce_mean::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia) +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMeanDescriptor((ReduceMeanBangDescriptor_t)desc); + // // return cnnlDestroyReduceMeanDescriptor((ReduceMeanCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMeanDescriptor((ReduceMeanMusaDescriptor_t)desc); + // #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +}
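Since all four entry points repeat this switch/macro shape, one expansion may help when reading them: inside `infiniopReduceMean`, `CALCULATE(INFINI_DEVICE_CPU, cpu)` expands to roughly the following (a sketch with the namespace spelled out):

    case INFINI_DEVICE_CPU:
        return reinterpret_cast<op::reduce_mean::cpu::Descriptor *>(desc)
            ->calculate(workspace, workspace_size, y, x, stream);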
diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h new file mode 100644 index 000000000..bf2e2dda0 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/reduce_mean.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MEAN_H +#define REDUCE_MEAN_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_mean::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMeanInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMeanInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *y, \ + const void *x, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MEAN_H diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index a1d2c2501..cd98d0936 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -61,6 +61,39 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); } +// Sum(x) on non-contiguous data of length count +template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute> +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tcompute s = 0; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + s += Tcompute(data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce<Tcompute, BLOCK_SIZE>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Sum(s); +} + +// Max(x) on non-contiguous data of length count +template <unsigned int BLOCK_SIZE, typename Tdata> +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { + Tdata max_ = data_ptr[0]; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + max_ = cub::Max()(max_, data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce<Tdata, BLOCK_SIZE>; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); +} } // namespace op::common_cuda::reduce_op #endif diff --git a/src/infinirt/infinirt_impl.h b/src/infinirt/infinirt_impl.h index 0d6f8cf05..4c41a1198 100644 --- a/src/infinirt/infinirt_impl.h +++ b/src/infinirt/infinirt_impl.h @@ -30,7 +30,6 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) IMPL; #define INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API(, ) -#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, \ - {*count = 0; return INFINI_STATUS_SUCCESS; }) +#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, {*count = 0; return INFINI_STATUS_SUCCESS; }) #endif // __INFINIRT_IMPL_H__ diff --git a/src/utils.h b/src/utils.h index f4e63be25..e721f05a6 100644 --- a/src/utils.h +++ b/src/utils.h @@ -108,4 +108,4 @@ inline size_t align(size_t size, size_t alignment) { } // namespace utils -#endif +#endif \ No newline at end of file diff --git a/src/utils/check.h b/src/utils/check.h index 7f4a2bdd9..76823aa40 100644 --- a/src/utils/check.h +++ b/src/utils/check.h @@ -59,4 +59,22 @@ #define CHECK_SAME_STRIDES(FIRST, ...)
CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__) +#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ + do { \ + if (INPUT_SHAPE.empty()) { \ + if (!EXPECTED_SHAPE.empty()) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + break; \ + } \ + if (DIM >= INPUT_SHAPE.size()) { \ + return INFINI_STATUS_BAD_PARAM; \ + } \ + std::vector<size_t> reduced_shape = INPUT_SHAPE; \ + reduced_shape[DIM] = 1; \ + if (reduced_shape != EXPECTED_SHAPE) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + } while (0) + #endif // INFINIUTILS_CHECK_H diff --git a/test/infiniop-test/test_generate/__init__.py b/test/infiniop-test/test_generate/__init__.py index a61f63f7c..8db1e6755 100644 --- a/test/infiniop-test/test_generate/__init__.py +++ b/test/infiniop-test/test_generate/__init__.py @@ -1 +1,8 @@ -from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .infiniop_test import ( + InfiniopTestCase, + InfiniopTestWriter, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) diff --git a/test/infiniop-test/test_generate/testcases/add.py b/test/infiniop-test/test_generate/testcases/add.py index b04ba2042..052ef18a7 100644 --- a/test/infiniop-test/test_generate/testcases/add.py +++ b/test/infiniop-test/test_generate/testcases/add.py @@ -4,7 +4,14 @@ from typing import List from numpy.lib.stride_tricks import as_strided -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def add( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("add") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) @@ -114,7 +129,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() - \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/causal_softmax.py b/test/infiniop-test/test_generate/testcases/causal_softmax.py index 74c3efcf0..037701865 100644 ---
a/test/infiniop-test/test_generate/testcases/causal_softmax.py +++ b/test/infiniop-test/test_generate/testcases/causal_softmax.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def causal_softmax(x): @@ -37,8 +43,8 @@ def __init__( super().__init__("causal_softmax") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y @@ -49,10 +55,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -102,6 +116,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_y, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/clip.py b/test/infiniop-test/test_generate/testcases/clip.py index f08a59929..786153197 100644 --- a/test/infiniop-test/test_generate/testcases/clip.py +++ b/test/infiniop-test/test_generate/testcases/clip.py @@ -2,7 +2,13 @@ import gguf from typing import List, Optional, Tuple -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def clip( @@ -35,7 +41,7 @@ def random_tensor(shape, dtype): Returns: Random tensor with the specified shape and dtype """ - return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0) + return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0 class ClipTestCase(InfiniopTestCase): @@ -52,7 +58,7 @@ def __init__( max_val: np.ndarray, max_stride: Optional[List[int]], y: np.ndarray, - y_shape: Optional[List[int]], + y_shape: Optional[List[int]], y_stride: Optional[List[int]], ): super().__init__("clip") @@ -63,7 +69,7 @@ def __init__( self.max_val = max_val self.max_stride = max_stride self.y = y - self.y_shape=y_shape + self.y_shape = y_shape self.y_stride = y_stride def write_test(self, test_writer: "InfiniopTestWriter"): @@ -71,57 +77,64 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Add strides as arrays if they exist if self.x_stride is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride) + ) if self.min_stride is not None: - test_writer.add_array(test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride)) + test_writer.add_array( + test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride) + ) if self.max_stride is not None: - test_writer.add_array(test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride)) + test_writer.add_array( + test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride) + ) if self.y_shape is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_stride if self.y_stride is not None else contiguous_gguf_strides(self.y_shape)) + gguf_strides( + *( + self.y_stride + if self.y_stride is not None + else contiguous_gguf_strides(self.y_shape) + ) + ), ) # Add tensors to the test test_writer.add_tensor( - test_writer.gguf_key("x"), - self.x, - raw_dtype=np_dtype_to_ggml(self.x.dtype) + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) ) test_writer.add_tensor( test_writer.gguf_key("min_val"), self.min_val, - raw_dtype=np_dtype_to_ggml(self.min_val.dtype) + raw_dtype=np_dtype_to_ggml(self.min_val.dtype), ) test_writer.add_tensor( test_writer.gguf_key("max_val"), self.max_val, - raw_dtype=np_dtype_to_ggml(self.max_val.dtype) + raw_dtype=np_dtype_to_ggml(self.max_val.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("y"), - self.y, - raw_dtype=np_dtype_to_ggml(self.y.dtype) + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) ) # Calculate the expected result ans = clip( self.x.astype(np.float64), self.min_val.astype(np.float64), - self.max_val.astype(np.float64) + self.max_val.astype(np.float64), ) # Add the expected result to the test test_writer.add_tensor( - test_writer.gguf_key("ans"), - ans, - raw_dtype=gguf.GGMLQuantizationType.F64 + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("clip.gguf") @@ -130,23 +143,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Test case shapes shapes = [ - (10,), # 1D tensor - (5, 10), # 2D tensor - (2, 3, 4), # 3D tensor - (7, 13), # Prime dimensions - (1, 1), # Minimum shape - (100, 100), # Large shape - (16, 16, 16), # Large 3D + (10,), # 1D tensor + (5, 10), 
# 2D tensor + (2, 3, 4), # 3D tensor + (7, 13), # Prime dimensions + (1, 1), # Minimum shape + (100, 100), # Large shape + (16, 16, 16), # Large 3D ] # Test case min/max values min_max_values = [ - (-1.0, 1.0), # Standard range - (0.0, 2.0), # Positive range - (-2.0, 0.0), # Negative range - (-1000.0, 1000.0), # Large range - (-0.001, 0.001), # Small range - (0.0, 0.0), # min=max + (-1.0, 1.0), # Standard range + (0.0, 2.0), # Positive range + (-2.0, 0.0), # Negative range + (-1000.0, 1000.0), # Large range + (-0.001, 0.001), # Small range + (0.0, 0.0), # min=max ] # Data types to test @@ -171,7 +184,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=None, y=y, y_shape=shape, - y_stride=None + y_stride=None, ) ) @@ -199,7 +212,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=row_stride + y_stride=row_stride, ) ) @@ -219,7 +232,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=col_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) @@ -239,7 +252,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) diff --git a/test/infiniop-test/test_generate/testcases/mul.py b/test/infiniop-test/test_generate/testcases/mul.py index 00c427bcb..ad4f6b806 100644 --- a/test/infiniop-test/test_generate/testcases/mul.py +++ b/test/infiniop-test/test_generate/testcases/mul.py @@ -2,30 +2,36 @@ import numpy as np import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) -def mul( - a: np.ndarray, - b: np.ndarray -): + +def mul(a: np.ndarray, b: np.ndarray): return np.multiply(a, b) + def random_tensor(shape, dtype): rate = 1e-3 var = 0.5 * rate # values fall in [-5e-4, 5e-4] return rate * np.random.rand(*shape).astype(dtype) - var + class MulTestCase(InfiniopTestCase): def __init__( self, a: np.ndarray, - shape_a: List[int] | None, + shape_a: List[int] | None, stride_a: List[int] | None, b: np.ndarray, - shape_b: List[int] | None, + shape_b: List[int] | None, stride_b: List[int] | None, c: np.ndarray, - shape_c: List[int] | None, + shape_c: List[int] | None, stride_c: List[int] | None, ): super().__init__("mul") @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), )
test_writer.add_tensor( @@ -68,7 +83,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) a_fp64 = self.a.astype(np.float64) b_fp64 = self.b.astype(np.float64) - + ans_fp64 = np.multiply(a_fp64, b_fp64) ans = mul(self.a, self.b) test_writer.add_tensor( @@ -80,7 +95,8 @@ def write_test(self, test_writer: "InfiniopTestWriter"): raw_dtype=np_dtype_to_ggml(ans_fp64.dtype), ) -if __name__ == '__main__': + +if __name__ == "__main__": test_writer = InfiniopTestWriter("mul.gguf") test_cases = [] @@ -96,16 +112,15 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)), ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None), ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)), - ] + ] _TENSOR_DTYPES_ = [np.float32, np.float16] - + for dtype in _TENSOR_DTYPES_: for shape, stride_a, stride_b, stride_c in _TEST_CASES_: a = random_tensor(shape, dtype) b = random_tensor(shape, dtype) c = np.empty(tuple(0 for _ in shape), dtype=dtype) - test_cases.append( MulTestCase( a=a, @@ -118,7 +133,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) - ) - + ) + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/rearrange.py b/test/infiniop-test/test_generate/testcases/rearrange.py index 9617a1fc0..3d3a0e73b 100644 --- a/test/infiniop-test/test_generate/testcases/rearrange.py +++ b/test/infiniop-test/test_generate/testcases/rearrange.py @@ -1,14 +1,21 @@ import torch from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + def row_major_strides(shape): """Generate the row-major strides of a tensor - + Args: shape: tensor shape - + Returns: list of row-major strides """ @@ -19,12 +26,13 @@ def row_major_strides(shape): strides.insert(0, stride) return strides + def column_major_strides(shape): """Generate the column-major strides of a tensor - + Args: shape: tensor shape - + Returns: list of column-major strides """ @@ -35,6 +43,7 @@ def column_major_strides(shape): strides.append(stride) return strides + def rearrange_using_torch(src: torch.Tensor, dst_strides: List[int]) -> torch.Tensor: """ Compute the reference rearrange result using torch @@ -66,27 +75,35 @@ def __init__( self.shape = shape self.src_strides = src_strides self.dst_strides = dst_strides - + def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) - + # write shape info if self.shape is not None: test_writer.add_array(test_writer.gguf_key("src.shape"), self.shape) test_writer.add_array(test_writer.gguf_key("dst.shape"), self.shape) - + # write strides info if self.src_strides is not None: - test_writer.add_array(test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides)) + test_writer.add_array( + test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides) + ) test_writer.add_array( test_writer.gguf_key("dst.strides"), - gguf_strides(*self.dst_strides if self.dst_strides is not None else contiguous_gguf_strides(self.shape)) + gguf_strides( + *( + self.dst_strides + if self.dst_strides is not None + else contiguous_gguf_strides(self.shape) + ) + ), ) - + # convert the torch tensors to numpy for writing to file src_numpy = self.src.detach().cpu().numpy() dst_numpy = self.dst.detach().cpu().numpy() - + # write tensor data test_writer.add_tensor( test_writer.gguf_key("src"), @@ -98,9 +115,13 @@ def write_test(self, test_writer: "InfiniopTestWriter"): dst_numpy, raw_dtype=np_dtype_to_ggml(dst_numpy.dtype), ) - + # compute and write the answer -
dst_strides_for_ans = self.dst_strides if self.dst_strides is not None else list(contiguous_gguf_strides(self.shape)) + dst_strides_for_ans = ( + self.dst_strides + if self.dst_strides is not None + else list(contiguous_gguf_strides(self.shape)) + ) ans_torch = rearrange_using_torch(self.src, dst_strides_for_ans) ans_numpy = ans_torch.detach().cpu().numpy() test_writer.add_tensor( @@ -109,6 +130,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): raw_dtype=np_dtype_to_ggml(src_numpy.dtype), ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("rearrange.gguf") test_cases = [] @@ -117,12 +139,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # (shape, src_stride, dst_stride) ((100, 100), (1, 100), (100, 1)), ((4, 4), (1, 4), (4, 1)), - ((4, 6, 64), (64, 4*64, 1), (6*64, 64, 1)), + ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)), ((2000, 2000), (1, 2000), (2000, 1)), ((2001, 2001), (1, 2001), (2001, 1)), ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)), - ((3, 4, 7, 53, 9), row_major_strides((3, 4, 7, 53, 9)), column_major_strides((3, 4, 7, 53, 9))), - ((3, 4, 50, 50, 5, 7), row_major_strides((3, 4, 50, 50, 5, 7)), column_major_strides((3, 4, 50, 50, 5, 7))), + ( + (3, 4, 7, 53, 9), + row_major_strides((3, 4, 7, 53, 9)), + column_major_strides((3, 4, 7, 53, 9)), + ), + ( + (3, 4, 50, 50, 5, 7), + row_major_strides((3, 4, 50, 50, 5, 7)), + column_major_strides((3, 4, 50, 50, 5, 7)), + ), ] _TENSOR_DTYPES_ = [torch.float32, torch.float16] @@ -132,7 +162,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): src = torch.rand(*shape, dtype=dtype) # create the destination tensor with the correct shape dst = torch.empty(shape, dtype=dtype) - + test_case = RearrangeTestCase( src=src, dst=dst, @@ -140,7 +170,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): src_strides=src_strides, dst_strides=dst_strides, ) - test_cases.append(test_case) + test_cases.append(test_case) test_writer.add_tests(test_cases) - test_writer.save() + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py new file mode 100644 index 000000000..868b8b3c4 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/reduce_max.py @@ -0,0 +1,124 @@ +import numpy as np +import gguf +from typing import List +from enum import Enum, auto + +from ..
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + + +def reduce_max(x, dim): + if isinstance(x, np.float64): + return x + return x.max(axis=dim, keepdims=True) + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +class ReduceMaxTestCase(InfiniopTestCase): + def __init__( + self, + x: np.ndarray, + y: np.ndarray, + shape_x: List[int] | None, + shape_y: List[int] | None, + stride_x: List[int] | None, + stride_y: List[int] | None, + dim: int = 0, + ): + super().__init__("reduce_max") + self.x = x + self.y = y + self.shape_x = shape_x + self.shape_y = shape_y + self.stride_x = stride_x + self.stride_y = stride_y + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim) + if self.shape_x is not None: + test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x) + if self.shape_y is not None: + test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y) + if self.stride_x is not None: + test_writer.add_array( + test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x) + ) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), + ) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("input"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), + self.y, + raw_dtype=np_dtype_to_ggml(self.y.dtype), + ) + ans = reduce_max(self.x.astype(np.float64), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("reduce_max.gguf") + test_cases = [] + # ============================================================================== + # Configuration + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), + ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), + ] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_: + x = random_tensor(shape_x, dtype) + y = np.empty(tuple(0 for _ in shape_y), dtype=dtype) + test_case = ReduceMaxTestCase( + x, + y, + shape_x, + shape_y, + stride_x, + stride_y, + dim, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py new file mode 100644 index 000000000..a452a8a89 --- /dev/null +++
b/test/infiniop-test/test_generate/testcases/reduce_mean.py @@ -0,0 +1,124 @@ +import numpy as np +import gguf +from typing import List +from enum import Enum, auto + +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + + +def reduce_mean(x, dim): + if isinstance(x, np.float64): + return x + return x.mean(axis=dim, keepdims=True) + + +def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: + return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + + +class ReduceMeanTestCase(InfiniopTestCase): + def __init__( + self, + x: np.ndarray, + y: np.ndarray, + shape_x: List[int] | None, + shape_y: List[int] | None, + stride_x: List[int] | None, + stride_y: List[int] | None, + dim: int = 0, + ): + super().__init__("reduce_mean") + self.x = x + self.y = y + self.shape_x = shape_x + self.shape_y = shape_y + self.stride_x = stride_x + self.stride_y = stride_y + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim) + if self.shape_x is not None: + test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x) + if self.shape_y is not None: + test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y) + if self.stride_x is not None: + test_writer.add_array( + test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x) + ) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), + ) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("input"), + self.x, + raw_dtype=np_dtype_to_ggml(self.x.dtype), + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), + self.y, + raw_dtype=np_dtype_to_ggml(self.y.dtype), + ) + ans = reduce_mean(self.x.astype(np.float64), self.dim) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("reduce_mean.gguf") + test_cases = [] + # ============================================================================== + # Configuration + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), + ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), + ] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_: + x = random_tensor(shape_x, dtype) + y = np.empty(tuple(0 for _ in shape_y), dtype=dtype) + test_case = ReduceMeanTestCase( + x, + y, + shape_x, + shape_y, + stride_x, + stride_y, + dim, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git
a/test/infiniop-test/test_generate/testcases/rms_norm.py b/test/infiniop-test/test_generate/testcases/rms_norm.py index 681ebafc4..9332c090a 100644 --- a/test/infiniop-test/test_generate/testcases/rms_norm.py +++ b/test/infiniop-test/test_generate/testcases/rms_norm.py @@ -1,11 +1,19 @@ import numpy as np from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) + def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray: return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001 + def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray: """ Compute the rms_norm reference result with numpy @@ -16,13 +24,14 @@ def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray: Returns: output tensor with the same shape as input """ - squared = x ** 2 + squared = x**2 mean = np.mean(squared, axis=-1, keepdims=True) rms = np.sqrt(mean + epsilon) - + normalized = x / rms return normalized * w + class RMSNormTestCase(InfiniopTestCase): def __init__( self, @@ -40,9 +49,9 @@ def __init__( self.y = y self.shape = shape self.epsilon = epsilon - self.x_strides=x_strides - self.y_strides=y_strides - + self.x_strides = x_strides + self.y_strides = y_strides + def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon) @@ -50,10 +59,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape) test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape) if self.x_strides is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape)) + gguf_strides( + *( + self.y_strides + if self.y_strides is not None + else contiguous_gguf_strides(self.shape) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -70,13 +87,16 @@ def write_test(self, test_writer: "InfiniopTestWriter"): self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype), ) - ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon) + ans = rms_norm( + self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon + ) test_writer.add_tensor( test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(np.float64), ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("rms_norm.gguf") test_cases = [] @@ -112,9 +132,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): shape=shape, x_strides=x_strides, y_strides=y_strides, - epsilon=epsilon + epsilon=epsilon, ) - test_cases.append(test_case) + test_cases.append(test_case) test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/rope.py b/test/infiniop-test/test_generate/testcases/rope.py index 85d9685dd..27f5a06db 100644 --- a/test/infiniop-test/test_generate/testcases/rope.py +++ b/test/infiniop-test/test_generate/testcases/rope.py @@ -4,11 +4,17 @@ from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from ..
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def rotary_embedding(t, sin, cos): - dh = t.shape[2] + dh = t.shape[2] assert dh % 2 == 0, "Embedding dimension must be even." t_even = t[..., 0::2] # [seq_len, n_head, dh // 2] @@ -30,7 +36,9 @@ def rotary_embedding(t, sin, cos): def sin_cos_table(pos, dim, theta, dtype): assert dim % 2 == 0, "Embedding dimension must be even." - freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim)) + freqs = 1.0 / ( + theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim) + ) angles = np.outer(pos, freqs) @@ -79,19 +87,33 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_tensor( - test_writer.gguf_key("pos_ids"), self.pos_ids, raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype) + test_writer.gguf_key("pos_ids"), + self.pos_ids, + raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("sin_table"), self.sin_table, raw_dtype=np_dtype_to_ggml(self.sin_table.dtype) + test_writer.gguf_key("sin_table"), + self.sin_table, + raw_dtype=np_dtype_to_ggml(self.sin_table.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("cos_table"), self.cos_table, raw_dtype=np_dtype_to_ggml(self.cos_table.dtype) + test_writer.gguf_key("cos_table"), + self.cos_table, + raw_dtype=np_dtype_to_ggml(self.cos_table.dtype), ) ans = rotary_embedding( self.x.astype(np.float64), @@ -103,8 +125,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) - - if __name__ == "__main__": # ============================================================================== # Configuration (Internal Use Only) @@ -130,7 +150,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): x = np.random.rand(*shape).astype(dtype) y = np.empty(tuple(0 for _ in shape), dtype=dtype) pos_ids = np.arange(0, x.shape[0], dtype=np.int32) - sin_table, cos_table = sin_cos_table(pos_ids, x.shape[2], theta=1e5, dtype=dtype) + sin_table, cos_table = sin_cos_table( + pos_ids, x.shape[2], theta=1e5, dtype=dtype + ) test_case = RoPETestCase( y=y, x=x, diff --git a/test/infiniop-test/test_generate/testcases/swiglu.py b/test/infiniop-test/test_generate/testcases/swiglu.py index cb692b613..aa3450fed 100644 --- a/test/infiniop-test/test_generate/testcases/swiglu.py +++ b/test/infiniop-test/test_generate/testcases/swiglu.py @@ -2,7 +2,14 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def swiglu( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("swiglu") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -47,14 +52,24 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_b is not None: test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) if self.shape_c is not None: - test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..72bf31f48 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -454,6 +454,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -489,3 +490,69 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + + +@OpRegister.operator +def reduce_mean_(lib): + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def reduce_max_(lib): + lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 + lib.infiniopCreateReduceMaxDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMax.restype = c_int32 + lib.infiniopReduceMax.argtypes = [ + infiniopOperatorDescriptor_t, + 
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32
+    lib.infiniopDestroyReduceMaxDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py
new file mode 100644
index 000000000..00c86f76a
--- /dev/null
+++ b/test/infiniop/reduce_max.py
@@ -0,0 +1,154 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # y_shape, x_shape, y_stride, x_stride, dim
+    ((), (), None, None, 0),
+    ((1,), (32,), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+]
+
+# x types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TEST_CASES = _TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_max(x, dim):
+    return x.max(dim=dim, keepdim=True)[0]
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Max on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_max(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMaxDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMaxWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_max():
+        check_error(
+            LIBINFINIOP.infiniopReduceMax(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_max()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_max(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_max(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMaxDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py
new file mode 100644
index 000000000..c097cd296
--- /dev/null
+++ b/test/infiniop/reduce_mean.py
@@ -0,0 +1,154 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # y_shape, x_shape, y_stride, x_stride, dim
+    ((), (), None, None, 0),
+    ((1,), (32,), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+]
+
+# x types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TEST_CASES = _TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_mean(x, dim):
+    return x.mean(dim=dim, keepdim=True)
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Mean on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_mean(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMeanDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMeanWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_mean():
+        check_error(
+            LIBINFINIOP.infiniopReduceMean(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_mean()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_mean(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_mean(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMeanDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
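
A minimal sketch (not part of the patch) of the reduction semantics the new tests verify: both operators reduce along a single axis `dim` and keep that axis with size 1, matching the `keepdim=True` torch references in the test files. The NumPy helper names below are illustrative only.

    import numpy as np

    def reduce_max_ref(x: np.ndarray, dim: int) -> np.ndarray:
        # mirrors x.max(dim=dim, keepdim=True)[0] in test/infiniop/reduce_max.py
        return np.max(x, axis=dim, keepdims=True)

    def reduce_mean_ref(x: np.ndarray, dim: int) -> np.ndarray:
        # mirrors x.mean(dim=dim, keepdim=True) in test/infiniop/reduce_mean.py
        return np.mean(x, axis=dim, keepdims=True)

    x = np.random.rand(16, 2048).astype(np.float32)
    assert reduce_max_ref(x, 1).shape == (16, 1)     # the ((16, 1), (16, 2048), ..., 1) case
    assert reduce_mean_ref(x, 0).shape == (1, 2048)  # the ((1, 2048), (16, 2048), ..., 0) case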