diff --git a/include/infiniop.h b/include/infiniop.h
index d51b8d92e..30a07e4b4 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -4,18 +4,27 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
 #include "infiniop/ops/attention.h"
+#include "infiniop/ops/cast.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
+#include "infiniop/ops/cos.h"
+#include "infiniop/ops/exp.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/hardswish.h"
+#include "infiniop/ops/leakyrelu.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
 #include "infiniop/ops/relu.h"
 #include "infiniop/ops/rms_norm.h"
 #include "infiniop/ops/rope.h"
+#include "infiniop/ops/sigmoid_backward.h"
+#include "infiniop/ops/sin.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
+#include "infiniop/ops/tanh.h"
+#include "infiniop/ops/where.h"
 #include "infiniop/tensor_descriptor.h"

 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/cast.h b/include/infiniop/ops/cast.h
new file mode 100644
index 000000000..81d771efe
--- /dev/null
+++ b/include/infiniop/ops/cast.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_CAST_API_H__
+#define __INFINIOP_CAST_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCastDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateCastDescriptor(infiniopHandle_t handle,
+                                                         infiniopCastDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t output,
+                                                         infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopCast(infiniopCastDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *output,
+                                         const void *input,
+                                         void *stream);
+
+__C __export infiniStatus_t infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/cos.h b/include/infiniop/ops/cos.h
new file mode 100644
index 000000000..aeb551e77
--- /dev/null
+++ b/include/infiniop/ops/cos.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_COS_API_H__
+#define __INFINIOP_COS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCosDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateCosDescriptor(infiniopHandle_t handle,
+                                                        infiniopCosDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopCos(infiniopCosDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *output,
+                                        const void *input,
+                                        void *stream);
+
+__C __export infiniStatus_t infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/exp.h b/include/infiniop/ops/exp.h
new file mode 100644
index 000000000..624bc5363
--- /dev/null
+++ b/include/infiniop/ops/exp.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_EXP_API_H__
+#define __INFINIOP_EXP_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopExpDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateExpDescriptor(infiniopHandle_t handle,
+                                                        infiniopExpDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t output,
+                                                        infiniopTensorDescriptor_t input);
+
+__C __export infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t
infiniopExp(infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/hardswish.h b/include/infiniop/ops/hardswish.h new file mode 100644 index 000000000..8d655fe82 --- /dev/null +++ b/include/infiniop/ops/hardswish.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_HARDSWISH_API_H__ +#define __INFINIOP_HARDSWISH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopHardswishDescriptor_t; + +__C __export infiniStatus_t infiniopCreateHardswishDescriptor(infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopHardswish(infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/leakyrelu.h b/include/infiniop/ops/leakyrelu.h new file mode 100644 index 000000000..adc46d1c6 --- /dev/null +++ b/include/infiniop/ops/leakyrelu.h @@ -0,0 +1,25 @@ +#ifndef __INFINIOP_LEAKYRELU_API_H__ +#define __INFINIOP_LEAKYRELU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLeakyreluDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLeakyreluDescriptor(infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + float negative_slope); + +__C __export infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sigmoid_backward.h b/include/infiniop/ops/sigmoid_backward.h new file mode 100644 index 000000000..abab0cde7 --- /dev/null +++ b/include/infiniop/ops/sigmoid_backward.h @@ -0,0 +1,26 @@ +#ifndef __INFINIOP_SIGMOID_BACKWARD_API_H__ +#define __INFINIOP_SIGMOID_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSigmoidBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSigmoidBackwardDescriptor(infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t grad_output); + +__C __export infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSigmoidBackward(infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream); + +__C __export infiniStatus_t infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/sin.h b/include/infiniop/ops/sin.h new file mode 100644 index 000000000..640deccc0 
--- /dev/null +++ b/include/infiniop/ops/sin.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_SIN_API_H__ +#define __INFINIOP_SIN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopSinDescriptor_t; + +__C __export infiniStatus_t infiniopCreateSinDescriptor(infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopSin(infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tanh.h b/include/infiniop/ops/tanh.h new file mode 100644 index 000000000..742dba860 --- /dev/null +++ b/include/infiniop/ops/tanh.h @@ -0,0 +1,24 @@ +#ifndef __INFINIOP_TANH_API_H__ +#define __INFINIOP_TANH_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTanhDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTanhDescriptor(infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input); + +__C __export infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTanh(infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/where.h b/include/infiniop/ops/where.h new file mode 100644 index 000000000..713db102f --- /dev/null +++ b/include/infiniop/ops/where.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_WHERE_API_H__ +#define __INFINIOP_WHERE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopWhereDescriptor_t; + +__C __export infiniStatus_t infiniopCreateWhereDescriptor(infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c, + infiniopTensorDescriptor_t a, + infiniopTensorDescriptor_t b, + infiniopTensorDescriptor_t condition); + +__C __export infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopWhere(infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream); + +__C __export infiniStatus_t infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..02fd65c63 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,15 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "exp.py", + "sin.py", + "cos.py", + "leakyrelu.py", + "tanh.py", + "sigmoid_backward.py", + "hardswish.py", + "cast.py", + "where.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..c3a120703 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,15 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) 
DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(exp) +DECLARE_INFINIOP_TEST(sin) +DECLARE_INFINIOP_TEST(cos) +DECLARE_INFINIOP_TEST(leakyrelu) +DECLARE_INFINIOP_TEST(tanh) +DECLARE_INFINIOP_TEST(sigmoid_backward) +DECLARE_INFINIOP_TEST(hardswish) +DECLARE_INFINIOP_TEST(cast) +DECLARE_INFINIOP_TEST(where) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -30,19 +39,28 @@ DECLARE_INFINIOP_TEST(sub) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(exp) \ + REGISTER_INFINIOP_TEST(sin) \ + REGISTER_INFINIOP_TEST(cos) \ + REGISTER_INFINIOP_TEST(leakyrelu) \ + REGISTER_INFINIOP_TEST(tanh) \ + REGISTER_INFINIOP_TEST(sigmoid_backward) \ + REGISTER_INFINIOP_TEST(hardswish) \ + REGISTER_INFINIOP_TEST(cast) \ + REGISTER_INFINIOP_TEST(where) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/cast.cpp b/src/infiniop-test/src/ops/cast.cpp new file mode 100644 index 000000000..6547bc25a --- /dev/null +++ b/src/infiniop-test/src/ops/cast.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cast { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCastDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + + CHECK_OR(infiniopCreateCastDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + 
size_t workspace_size; + CHECK_OR(infiniopGetCastWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCast(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCast( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cast diff --git a/src/infiniop-test/src/ops/cos.cpp b/src/infiniop-test/src/ops/cos.cpp new file mode 100644 index 000000000..52de283af --- /dev/null +++ b/src/infiniop-test/src/ops/cos.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cos { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopCosDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateCosDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetCosWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return 
TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopCos(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopCos( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::cos diff --git a/src/infiniop-test/src/ops/exp.cpp b/src/infiniop-test/src/ops/exp.cpp new file mode 100644 index 000000000..070f8ef6b --- /dev/null +++ b/src/infiniop-test/src/ops/exp.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::exp { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopExpDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateExpDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetExpWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopExp(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, 
_attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopExp( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::exp diff --git a/src/infiniop-test/src/ops/hardswish.cpp b/src/infiniop-test/src/ops/hardswish.cpp new file mode 100644 index 000000000..0ccf4f52a --- /dev/null +++ b/src/infiniop-test/src/ops/hardswish.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::hardswish { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopHardswishDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateHardswishDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetHardswishWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopHardswish(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopHardswish( + op_desc, 
workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::hardswish diff --git a/src/infiniop-test/src/ops/leakyrelu.cpp b/src/infiniop-test/src/ops/leakyrelu.cpp new file mode 100644 index 000000000..b7d9eb89c --- /dev/null +++ b/src/infiniop-test/src/ops/leakyrelu.cpp @@ -0,0 +1,123 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::leakyrelu { +struct Test::Attributes { + float negative_slope; + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (attributes.find("negative_slope") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->negative_slope = *reinterpret_cast(attributes["negative_slope"].data()); + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLeakyreluDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateLeakyreluDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + _attributes->negative_slope), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetLeakyreluWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + CHECK_OR(infiniopLeakyrelu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + 
[=]() { + infiniopLeakyrelu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"negative_slope"}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- negative_slope=" << _attributes->negative_slope << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::leakyrelu diff --git a/src/infiniop-test/src/ops/sigmoid_backward.cpp b/src/infiniop-test/src/ops/sigmoid_backward.cpp new file mode 100644 index 000000000..434dbf598 --- /dev/null +++ b/src/infiniop-test/src/ops/sigmoid_backward.cpp @@ -0,0 +1,122 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sigmoid_backward { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr grad_output; + std::shared_ptr grad_input; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("grad_output") == tensors.end() + || tensors.find("grad_input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->grad_input = tensors["grad_input"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSigmoidBackwardDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto grad_output = _attributes->grad_output->to(device, device_id); + auto grad_input = _attributes->grad_input->to(device, device_id); + CHECK_OR(infiniopCreateSigmoidBackwardDescriptor(handle, &op_desc, + grad_input->desc(), + input->desc(), + grad_output->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSigmoidBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSigmoidBackward(op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + 
+ try { + allClose(grad_input, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSigmoidBackward( + op_desc, workspace, workspace_size, + grad_input->data(), + input->data(), + grad_output->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "grad_output", "grad_input", "ans"}; +} + +std::vector Test::output_names() { + return {"grad_input"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- grad_input: " << _attributes->grad_input->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sigmoid_backward diff --git a/src/infiniop-test/src/ops/sin.cpp b/src/infiniop-test/src/ops/sin.cpp new file mode 100644 index 000000000..e1406e588 --- /dev/null +++ b/src/infiniop-test/src/ops/sin.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::sin { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopSinDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateSinDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetSinWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopSin(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return 
TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopSin( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::sin diff --git a/src/infiniop-test/src/ops/tanh.cpp b/src/infiniop-test/src/ops/tanh.cpp new file mode 100644 index 000000000..6f966de09 --- /dev/null +++ b/src/infiniop-test/src/ops/tanh.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tanh { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("input") == tensors.end() + || tensors.find("output") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->input->ggml_type(); + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-2; + test->_atol = 1e-2; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-3; + test->_atol = 1e-3; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-6; + test->_atol = 1e-6; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTanhDescriptor_t op_desc; + auto input = _attributes->input->to(device, device_id); + auto output = _attributes->output->to(device, device_id); + CHECK_OR(infiniopCreateTanhDescriptor(handle, &op_desc, + output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTanhWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTanh(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTanh( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return 
TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} +} // namespace infiniop_test::tanh diff --git a/src/infiniop-test/src/ops/where.cpp b/src/infiniop-test/src/ops/where.cpp new file mode 100644 index 000000000..fea9cba92 --- /dev/null +++ b/src/infiniop-test/src/ops/where.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::where { +struct Test::Attributes { + std::shared_ptr a; + std::shared_ptr b; + std::shared_ptr condition; + std::shared_ptr c; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("a") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("condition") == tensors.end() + || tensors.find("c") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->a = tensors["a"]; + test->_attributes->b = tensors["b"]; + test->_attributes->condition = tensors["condition"]; + test->_attributes->c = tensors["c"]; + test->_attributes->ans = tensors["ans"]; + + auto elemType = test->_attributes->a->ggml_type(); + if (elemType == GGML_TYPE_I8) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I32) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_I64) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + if (elemType == GGML_TYPE_F16) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F32) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_F64) { + test->_rtol = 1e-7; + test->_atol = 1e-7; + } + if (elemType == GGML_TYPE_BF16) { + test->_rtol = 1e-5; + test->_atol = 1e-5; + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopWhereDescriptor_t op_desc; + auto a = _attributes->a->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + auto condition = _attributes->condition->to(device, device_id); + auto c = _attributes->c->to(device, device_id); + CHECK_OR(infiniopCreateWhereDescriptor(handle, &op_desc, + c->desc(), + a->desc(), + b->desc(), + condition->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetWhereWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopWhere(op_desc, workspace, workspace_size, + c->data(), + 
a->data(), + b->data(), + condition->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(c, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopWhere( + op_desc, workspace, workspace_size, + c->data(), + a->data(), + b->data(), + condition->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"a", "b", "condition", "c", "ans"}; +} + +std::vector Test::output_names() { + return {"c"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- a: " << _attributes->a->info() << std::endl; + oss << "- b: " << _attributes->b->info() << std::endl; + oss << "- condition: " << _attributes->condition->info() << std::endl; + oss << "- c: " << _attributes->c->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::where diff --git a/src/infiniop/ops/cast/cast.h b/src/infiniop/ops/cast/cast.h new file mode 100644 index 000000000..5e66997cc --- /dev/null +++ b/src/infiniop/ops/cast/cast.h @@ -0,0 +1,48 @@ +#ifndef __CAST_H__ +#define __CAST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cast::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + CastInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + CastInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __CAST_H__ diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.cc b/src/infiniop/ops/cast/cpu/cast_cpu.cc new file mode 100644 index 000000000..36d2e9e28 --- /dev/null +++ b/src/infiniop/ops/cast/cpu/cast_cpu.cc @@ -0,0 +1,144 @@ +#include "cast_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::cast::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + + *desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline void cpu_cast_impl_incremental( + void *output, const void *input, const 
op::cast::CastInfo &info) {
+
+    const size_t ndim = info.shape.size();
+    const size_t n = info.n;
+
+    auto out_base = reinterpret_cast<Tout *>(output);
+    auto in_base = reinterpret_cast<const Tin *>(input);
+
+    const std::vector<size_t> &shape = info.shape;
+    const std::vector<ptrdiff_t> &in_stride = info.in_stride;
+    const std::vector<ptrdiff_t> &out_stride = info.out_stride;
+
+    if (n == 0) {
+        return;
+    }
+
+    std::vector<size_t> idx(ndim, 0);
+    ptrdiff_t in_off = 0;
+    ptrdiff_t out_off = 0;
+
+    for (size_t it = 0; it < n; ++it) {
+        const Tin *in_elem = in_base + in_off;
+        Tout *out_elem = out_base + out_off;
+        *out_elem = utils::cast<Tout>(*in_elem);
+
+        for (int d = static_cast<int>(ndim) - 1; d >= 0; --d) {
+            idx[d] += 1;
+            if (in_stride[d] != 0) {
+                in_off += in_stride[d];
+            }
+            if (out_stride[d] != 0) {
+                out_off += out_stride[d];
+            }
+
+            if (idx[d] < shape[d]) {
+                break;
+            } else {
+                idx[d] = 0;
+                if (in_stride[d] != 0) {
+                    in_off -= static_cast<ptrdiff_t>(shape[d]) * in_stride[d];
+                }
+                if (out_stride[d] != 0) {
+                    out_off -= static_cast<ptrdiff_t>(shape[d]) * out_stride[d];
+                }
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    if (output == const_cast<void *>(input)) {
+        return INFINI_STATUS_BAD_PARAM; // or INFINI_STATUS_INPLACE_NOT_SUPPORTED
+    }
+
+#define CASE_OUT(DT_OUT, TOUT)                                               \
+    case DT_OUT: {                                                           \
+        switch (_info.dt_in) {                                               \
+        case INFINI_DTYPE_I32:                                               \
+            cpu_cast_impl_incremental<TOUT, int32_t>(output, input, _info);  \
+            break;                                                           \
+        case INFINI_DTYPE_I64:                                               \
+            cpu_cast_impl_incremental<TOUT, int64_t>(output, input, _info);  \
+            break;                                                           \
+        case INFINI_DTYPE_U32:                                               \
+            cpu_cast_impl_incremental<TOUT, uint32_t>(output, input, _info); \
+            break;                                                           \
+        case INFINI_DTYPE_U64:                                               \
+            cpu_cast_impl_incremental<TOUT, uint64_t>(output, input, _info); \
+            break;                                                           \
+        case INFINI_DTYPE_F16:                                               \
+            cpu_cast_impl_incremental<TOUT, fp16_t>(output, input, _info);   \
+            break;                                                           \
+        case INFINI_DTYPE_F32:                                               \
+            cpu_cast_impl_incremental<TOUT, float>(output, input, _info);    \
+            break;                                                           \
+        case INFINI_DTYPE_F64:                                               \
+            cpu_cast_impl_incremental<TOUT, double>(output, input, _info);   \
+            break;                                                           \
+        default:                                                             \
+            return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;                  \
+        }                                                                    \
+        break;                                                               \
+    }
+
+    switch (_info.dt_out) {
+        CASE_OUT(INFINI_DTYPE_I32, int32_t);
+        CASE_OUT(INFINI_DTYPE_I64, int64_t);
+        CASE_OUT(INFINI_DTYPE_U32, uint32_t);
+        CASE_OUT(INFINI_DTYPE_U64, uint64_t);
+        CASE_OUT(INFINI_DTYPE_F16, fp16_t);
+        CASE_OUT(INFINI_DTYPE_F32, float);
+        CASE_OUT(INFINI_DTYPE_F64, double);
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CASE_OUT
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::cast::cpu
diff --git a/src/infiniop/ops/cast/cpu/cast_cpu.h b/src/infiniop/ops/cast/cpu/cast_cpu.h
new file mode 100644
index 000000000..ca929a694
--- /dev/null
+++ b/src/infiniop/ops/cast/cpu/cast_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __CAST_CPU_H__
+#define __CAST_CPU_H__
+
+#include "../cast.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __CAST_CPU_H__
diff --git a/src/infiniop/ops/cast/cuda/kernel.cuh b/src/infiniop/ops/cast/cuda/kernel.cuh
new file mode 100644
index 000000000..3736442a3
--- /dev/null
+++ b/src/infiniop/ops/cast/cuda/kernel.cuh
@@ -0,0 +1,79 @@
+#ifndef __CAST_CUDA_KERNEL_CUH__
+#define __CAST_CUDA_KERNEL_CUH__
+
+#include <cuda_fp16.h>
+#include <cstddef>
+#include <type_traits>
+
+template <typename Tout, typename Tin>
+__device__ __forceinline__ Tout device_cast(const Tin &v) {
+    if constexpr (std::is_same_v<Tout, half>) {
+        float f;
+        if constexpr (std::is_same_v<Tin, half>) {
+            f = __half2float(v);
+        } else {
+            f = static_cast<float>(v);
+        }
+        return __float2half_rn(f);
+    } else if constexpr (std::is_same_v<Tout, float>) {
+        if constexpr (std::is_same_v<Tin, half>) {
+            return __half2float(v);
+        } else {
+            return static_cast<float>(v);
+        }
+    } else if constexpr (std::is_same_v<Tout, double>) {
+        if constexpr (std::is_same_v<Tin, half>) {
+            return static_cast<double>(__half2float(v));
+        } else {
+            return static_cast<double>(v);
+        }
+    } else { // integer outputs
+        // convert via double/float then to integer (truncate)
+        if constexpr (std::is_same_v<Tin, half>) {
+            float f = __half2float(v);
+            return static_cast<Tout>(f);
+        } else {
+            return static_cast<Tout>(v);
+        }
+    }
+}
+
+template <typename ToutDev, typename TinDev>
+__global__ void cast_kernel(
+    ToutDev *__restrict__ out,
+    const TinDev *__restrict__ in,
+    size_t n,
+    const size_t *__restrict__ shape,
+    const size_t *__restrict__ div,
+    const long long *__restrict__ in_stride,
+    const long long *__restrict__ out_stride,
+    int ndim) {
+
+    size_t gid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    size_t grid_stride = static_cast<size_t>(blockDim.x) * gridDim.x;
+
+    for (size_t linear = gid; linear < n; linear += grid_stride) {
+        unsigned long long rem = linear;
+        long long in_off = 0;
+        long long out_off = 0;
+        for (int d = 0; d < ndim; ++d) {
+            unsigned long long idx_d = 0;
+            size_t divisor = div[d];
+            if (divisor != 0) {
+                idx_d = rem / divisor;
+                rem = rem % divisor;
+            } else {
+                idx_d = 0;
+            }
+            if (in_stride[d] != 0) {
+                in_off += static_cast<long long>(idx_d) * in_stride[d];
+            }
+            if (out_stride[d] != 0) {
+                out_off += static_cast<long long>(idx_d) * out_stride[d];
+            }
+        }
+        out[static_cast<size_t>(out_off)] = device_cast<ToutDev, TinDev>(in[static_cast<size_t>(in_off)]);
+    }
+}
+
+#endif // __CAST_CUDA_KERNEL_CUH__
diff --git a/src/infiniop/ops/cast/info.h b/src/infiniop/ops/cast/info.h
new file mode 100644
index 000000000..8f85f6da8
--- /dev/null
+++ b/src/infiniop/ops/cast/info.h
@@ -0,0 +1,60 @@
+#ifndef __CAST_INFO_H__
+#define __CAST_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::cast {
+
+class CastInfo {
+    CastInfo() = default;
+
+public:
+    infiniDtype_t dt_in;
+    infiniDtype_t dt_out;
+    std::vector<size_t> shape;
+    std::vector<ptrdiff_t> in_stride;
+    std::vector<ptrdiff_t> out_stride;
+    size_t n;
+
+    static utils::Result<CastInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t in_desc) {
+
+        auto dt_out = out_desc->dtype();
+        auto dt_in = in_desc->dtype();
+
+        CHECK_DTYPE(dt_in,
+                    INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                    INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                    INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+        CHECK_DTYPE(dt_out,
+                    INFINI_DTYPE_I32, INFINI_DTYPE_I64,
+                    INFINI_DTYPE_U32, INFINI_DTYPE_U64,
+                    INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+        CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE);
+        for (size_t i = 0; i < out_desc->ndim(); ++i) {
+            CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE);
+        }
+
+        size_t n = 1;
+        for (size_t i = 0; i < in_desc->ndim(); ++i) {
+            n *= static_cast<size_t>(in_desc->dim(i));
+        }
+
+        return utils::Result<CastInfo>(CastInfo{
+            dt_in,
+            dt_out,
+            out_desc->shape(),
+            in_desc->strides(),
+            out_desc->strides(),
+            n,
+        });
+    }
+};
+
+} // namespace op::cast
+
+#endif // __CAST_INFO_H__
diff --git a/src/infiniop/ops/cast/metax/cast_metax.h b/src/infiniop/ops/cast/metax/cast_metax.h
new file mode 100644
index 000000000..5ba92911e
--- /dev/null
+++ b/src/infiniop/ops/cast/metax/cast_metax.h
@@ -0,0 +1,8 @@
+#ifndef __CAST_METAX_API_H__
+#define __CAST_METAX_API_H__
+
+#include "../cast.h"
+
+DESCRIPTOR(metax)
+
+#endif // __CAST_METAX_API_H__
diff --git a/src/infiniop/ops/cast/metax/cast_metax.maca b/src/infiniop/ops/cast/metax/cast_metax.maca
new file mode 100644
index 000000000..4b2103da3
--- /dev/null
+++ b/src/infiniop/ops/cast/metax/cast_metax.maca
@@
-0,0 +1,201 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../cast.h" +#include "cast_metax.h" +#include "../info.h" + +namespace op::cast::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevTout = typename MapHcType::Type; + using DevTin = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + + err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / 
bs); + if (grid <= 0) grid = 1; + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + + #define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + metax_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + + #undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::metax diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cu b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu new file mode 100644 index 000000000..8e7eea473 --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cu @@ -0,0 +1,239 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../cast.h" +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "cast_nvidia.cuh" +#include +#include +#include +#include + +namespace op::cast::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc) { + auto handle = reinterpret_cast(handle_); + + auto info_r = CastInfo::create(out_desc, in_desc); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_cast_impl_incremental( + void *output_, const void *input_, + const op::cast::CastInfo &info, + void *stream_) { + + 
int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevTout = typename MapCudaType::Type; + using DevTin = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) { + bs = 256; + } + } + } else { + if (bs > 256) { + bs = 256; + } + } + + if (bs <= 0) { + bs = 256; + } + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) { + grid = 1; + } + + cast_kernel<<>>( + out_dev, in_dev, info.n, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + goto cleanup; + } + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) { + goto cleanup; + } + + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (output == const_cast(input)) { + return INFINI_STATUS_BAD_PARAM; + } + +#define CASE_OUT(DT_OUT, TOUT) \ + case DT_OUT: { \ + switch (_info.dt_in) { \ + case INFINI_DTYPE_I32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_I64: \ + 
cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_U64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F16: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F32: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + case INFINI_DTYPE_F64: \ + cuda_cast_impl_incremental(output, input, _info, stream); \ + break; \ + default: \ + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; \ + } \ + break; \ + } + + switch (_info.dt_out) { + CASE_OUT(INFINI_DTYPE_I32, int32_t); + CASE_OUT(INFINI_DTYPE_I64, int64_t); + CASE_OUT(INFINI_DTYPE_U32, uint32_t); + CASE_OUT(INFINI_DTYPE_U64, uint64_t); + CASE_OUT(INFINI_DTYPE_F16, fp16_t); + CASE_OUT(INFINI_DTYPE_F32, float); + CASE_OUT(INFINI_DTYPE_F64, double); + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CASE_OUT + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::cast::nvidia diff --git a/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh new file mode 100644 index 000000000..032e1fb2e --- /dev/null +++ b/src/infiniop/ops/cast/nvidia/cast_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CAST_CUDA_API_H__ +#define __CAST_CUDA_API_H__ + +#include "../cast.h" + +DESCRIPTOR(nvidia) + +#endif // __CAST_CUDA_API_H__ diff --git a/src/infiniop/ops/cast/operator.cc b/src/infiniop/ops/cast/operator.cc new file mode 100644 index 000000000..12d26953b --- /dev/null +++ b/src/infiniop/ops/cast/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cast.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cast_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cast_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cast_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCastDescriptor( + infiniopHandle_t handle, + infiniopCastDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cast::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCastWorkspaceSize(infiniopCastDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCast( + infiniopCastDescriptor_t desc, + void *workspace, + 
size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCastDescriptor(infiniopCastDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.cc b/src/infiniop/ops/cos/cpu/cos_cpu.cc new file mode 100644 index 000000000..f5d27ec49 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.cc @@ -0,0 +1,52 @@ +#include "cos_cpu.h" + +namespace op::cos::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::cpu diff --git a/src/infiniop/ops/cos/cpu/cos_cpu.h b/src/infiniop/ops/cos/cpu/cos_cpu.h new file mode 100644 index 000000000..af324eb80 --- /dev/null +++ b/src/infiniop/ops/cos/cpu/cos_cpu.h @@ -0,0 +1,21 @@ +#ifndef __COS_CPU_H__ +#define __COS_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(cos, cpu) + +namespace op::cos::cpu { +typedef struct CosOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::cos(input); + } +} CosOp; +} // namespace op::cos::cpu + +#endif // __COS_CPU_H__ diff --git 
a/src/infiniop/ops/cos/cuda/kernel.cuh b/src/infiniop/ops/cos/cuda/kernel.cuh new file mode 100644 index 000000000..5db7ee8f6 --- /dev/null +++ b/src/infiniop/ops/cos/cuda/kernel.cuh @@ -0,0 +1,48 @@ +#ifndef __COS_CUDA_H__ +#define __COS_CUDA_H__ + +#include +#include +#include + +namespace op::cos::cuda { +typedef struct CosOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + auto cos_f32 = [] __device__(float x) { + double xd = static_cast(x); + double yd = std::cos(xd); + return static_cast(yd); + }; + + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + cos_f32(vf.x), + cos_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = cos_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rz(cos_f32(f0), cos_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(cos_f32(xf)); + } else if constexpr (std::is_same_v) { + return cos_f32(input); + } else if constexpr (std::is_same_v) { + return std::cos(input); + } else { + return std::cos(input); + } + } +} CosOp; +} // namespace op::cos::cuda + +#endif // __COS_CUDA_H__ diff --git a/src/infiniop/ops/cos/metax/cos_metax.h b/src/infiniop/ops/cos/metax/cos_metax.h new file mode 100644 index 000000000..a98fa3211 --- /dev/null +++ b/src/infiniop/ops/cos/metax/cos_metax.h @@ -0,0 +1,8 @@ +#ifndef __COS_METAX_API_H__ +#define __COS_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(cos, metax) + +#endif // __COS_METAX_API_H__ diff --git a/src/infiniop/ops/cos/metax/cos_metax.maca b/src/infiniop/ops/cos/metax/cos_metax.maca new file mode 100644 index 000000000..144db47ef --- /dev/null +++ b/src/infiniop/ops/cos/metax/cos_metax.maca @@ -0,0 +1,60 @@ +#include "cos_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cos::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return 
_device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::metax diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cu b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu new file mode 100644 index 000000000..433363c91 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "cos_nvidia.cuh" + +namespace op::cos::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::CosOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::CosOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::CosOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::CosOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::cos::nvidia diff --git a/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh new file mode 100644 index 000000000..f6c350dd6 --- /dev/null +++ b/src/infiniop/ops/cos/nvidia/cos_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __COS_CUDA_API_H__ +#define __COS_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(cos, nvidia) + +#endif // __COS_CUDA_API_H__ diff --git a/src/infiniop/ops/cos/operator.cc b/src/infiniop/ops/cos/operator.cc new file mode 100644 index 000000000..71a5f807c --- /dev/null +++ b/src/infiniop/ops/cos/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cos.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cos_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cos_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cos_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCosDescriptor( + infiniopHandle_t handle, + infiniopCosDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + 
return op::cos::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCosWorkspaceSize(infiniopCosDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopCos( + infiniopCosDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyCosDescriptor(infiniopCosDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.cc b/src/infiniop/ops/exp/cpu/exp_cpu.cc new file mode 100644 index 000000000..58a6d0f2d --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.cc @@ -0,0 +1,52 @@ +#include "exp_cpu.h" + +namespace op::exp::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + 
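// [Editor's sketch, not part of the patch] Descriptor::calculate below only
// dispatches on the dtype validated in create(); the element loop itself lives in
// the shared CPU elementwise backend, which applies the ExpOp functor declared in
// exp_cpu.h. For a contiguous F32 tensor this conceptually reduces to:
//
//     // hypothetical, simplified view of the elementwise CPU path
//     void exp_f32(float *out, const float *in, size_t n) {
//         op::exp::cpu::ExpOp op;        // unary functor, num_inputs == 1
//         for (size_t i = 0; i < n; ++i) {
//             out[i] = op(in[i]);        // std::exp(in[i])
//         }
//     }
//
// The real backend additionally walks strides and handles reduced-precision
// (F16/BF16) element access; those details live in elementwise_cpu.h, not here.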
+infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::cpu diff --git a/src/infiniop/ops/exp/cpu/exp_cpu.h b/src/infiniop/ops/exp/cpu/exp_cpu.h new file mode 100644 index 000000000..867c7afa5 --- /dev/null +++ b/src/infiniop/ops/exp/cpu/exp_cpu.h @@ -0,0 +1,21 @@ +#ifndef __EXP_CPU_H__ +#define __EXP_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(exp, cpu) + +namespace op::exp::cpu { +typedef struct ExpOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::exp(input); + } +} ExpOp; +} // namespace op::exp::cpu + +#endif // __EXP_CPU_H__ diff --git a/src/infiniop/ops/exp/cuda/kernel.cuh b/src/infiniop/ops/exp/cuda/kernel.cuh new file mode 100644 index 000000000..12446f31a --- /dev/null +++ b/src/infiniop/ops/exp/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __EXP_CUDA_H__ +#define __EXP_CUDA_H__ + +#include +#include +#include + +namespace op::exp::cuda { +typedef struct ExpOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__expf(vf.x), __expf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__expf(f0), __expf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__expf(inputf)); + } else if constexpr (std::is_same_v) { + return __expf(input); + } else if constexpr (std::is_same_v) { + return std::exp(input); + } else { + return std::exp(input); + } + } +} ExpOp; +} // namespace op::exp::cuda + +#endif // __EXP_CUDA_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.h b/src/infiniop/ops/exp/metax/exp_metax.h new file mode 100644 index 000000000..fb10faf9b --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.h @@ -0,0 +1,8 @@ +#ifndef __EXP_METAX_API_H__ +#define __EXP_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(exp, metax) + +#endif // __EXP_METAX_API_H__ diff --git a/src/infiniop/ops/exp/metax/exp_metax.maca b/src/infiniop/ops/exp/metax/exp_metax.maca new file mode 100644 index 000000000..c71703c6d --- /dev/null +++ b/src/infiniop/ops/exp/metax/exp_metax.maca @@ -0,0 +1,60 @@ +#include "exp_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::exp::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector 
input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::metax diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cu b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu new file mode 100644 index 000000000..3bdf2eb45 --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "exp_nvidia.cuh" + +namespace op::exp::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::ExpOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::ExpOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::ExpOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::ExpOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::exp::nvidia diff --git a/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh 
b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh new file mode 100644 index 000000000..7545e8f3e --- /dev/null +++ b/src/infiniop/ops/exp/nvidia/exp_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __EXP_CUDA_API_H__ +#define __EXP_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(exp, nvidia) + +#endif // __EXP_CUDA_API_H__ diff --git a/src/infiniop/ops/exp/operator.cc b/src/infiniop/ops/exp/operator.cc new file mode 100644 index 000000000..ee1dc6768 --- /dev/null +++ b/src/infiniop/ops/exp/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/exp.h" + +#ifdef ENABLE_CPU_API +#include "cpu/exp_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/exp_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/exp_metax.h" +#endif + +__C infiniStatus_t infiniopCreateExpDescriptor( + infiniopHandle_t handle, + infiniopExpDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::exp::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetExpWorkspaceSize(infiniopExpDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopExp( + infiniopExpDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyExpDescriptor(infiniopExpDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef 
ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc new file mode 100644 index 000000000..e7b68508a --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.cc @@ -0,0 +1,52 @@ +#include "hardswish_cpu.h" + +namespace op::hardswish::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::cpu diff --git a/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h new file mode 100644 index 000000000..e137be8a0 --- /dev/null +++ b/src/infiniop/ops/hardswish/cpu/hardswish_cpu.h @@ -0,0 +1,30 @@ +#ifndef __HARDSWISH_CPU_H__ +#define __HARDSWISH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(hardswish, cpu) + +namespace op::hardswish::cpu { +typedef struct HardswishOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + if constexpr (std::is_integral_v) { + return static_cast(0); + } else { + // x * clamp(x + 3, 0, 6) / 6 + auto x = static_cast(input); + double y = x + 3.0; + y = std::min(std::max(y, 0.0), 6.0); + double out = x * (y / 6.0); + return static_cast(out); + } + } +} HardswishOp; +} // namespace op::hardswish::cpu + +#endif // __HARDSWISH_CPU_H__ diff --git a/src/infiniop/ops/hardswish/cuda/kernel.cuh b/src/infiniop/ops/hardswish/cuda/kernel.cuh new file mode 100644 index 000000000..d5b369bce --- /dev/null +++ b/src/infiniop/ops/hardswish/cuda/kernel.cuh @@ -0,0 +1,55 @@ +#ifndef __HARDSWISH_CUDA_H__ +#define __HARDSWISH_CUDA_H__ + +#include +#include +#include + +namespace op::hardswish::cuda { + +typedef struct HardswishOp { + static constexpr size_t num_inputs = 1; + + // Hardswish: f(x) = x * clamp(x + 3, 0, 6) / 6 + __device__ __forceinline__ float hswish_f32(float x) const { + float y = x + 3.0f; + y = y < 0.0f ? 0.0f : (y > 6.0f ? 
6.0f : y); + return x * (y * (1.0f / 6.0f)); + } + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2( + hswish_f32(vf.x), + hswish_f32(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = hswish_f32(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(hswish_f32(f0), hswish_f32(f1)); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + return __float2bfloat16_rz(hswish_f32(xf)); + } else if constexpr (std::is_same_v) { + return hswish_f32(input); + } else if constexpr (std::is_same_v) { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } else { + double xd = static_cast(input); + double yd = xd * (std::fmin(std::fmax(xd + 3.0, 0.0), 6.0) / 6.0); + return static_cast(yd); + } + } +} HardswishOp; + +} // namespace op::hardswish::cuda + +#endif // __HARDSWISH_CUDA_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.h b/src/infiniop/ops/hardswish/metax/hardswish_metax.h new file mode 100644 index 000000000..16b131aa9 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.h @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_METAX_API_H__ +#define __HARDSWISH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(hardswish, metax) + +#endif // __HARDSWISH_METAX_API_H__ diff --git a/src/infiniop/ops/hardswish/metax/hardswish_metax.maca b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca new file mode 100644 index 000000000..e53b94357 --- /dev/null +++ b/src/infiniop/ops/hardswish/metax/hardswish_metax.maca @@ -0,0 +1,60 @@ +#include "hardswish_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::hardswish::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case 
INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::metax diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu new file mode 100644 index 000000000..9e279c2ef --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "hardswish_nvidia.cuh" + +namespace op::hardswish::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::HardswishOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::HardswishOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::HardswishOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::HardswishOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::hardswish::nvidia diff --git a/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh new file mode 100644 index 000000000..f869ad52f --- /dev/null +++ b/src/infiniop/ops/hardswish/nvidia/hardswish_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HARDSWISH_CUDA_API_H__ +#define __HARDSWISH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(hardswish, nvidia) + +#endif // __HARDSWISH_CUDA_API_H__ diff --git a/src/infiniop/ops/hardswish/operator.cc b/src/infiniop/ops/hardswish/operator.cc new file mode 100644 index 000000000..e8ba19fc1 --- /dev/null +++ b/src/infiniop/ops/hardswish/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/hardswish.h" + +#ifdef ENABLE_CPU_API +#include "cpu/hardswish_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/hardswish_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/hardswish_metax.h" +#endif + +__C infiniStatus_t infiniopCreateHardswishDescriptor( + infiniopHandle_t handle, + infiniopHardswishDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::hardswish::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetHardswishWorkspaceSize(infiniopHardswishDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopHardswish( + infiniopHardswishDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyHardswishDescriptor(infiniopHardswishDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc new file mode 100644 index 000000000..c10a44cb5 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.cc @@ -0,0 +1,114 @@ +#include "leakyrelu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../info.h" +#include "infinicore.h" +#include + +namespace op::leakyrelu::cpu { + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + + 
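// [Editor's note] The CPU backend keeps no device-handle state and needs no
// scratch memory, so the descriptor constructed below carries a zero workspace
// size and a null Opaque pointer. The operator computed later is the usual
// LeakyReLU,
//     f(x) = x                    for x >= 0
//     f(x) = negative_slope * x   for x <  0
// with the slope captured inside LeakyReLUInfo at creation time.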
*desc_ptr = new Descriptor( + info_r.take(), + 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { return _min_workspace_size; } + +template +static inline void cpu_leakyrelu_impl_incremental( + void *output, const void *input, const op::leakyrelu::LeakyReLUInfo &info) { + + const size_t ndim = info.shape.size(); + const size_t n = info.n; + + if (n == 0) { + return; + } + + auto out_base = reinterpret_cast(output); + auto in_base = reinterpret_cast(input); + + const std::vector &shape = info.shape; + const std::vector &in_stride = info.in_stride; + const std::vector &out_stride = info.out_stride; + + std::vector idx(ndim, 0); + ptrdiff_t in_off = 0; + ptrdiff_t out_off = 0; + + for (size_t it = 0; it < n; ++it) { + const T *in_elem = in_base + in_off; + T *out_elem = out_base + out_off; + + float v = utils::cast(*in_elem); + float outv = v >= 0.0f ? v : v * info.negative_slope; + *out_elem = utils::cast(outv); + for (int d = static_cast(ndim) - 1; d >= 0; --d) { + idx[d] += 1; + if (in_stride[d] != 0) { + in_off += in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += out_stride[d]; + } + + if (idx[d] < shape[d]) { + break; + } else { + idx[d] = 0; + if (in_stride[d] != 0) { + in_off -= static_cast(shape[d]) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off -= static_cast(shape[d]) * out_stride[d]; + } + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_BF16: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F32: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + case INFINI_DTYPE_F64: + cpu_leakyrelu_impl_incremental(output, input, _info); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::leakyrelu::cpu diff --git a/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h new file mode 100644 index 000000000..e58ca1409 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cpu/leakyrelu_cpu.h @@ -0,0 +1,7 @@ +#ifndef __LEAKYRELU_CPU_H__ +#define __LEAKYRELU_CPU_H__ +#include "../leakyrelu.h" + +DESCRIPTOR(cpu) + +#endif // __LEAKYRELU_CPU_H__ diff --git a/src/infiniop/ops/leakyrelu/cuda/kernel.cuh b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh new file mode 100644 index 000000000..afca17002 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/cuda/kernel.cuh @@ -0,0 +1,73 @@ +#ifndef __LEAKYRELU_CUDA_KERNEL_CUH__ +#define __LEAKYRELU_CUDA_KERNEL_CUH__ + +#include +#include +#include +#include + +template +__device__ __forceinline__ float to_float_for_leaky(const DevT &v) { + if constexpr (std::is_same_v) { + return __half2float(v); + } else if constexpr (std::is_same_v) { + return __bfloat162float(v); + } else { + return static_cast(v); + } +} + +template +__device__ __forceinline__ DevT from_float_for_leaky(float f) { + if constexpr (std::is_same_v) { + return __float2half_rn(f); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(f); + } else { + return static_cast(f); + } +} + +template +__global__ void leakyrelu_kernel( + DevT *__restrict__ out, + const DevT *__restrict__ in, + size_t n, + float negative_slope, + const size_t *__restrict__ shape, + const size_t *__restrict__ div, + const long 
long *__restrict__ in_stride, + const long long *__restrict__ out_stride, + int ndim) { + + size_t gid = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + size_t grid_stride = static_cast(blockDim.x) * gridDim.x; + + for (size_t linear = gid; linear < n; linear += grid_stride) { + unsigned long long rem = linear; + long long in_off = 0; + long long out_off = 0; + for (int d = 0; d < ndim; ++d) { + unsigned long long idx_d = 0; + size_t divisor = div[d]; + if (divisor != 0) { + idx_d = rem / divisor; + rem = rem % divisor; + } else { + idx_d = 0; + } + if (in_stride[d] != 0) { + in_off += static_cast(idx_d) * in_stride[d]; + } + if (out_stride[d] != 0) { + out_off += static_cast(idx_d) * out_stride[d]; + } + } + + float v = to_float_for_leaky(in[static_cast(in_off)]); + float outv = v >= 0.0f ? v : v * negative_slope; + out[static_cast(out_off)] = from_float_for_leaky(outv); + } +} + +#endif // __LEAKYRELU_CUDA_KERNEL_CUH__ diff --git a/src/infiniop/ops/leakyrelu/info.h b/src/infiniop/ops/leakyrelu/info.h new file mode 100644 index 000000000..1f074d85a --- /dev/null +++ b/src/infiniop/ops/leakyrelu/info.h @@ -0,0 +1,53 @@ +#ifndef __LEAKYRELU_INFO_H__ +#define __LEAKYRELU_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include + +namespace op::leakyrelu { + +class LeakyReLUInfo { + LeakyReLUInfo() = default; + +public: + infiniDtype_t dt_in; + std::vector shape; + std::vector in_stride; + std::vector out_stride; + size_t n; + float negative_slope; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + + auto dt_raw = in_desc->dtype(); + infiniDtype_t dt_in = dt_raw; + + CHECK_DTYPE(dt_in, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32, INFINI_DTYPE_F64); + + CHECK_OR_RETURN(out_desc->ndim() == in_desc->ndim(), INFINI_STATUS_BAD_TENSOR_SHAPE); + for (size_t i = 0; i < out_desc->ndim(); ++i) { + CHECK_OR_RETURN(out_desc->dim(i) == in_desc->dim(i), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t n = 1; + for (size_t i = 0; i < in_desc->ndim(); ++i) { + n *= static_cast(in_desc->dim(i)); + } + + return utils::Result(LeakyReLUInfo{ + dt_in, + out_desc->shape(), + in_desc->strides(), + out_desc->strides(), + n, + negative_slope}); + } +}; + +} // namespace op::leakyrelu + +#endif // __LEAKYRELU_INFO_H__ diff --git a/src/infiniop/ops/leakyrelu/leakyrelu.h b/src/infiniop/ops/leakyrelu/leakyrelu.h new file mode 100644 index 000000000..a6a01a85b --- /dev/null +++ b/src/infiniop/ops/leakyrelu/leakyrelu.h @@ -0,0 +1,49 @@ +#ifndef __LEAKYRELU_H__ +#define __LEAKYRELU_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::leakyrelu::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + LeakyReLUInfo _info; \ + size_t _min_workspace_size; \ + \ + Descriptor( \ + LeakyReLUInfo info, \ + size_t min_workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _min_workspace_size(min_workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t in_desc, \ + float negative_slope); \ + \ + size_t workspaceSize() const; \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void 
*output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __LEAKYRELU_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h new file mode 100644 index 000000000..15cdccc61 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.h @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_METAX_API_H__ +#define __LEAKYRELU_METAX_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(metax) + +#endif // __LEAKYRELU_METAX_API_H__ diff --git a/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca new file mode 100644 index 000000000..871c3f663 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/metax/leakyrelu_metax.maca @@ -0,0 +1,174 @@ +#include "../cuda/kernel.cuh" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../leakyrelu.h" +#include "leakyrelu_metax.h" +#include "../info.h" + +namespace op::leakyrelu::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template struct MapHcType { using Type = T; }; +template <> struct MapHcType { using Type = half; }; +#if defined(__HC_BF16_TYPES_EXIST__) || defined(__HC_ARCH__) +template <> struct MapHcType { using Type = __nv_bfloat16; }; +#endif + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t metax_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + hcError_t propErr; + int device_id_local = 0; + using DevT = typename MapHcType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + hcError_t err = hcSuccess; + + err = hcMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + err = hcMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != hcSuccess) goto cleanup; + 
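// [Editor's note] The shape/divisor/stride arrays are staged with hcMemcpyAsync
// on the caller's stream, the kernel is launched on that same stream, and the
// stream is synchronized before the four temporary device buffers are freed, so
// the kernel can never read released metadata. Any failure jumps to the shared
// `cleanup:` label, which releases whatever was allocated so far (this assumes
// hcFree tolerates null pointers, as cudaFree does).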
err = hcMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + err = hcMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, hcMemcpyHostToDevice, stream); + if (err != hcSuccess) goto cleanup; + + device_id_local = 0; + propErr = hcGetDevice(&device_id_local); + if (propErr == hcSuccess) { + hcDeviceProp_t prop; + if (hcGetDeviceProperties(&prop, device_id_local) == hcSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) bs = 256; + } + } else { + if (bs > 256) bs = 256; + } + + if (bs <= 0) bs = 256; + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) grid = 1; + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = hcGetLastError(); + if (err != hcSuccess) goto cleanup; + + err = hcStreamSynchronize(stream); + if (err != hcSuccess) goto cleanup; + + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + hcFree(d_shape); + hcFree(d_div); + hcFree(d_in_stride); + hcFree(d_out_stride); + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + metax_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::metax diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu new file mode 100644 index 000000000..9b65bc421 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cu @@ -0,0 +1,215 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" +#include "../leakyrelu.h" +#include "leakyrelu_nvidia.cuh" +#include +#include +#include +#include + +namespace op::leakyrelu::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +template +struct MapCudaType { + using Type = T; +}; +template <> +struct MapCudaType { + using Type = half; +}; +#if defined(__CUDA_BF16_TYPES_EXIST__) || defined(__CUDA_ARCH__) +template <> +struct MapCudaType { + using Type = __nv_bfloat16; +}; +#endif + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t in_desc, + float negative_slope) { + auto handle = reinterpret_cast(handle_); + + auto info_r = LeakyReLUInfo::create(out_desc, in_desc, negative_slope); + CHECK_RESULT(info_r); + auto info = 
info_r.take(); + + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + info, + workspace_size, + new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +size_t Descriptor::workspaceSize() const { + return _min_workspace_size; +} + +template +static inline infiniStatus_t cuda_leakyrelu_impl_incremental( + void *output_, const void *input_, + const op::leakyrelu::LeakyReLUInfo &info, + void *stream_) { + + int bs = 256, grid = 0; + cudaError_t propErr; + int device_id_local = 0; + using DevT = typename MapCudaType::Type; + + auto out_dev = reinterpret_cast(output_); + auto in_dev = reinterpret_cast(input_); + auto stream = reinterpret_cast(stream_); + + int ndim = static_cast(info.shape.size()); + if (ndim == 0) { + return INFINI_STATUS_SUCCESS; + } + + std::vector h_shape(info.shape.begin(), info.shape.end()); + std::vector h_div(ndim); + h_div[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + h_div[d] = h_div[d + 1] * h_shape[d + 1]; + } + + std::vector h_in_stride(ndim), h_out_stride(ndim); + for (int d = 0; d < ndim; ++d) { + h_in_stride[d] = static_cast(info.in_stride[d]); + h_out_stride[d] = static_cast(info.out_stride[d]); + } + + size_t *d_shape = nullptr; + size_t *d_div = nullptr; + long long *d_in_stride = nullptr; + long long *d_out_stride = nullptr; + + cudaError_t err = cudaSuccess; + + err = cudaMalloc(reinterpret_cast(&d_shape), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_div), sizeof(size_t) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_in_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMalloc(reinterpret_cast(&d_out_stride), sizeof(long long) * ndim); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_shape, h_shape.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_div, h_div.data(), sizeof(size_t) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_in_stride, h_in_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + err = cudaMemcpyAsync(d_out_stride, h_out_stride.data(), sizeof(long long) * ndim, cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + goto cleanup; + } + + device_id_local = 0; + propErr = cudaGetDevice(&device_id_local); + if (propErr == cudaSuccess) { + cudaDeviceProp prop; + if (cudaGetDeviceProperties(&prop, device_id_local) == cudaSuccess) { + bs = std::min(bs, static_cast(prop.maxThreadsPerBlock) / 2); + } else { + if (bs > 256) { + bs = 256; + } + } + } else { + if (bs > 256) { + bs = 256; + } + } + + if (bs <= 0) { + bs = 256; + } + grid = static_cast((info.n + bs - 1) / bs); + if (grid <= 0) { + grid = 1; + } + + leakyrelu_kernel<<>>( + out_dev, in_dev, info.n, info.negative_slope, d_shape, d_div, d_in_stride, d_out_stride, ndim); + + err = cudaGetLastError(); + if (err != cudaSuccess) { + goto cleanup; + } + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) { + goto cleanup; + } + + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return INFINI_STATUS_SUCCESS; + +cleanup: + cudaFree(d_shape); + cudaFree(d_div); + cudaFree(d_in_stride); + cudaFree(d_out_stride); + return 
INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + switch (_info.dt_in) { + case INFINI_DTYPE_F16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + cuda_leakyrelu_impl_incremental(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +}; // namespace op::leakyrelu::nvidia diff --git a/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh new file mode 100644 index 000000000..fb891a6c9 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/nvidia/leakyrelu_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __LEAKYRELU_CUDA_API_H__ +#define __LEAKYRELU_CUDA_API_H__ + +#include "../leakyrelu.h" + +DESCRIPTOR(nvidia) + +#endif // __LEAKYRELU_CUDA_API_H__ diff --git a/src/infiniop/ops/leakyrelu/operator.cc b/src/infiniop/ops/leakyrelu/operator.cc new file mode 100644 index 000000000..3f78a4916 --- /dev/null +++ b/src/infiniop/ops/leakyrelu/operator.cc @@ -0,0 +1,164 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/leakyrelu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/leakyrelu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/leakyrelu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/leakyrelu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLeakyreluDescriptor( + infiniopHandle_t handle, + infiniopLeakyreluDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + float negative_slope) { + +#define CREATE_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return op::leakyrelu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + negative_slope) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CREATE_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CREATE_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CREATE_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetLeakyreluWorkspaceSize(infiniopLeakyreluDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + 
GET(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLeakyrelu(infiniopLeakyreluDescriptor_t desc, void *workspace, size_t workspace_size, + void *y, const void *x, void *stream) { + +#define CALC_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, y, x, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALC_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALC_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALC_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + CALC_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + CALC_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CALC_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALC_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef CALC_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyLeakyreluDescriptor(infiniopLeakyreluDescriptor_t desc) { + +#define DESTROY_LEAKY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY_LEAKY(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY_LEAKY(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY_LEAKY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_KUNLUN_API + DESTROY_LEAKY(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_ASCEND_API + DESTROY_LEAKY(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + DESTROY_LEAKY(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DESTROY_LEAKY(INFINI_DEVICE_MOORE, musa); +#endif + } + +#undef DESTROY_LEAKY + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc new file mode 100644 index 000000000..ea3d5e63c --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.cc @@ -0,0 +1,54 @@ +#include "sigmoid_backward_cpu.h" + +namespace op::sigmoid_backward::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case 
INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::cpu diff --git a/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h new file mode 100644 index 000000000..32537ef17 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cpu/sigmoid_backward_cpu.h @@ -0,0 +1,30 @@ +#ifndef __SIGMOID_BACKWARD_CPU_H__ +#define __SIGMOID_BACKWARD_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, cpu) + +namespace op::sigmoid_backward::cpu { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + template + T operator()(const T &x, const T &grad_out) const { + using ComputeT = std::conditional_t || std::is_same_v, + float, T>; + ComputeT xv = utils::cast(x); + ComputeT gov = utils::cast(grad_out); + + // sigmoid(x) = 1 / (1 + exp(-x)) + ComputeT s = static_cast(1) / (static_cast(1) + std::exp(-xv)); + + // grad_input = grad_output * s * (1 - s) + ComputeT gin = gov * s * (static_cast(1) - s); + + return utils::cast(gin); + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cpu + +#endif // __SIGMOID_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh new file mode 100644 index 000000000..42c850004 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/cuda/kernel.cuh @@ -0,0 +1,62 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_H__ +#define __SIGMOID_BACKWARD_CUDA_H__ + +#include +#include +#include +#include + +namespace op::sigmoid_backward::cuda { +typedef struct SigmoidBackwardOp { +public: + static constexpr size_t num_inputs = 2; + + template + __device__ __forceinline__ T operator()(const T &x, const T &grad_out) const { + if constexpr (std::is_same_v) { + float2 xf = __half22float2(x); + float2 gf = __half22float2(grad_out); + float2 sf; + sf.x = 1.0f / (1.0f + __expf(-xf.x)); + sf.y = 1.0f / (1.0f + __expf(-xf.y)); + float2 gr; + gr.x = gf.x * sf.x * (1.0f - sf.x); + gr.y = gf.y * sf.y * (1.0f - sf.y); + return __float22half2_rn(gr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(x); + float gf = __half2float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2half_rn(gr); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(x)); + float f1 = __bfloat162float(__high2bfloat16(x)); + float g0 = __bfloat162float(__low2bfloat16(grad_out)); + float g1 = __bfloat162float(__high2bfloat16(grad_out)); + float s0 = 1.0f / (1.0f + __expf(-f0)); + float s1 = 1.0f / (1.0f + __expf(-f1)); + float r0 = g0 * s0 * (1.0f - s0); + float r1 = g1 * s1 * (1.0f - s1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float gf = __bfloat162float(grad_out); + float s = 1.0f / (1.0f + __expf(-xf)); + float gr = gf * s * (1.0f - s); + return __float2bfloat16_rn(gr); + } else if constexpr (std::is_same_v) { + float s = 1.0f / (1.0f + __expf(-x)); + return grad_out * s * (1.0f - s); + } else if constexpr (std::is_same_v) { + double s = 1.0 / (1.0 + std::exp(-x)); + return grad_out * s * (1.0 - s); + } else { + auto s = static_cast(1) / (static_cast(1) + 
std::exp(-static_cast(x))); + return static_cast(static_cast(grad_out) * s * (1.0f - s)); + } + } +} SigmoidBackwardOp; +} // namespace op::sigmoid_backward::cuda + +#endif // __SIGMOID_BACKWARD_CUDA_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h new file mode 100644 index 000000000..fa1708559 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_METAX_API_H__ +#define __SIGMOID_BACKWARD_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, metax) + +#endif // __SIGMOID_BACKWARD_METAX_API_H__ diff --git a/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca new file mode 100644 index 000000000..ed99ac65d --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/metax/sigmoid_backward_metax.maca @@ -0,0 +1,62 @@ +#include "sigmoid_backward_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sigmoid_backward::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::metax diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu new file mode 100644 index 000000000..e7e604af4 --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cu @@ -0,0 +1,61 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sigmoid_backward_nvidia.cuh" + +namespace op::sigmoid_backward::nvidia { + 
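+// Computes grad_input = grad_output * sigmoid(x) * (1 - sigmoid(x)), dispatching cuda::SigmoidBackwardOp through the shared elementwise CUDA framework.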
+Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &grad_output_desc = input_desc_vec.at(1); + const auto &grad_input_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + const auto &grad_output_shape = grad_output_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(grad_input_shape, input_shape, grad_output_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SigmoidBackwardOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sigmoid_backward::nvidia diff --git a/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh new file mode 100644 index 000000000..822f870fe --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/nvidia/sigmoid_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIGMOID_BACKWARD_CUDA_API_H__ +#define __SIGMOID_BACKWARD_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sigmoid_backward, nvidia) + +#endif // __SIGMOID_BACKWARD_CUDA_API_H__ diff --git a/src/infiniop/ops/sigmoid_backward/operator.cc b/src/infiniop/ops/sigmoid_backward/operator.cc new file mode 100644 index 000000000..40a279f4b --- /dev/null +++ b/src/infiniop/ops/sigmoid_backward/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sigmoid_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sigmoid_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sigmoid_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sigmoid_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSigmoidBackwardDescriptor( + infiniopHandle_t handle, + infiniopSigmoidBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t grad_output_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sigmoid_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + {input_desc, \ + grad_output_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + 
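+ // CPU reference backend; the cases below are compiled only when their ENABLE_*_API flags are set.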
CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSigmoidBackwardWorkspaceSize(infiniopSigmoidBackwardDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSigmoidBackward( + infiniopSigmoidBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *input, + const void *grad_output, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, {input, grad_output}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySigmoidBackwardDescriptor(infiniopSigmoidBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.cc b/src/infiniop/ops/sin/cpu/sin_cpu.cc new file mode 100644 index 000000000..88ba6cdd6 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.cc @@ -0,0 +1,52 @@ +#include "sin_cpu.h" + +namespace op::sin::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + 
size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::cpu diff --git a/src/infiniop/ops/sin/cpu/sin_cpu.h b/src/infiniop/ops/sin/cpu/sin_cpu.h new file mode 100644 index 000000000..80e406f98 --- /dev/null +++ b/src/infiniop/ops/sin/cpu/sin_cpu.h @@ -0,0 +1,21 @@ +#ifndef __SIN_CPU_H__ +#define __SIN_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(sin, cpu) + +namespace op::sin::cpu { +typedef struct SinOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::sin(input); + } +} SinOp; +} // namespace op::sin::cpu + +#endif // __SIN_CPU_H__ diff --git a/src/infiniop/ops/sin/cuda/kernel.cuh b/src/infiniop/ops/sin/cuda/kernel.cuh new file mode 100644 index 000000000..30641366c --- /dev/null +++ b/src/infiniop/ops/sin/cuda/kernel.cuh @@ -0,0 +1,39 @@ +#ifndef __SIN_CUDA_H__ +#define __SIN_CUDA_H__ + +#include +#include +#include + +namespace op::sin::cuda { +typedef struct SinOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(__sinf(vf.x), __sinf(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float inputf = __half2float(input); + return __float2half_rn(sinf(inputf)); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + return __floats2bfloat162_rn(__sinf(f0), __sinf(f1)); + } else if constexpr (std::is_same_v) { + float inputf = __bfloat162float(input); + return __float2bfloat16_rn(__sinf(inputf)); + } else if constexpr (std::is_same_v) { + return sinf(input); + } else if constexpr (std::is_same_v) { + return std::sin(input); + } else { + return std::sin(input); + } + } +} SinOp; +} // namespace op::sin::cuda + +#endif // __SIN_CUDA_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.h b/src/infiniop/ops/sin/metax/sin_metax.h new file mode 100644 index 000000000..5b272d4d9 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.h @@ -0,0 +1,8 @@ +#ifndef __SIN_METAX_API_H__ +#define __SIN_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(sin, metax) + +#endif // __SIN_METAX_API_H__ diff --git a/src/infiniop/ops/sin/metax/sin_metax.maca b/src/infiniop/ops/sin/metax/sin_metax.maca new file mode 100644 index 000000000..5ea69e139 --- /dev/null +++ b/src/infiniop/ops/sin/metax/sin_metax.maca @@ -0,0 +1,60 @@ +#include "sin_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::sin::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto 
dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::metax diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cu b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu new file mode 100644 index 000000000..6fbf952bc --- /dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "sin_nvidia.cuh" + +namespace op::sin::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::SinOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::SinOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::SinOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::SinOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::sin::nvidia diff --git a/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh new file mode 100644 index 000000000..31f5b48ef --- 
/dev/null +++ b/src/infiniop/ops/sin/nvidia/sin_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __SIN_CUDA_API_H__ +#define __SIN_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(sin, nvidia) + +#endif // __SIN_CUDA_API_H__ diff --git a/src/infiniop/ops/sin/operator.cc b/src/infiniop/ops/sin/operator.cc new file mode 100644 index 000000000..978561a04 --- /dev/null +++ b/src/infiniop/ops/sin/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/sin.h" + +#ifdef ENABLE_CPU_API +#include "cpu/sin_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/sin_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/sin_metax.h" +#endif + +__C infiniStatus_t infiniopCreateSinDescriptor( + infiniopHandle_t handle, + infiniopSinDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::sin::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetSinWorkspaceSize(infiniopSinDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopSin( + infiniopSinDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroySinDescriptor(infiniopSinDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.cc b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc new file mode 100644 index 000000000..23a92ed65 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.cc @@ -0,0 +1,52 @@ +#include "tanh_cpu.h" + +namespace op::tanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CPU elementwise descriptor + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::cpu diff --git a/src/infiniop/ops/tanh/cpu/tanh_cpu.h b/src/infiniop/ops/tanh/cpu/tanh_cpu.h new file mode 100644 index 000000000..73fd7c1b6 --- /dev/null +++ b/src/infiniop/ops/tanh/cpu/tanh_cpu.h @@ -0,0 +1,21 @@ +#ifndef __TANH_CPU_H__ +#define __TANH_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" +#include + +ELEMENTWISE_DESCRIPTOR(tanh, cpu) + +namespace op::tanh::cpu { +typedef struct TanhOp { +public: + static constexpr size_t num_inputs = 1; + + template + T operator()(const T &input) const { + return std::tanh(input); + } +} TanhOp; +} // namespace op::tanh::cpu + +#endif // __TANH_CPU_H__ diff --git a/src/infiniop/ops/tanh/cuda/kernel.cuh b/src/infiniop/ops/tanh/cuda/kernel.cuh new file mode 100644 index 000000000..62979a20e --- /dev/null +++ b/src/infiniop/ops/tanh/cuda/kernel.cuh @@ -0,0 +1,46 @@ +#ifndef __TANH_CUDA_H__ +#define __TANH_CUDA_H__ + +#include +#include +#include + +namespace op::tanh::cuda { +typedef struct TanhOp { + static constexpr size_t num_inputs = 1; + + __device__ __forceinline__ float tanh_f32_func(float x) const { + return tanhf(x); + } + template + __device__ __forceinline__ T operator()(const T &input) const { + if constexpr (std::is_same_v) { + float2 vf = __half22float2(input); + float2 vr = make_float2(tanh_f32_func(vf.x), tanh_f32_func(vf.y)); + return __float22half2_rn(vr); + } else if constexpr (std::is_same_v) { + float xf = __half2float(input); + float yf = tanh_f32_func(xf); + return __float2half_rn(yf); + } else if constexpr (std::is_same_v) { + float f0 = __bfloat162float(__low2bfloat16(input)); + float f1 = __bfloat162float(__high2bfloat16(input)); + float r0 = tanh_f32_func(f0); + float r1 = tanh_f32_func(f1); + return __floats2bfloat162_rn(r0, r1); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(input); + float rf = 
tanh_f32_func(xf); + return __float2bfloat16_rn(rf); + } else if constexpr (std::is_same_v) { + return tanh_f32_func(input); + } else if constexpr (std::is_same_v) { + return std::tanh(input); + } else { + return std::tanh(input); + } + } +} TanhOp; +} // namespace op::tanh::cuda + +#endif // __TANH_CUDA_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.h b/src/infiniop/ops/tanh/metax/tanh_metax.h new file mode 100644 index 000000000..8432a7f0d --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.h @@ -0,0 +1,8 @@ +#ifndef __TANH_METAX_API_H__ +#define __TANH_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(tanh, metax) + +#endif // __TANH_METAX_API_H__ diff --git a/src/infiniop/ops/tanh/metax/tanh_metax.maca b/src/infiniop/ops/tanh/metax/tanh_metax.maca new file mode 100644 index 000000000..0a01554c4 --- /dev/null +++ b/src/infiniop/ops/tanh/metax/tanh_metax.maca @@ -0,0 +1,60 @@ +#include "tanh_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::tanh::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::metax diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu new file mode 100644 index 000000000..a2c36551c --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cu @@ -0,0 +1,59 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "tanh_nvidia.cuh" + +namespace op::tanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &input_desc = input_desc_vec.at(0); + const auto &output_shape = out_desc->shape(); + const auto &input_shape = 
input_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + CHECK_SAME_SHAPE(output_shape, input_shape); + + // create CUDA elementwise descriptor + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::TanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::TanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::TanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::TanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tanh::nvidia diff --git a/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh new file mode 100644 index 000000000..cb37b2528 --- /dev/null +++ b/src/infiniop/ops/tanh/nvidia/tanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __TANH_CUDA_API_H__ +#define __TANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(tanh, nvidia) + +#endif // __TANH_CUDA_API_H__ diff --git a/src/infiniop/ops/tanh/operator.cc b/src/infiniop/ops/tanh/operator.cc new file mode 100644 index 000000000..d34d97df6 --- /dev/null +++ b/src/infiniop/ops/tanh/operator.cc @@ -0,0 +1,142 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tanh_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTanhDescriptor( + infiniopHandle_t handle, + infiniopTanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + {input_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTanhWorkspaceSize(infiniopTanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return 
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTanh( + infiniopTanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, {input}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTanhDescriptor(infiniopTanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/where/cpu/where_cpu.cc b/src/infiniop/ops/where/cpu/where_cpu.cc new file mode 100644 index 000000000..de7e86e3e --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.cc @@ -0,0 +1,84 @@ +#include "where_cpu.h" + +namespace op::where::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_F64: + return 
_device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate(_info, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate(_info, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::cpu diff --git a/src/infiniop/ops/where/cpu/where_cpu.h b/src/infiniop/ops/where/cpu/where_cpu.h new file mode 100644 index 000000000..3d86cb4f7 --- /dev/null +++ b/src/infiniop/ops/where/cpu/where_cpu.h @@ -0,0 +1,19 @@ +#ifndef __WHERE_CPU_H__ +#define __WHERE_CPU_H__ + +#include "../../../elementwise/cpu/elementwise_cpu.h" + +ELEMENTWISE_DESCRIPTOR(where, cpu) + +namespace op::where::cpu { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? a : b; + } +} WhereOp; +} // namespace op::where::cpu + +#endif // __WHERE_CPU_H__ diff --git a/src/infiniop/ops/where/cuda/kernel.cuh b/src/infiniop/ops/where/cuda/kernel.cuh new file mode 100644 index 000000000..8eb5c762b --- /dev/null +++ b/src/infiniop/ops/where/cuda/kernel.cuh @@ -0,0 +1,15 @@ +#ifndef __WHERE_CUDA_H__ +#define __WHERE_CUDA_H__ + +namespace op::where::cuda { +typedef struct WhereOp { +public: + static constexpr size_t num_inputs = 3; + template + __device__ __forceinline__ T operator()(const T &a, const T &b, const bool &cond) const { + return cond ? 
a : b; + } +} WhereOp; +} // namespace op::where::cuda + +#endif // __WHERE_CUDA_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.h b/src/infiniop/ops/where/metax/where_metax.h new file mode 100644 index 000000000..43bb1a945 --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.h @@ -0,0 +1,8 @@ +#ifndef __WHERE_METAX_API_H__ +#define __WHERE_METAX_API_H__ + +#include "../../../elementwise/metax/elementwise_metax_api.h" + +ELEMENTWISE_DESCRIPTOR(where, metax) + +#endif // __WHERE_METAX_API_H__ diff --git a/src/infiniop/ops/where/metax/where_metax.maca b/src/infiniop/ops/where/metax/where_metax.maca new file mode 100644 index 000000000..b648cfbcc --- /dev/null +++ b/src/infiniop/ops/where/metax/where_metax.maca @@ -0,0 +1,92 @@ +#include "where_metax.h" + +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::where::metax { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return _device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + 
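+ // Integer and boolean element types reuse the same WhereOp; the template argument only selects the element width.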
return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::metax diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cu b/src/infiniop/ops/where/nvidia/where_nvidia.cu new file mode 100644 index 000000000..860089bd2 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cu @@ -0,0 +1,91 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "where_nvidia.cuh" + +namespace op::where::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &b_desc = input_desc_vec.at(1); + const auto &cond_desc = input_desc_vec.at(2); + + const auto &c_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + const auto &b_shape = b_desc->shape(); + const auto &cond_shape = cond_desc->shape(); + + CHECK_DTYPE(cond_desc->dtype(), + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_DTYPE(dtype, + INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, + INFINI_DTYPE_I8, INFINI_DTYPE_I16, INFINI_DTYPE_I32, INFINI_DTYPE_I64, + INFINI_DTYPE_U8, INFINI_DTYPE_U16, INFINI_DTYPE_U32, INFINI_DTYPE_U64, + INFINI_DTYPE_BOOL); + + CHECK_SAME_SHAPE(c_shape, a_shape, b_shape, cond_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::WhereOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::WhereOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::WhereOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::WhereOp, double>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I8: + return _device_info->calculate<256, cuda::WhereOp, int8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I16: + return _device_info->calculate<256, cuda::WhereOp, int16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I32: + return 
_device_info->calculate<256, cuda::WhereOp, int32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_I64: + return _device_info->calculate<256, cuda::WhereOp, int64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U8: + return _device_info->calculate<256, cuda::WhereOp, uint8_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U16: + return _device_info->calculate<256, cuda::WhereOp, uint16_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U32: + return _device_info->calculate<256, cuda::WhereOp, uint32_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_U64: + return _device_info->calculate<256, cuda::WhereOp, uint64_t>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BOOL: + return _device_info->calculate<256, cuda::WhereOp, bool>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::where::nvidia diff --git a/src/infiniop/ops/where/nvidia/where_nvidia.cuh b/src/infiniop/ops/where/nvidia/where_nvidia.cuh new file mode 100644 index 000000000..c168364a8 --- /dev/null +++ b/src/infiniop/ops/where/nvidia/where_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __WHERE_CUDA_API_H__ +#define __WHERE_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(where, nvidia) + +#endif // __WHERE_CUDA_API_H__ diff --git a/src/infiniop/ops/where/operator.cc b/src/infiniop/ops/where/operator.cc new file mode 100644 index 000000000..d69b1d4e1 --- /dev/null +++ b/src/infiniop/ops/where/operator.cc @@ -0,0 +1,148 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/where.h" + +#ifdef ENABLE_CPU_API +#include "cpu/where_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/where_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/where_metax.h" +#endif + +__C infiniStatus_t infiniopCreateWhereDescriptor( + infiniopHandle_t handle, + infiniopWhereDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t c_desc, + infiniopTensorDescriptor_t a_desc, + infiniopTensorDescriptor_t b_desc, + infiniopTensorDescriptor_t condition_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::where::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + c_desc, \ + {a_desc, \ + b_desc, \ + condition_desc}) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetWhereWorkspaceSize(infiniopWhereDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C 
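/* forwards the (c, a, b, condition) pointers to the per-device Descriptor::calculate selected when the descriptor was created */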
infiniStatus_t infiniopWhere( + infiniopWhereDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *c, + const void *a, + const void *b, + const void *condition, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, c, {a, b, condition}, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyWhereDescriptor(infiniopWhereDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/cast.py b/test/infiniop/cast.py new file mode 100644 index 000000000..87b572741 --- /dev/null +++ b/test/infiniop/cast.py @@ -0,0 +1,244 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map +) +import itertools + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +_TEST_CASES = [ + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, +] + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +_TOLERANCE_MAP = { + ("float", "float"): {"atol": 1e-3, "rtol": 1e-3}, + ("int", "float"): {"atol": 1.0, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + + +def _is_float_dtype(inf_dt): + return inf_dt in _FLOAT_DTYPES + + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def reference_cast_torch(output_tensor, input_tensor): + converted = input_tensor.to(dtype=output_tensor.dtype, device=output_tensor.device).clone() + output_tensor.copy_(converted) + + +def make_integer_torch_tensor(shape, inf_dt, device): + 
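# Integer inputs are drawn with torch.randint; unsigned InfiniOp dtypes go through to_torch_dtype(..., compatability_mode=True), presumably because native uint32/uint64 tensors are not available in every torch build. +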
use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + + +def test( + handle, + device, + shape, + in_stride, + out_stride, + dtype_pair, + sync=None, +): + in_dt, out_dt = dtype_pair + + if not is_supported_dt(in_dt) or not is_supported_dt(out_dt): + print(f"Skipping test for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} because dtype not supported on this platform") + return + + try: + if _is_integer_dtype(in_dt): + in_torch = make_integer_torch_tensor(shape, in_dt, device) + input = TestTensor.from_torch(in_torch, in_dt, device) + else: + input = TestTensor(shape, in_stride, in_dt, device, mode="random") + + output = TestTensor(shape, out_stride, out_dt, device, mode="zeros") + + if output.is_broadcast(): + return + + print(f"Testing Cast on {InfiniDeviceNames[device]} shape={shape} in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} in_stride={in_stride} out_stride={out_stride}") + + reference_cast_torch(output.actual_tensor(), input.torch_tensor()) + + expected = output.actual_tensor().clone() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCastDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + input.destroy_desc() + output.destroy_desc() + + workspace_size = c_uint64(0) + check_error(LIBINFINIOP.infiniopGetCastWorkspaceSize(descriptor, ctypes.byref(workspace_size))) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_cast(): + check_error( + LIBINFINIOP.infiniopCast( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cast() + + actual = output.actual_tensor() + + if _is_integer_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("int", "float")] + atol, rtol = tol["atol"], tol["rtol"] + elif _is_float_dtype(in_dt) and _is_float_dtype(out_dt): + tol = _TOLERANCE_MAP[("float", "float")] + atol, rtol = tol["atol"], tol["rtol"] + else: + atol, rtol = 0, 0 + + if DEBUG: + debug(actual, expected, atol=atol, rtol=rtol) + + assert torch.allclose(actual, expected, atol=atol, rtol=rtol), \ + f"Mismatch for in={InfiniDtypeNames[in_dt]} out={InfiniDtypeNames[out_dt]} shape={shape}" + + if PROFILE: + profile_operation("PyTorch", lambda: reference_cast_torch(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cast(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyCastDescriptor(descriptor)) + + except RuntimeError as e: + if "not implemented for 'UInt32'" in str(e) or "not implemented for 'UInt64'" in str(e): + #print(f"Skipping unsupported operation: {e}") + return False + else: + raise + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = 
args.num_iterations + + integer_pairs = itertools.product(_INTEGER_DTYPES, _INTEGER_DTYPES) + float_pairs = itertools.product(_FLOAT_DTYPES, _FLOAT_DTYPES) + int_to_float_pairs = itertools.product(_INTEGER_DTYPES, _FLOAT_DTYPES) + + all_pairs = list(set(itertools.chain(integer_pairs, float_pairs, int_to_float_pairs))) + + supported_pairs = [] + skipped_pairs = [] + for pair in all_pairs: + in_dt, out_dt = pair + if is_supported_dt(in_dt) and is_supported_dt(out_dt): + supported_pairs.append(pair) + else: + skipped_pairs.append(pair) + + print(f"Supported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in supported_pairs]}") + if skipped_pairs: + print(f"Warning: skipping unsupported dtype pairs: {[(InfiniDtypeNames[in_d], InfiniDtypeNames[out_d]) for in_d, out_d in skipped_pairs]}") + + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported_pairs) + + print("\033[92mAll cast tests passed!\033[0m") + + +if __name__ == "__main__": + main() diff --git a/test/infiniop/cos.py b/test/infiniop/cos.py new file mode 100644 index 000000000..d1d94db3a --- /dev/null +++ b/test/infiniop/cos.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cos(output, input): + output.copy_(torch.cos(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Cos on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + cos(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = 
infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCosDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCosWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_cos(): + check_error( + LIBINFINIOP.infiniopCos( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_cos() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: cos(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cos(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyCosDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/exp.py b/test/infiniop/exp.py new file mode 100644 index 000000000..eb139af12 --- /dev/null +++ b/test/infiniop/exp.py @@ -0,0 +1,165 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def exp(output, input): + output.copy_(torch.exp(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + 
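# the default is only a placeholder; the harness passes InfiniDtype values from _TENSOR_DTYPES +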
sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Exp on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + exp(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateExpDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetExpWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_exp(): + check_error( + LIBINFINIOP.infiniopExp( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_exp() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: exp(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_exp(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyExpDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/hardswish.py b/test/infiniop/hardswish.py new file mode 100644 index 000000000..424b30567 --- /dev/null +++ b/test/infiniop/hardswish.py @@ -0,0 +1,167 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + 
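# every shape/stride case is run once per Inplace mode +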
test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def hardswish(output, input): + output.copy_(input * torch.clamp(input + 3, min=0, max=6) / 6) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Hardswish on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + hardswish(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateHardswishDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetHardswishWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_hardswish(): + check_error( + LIBINFINIOP.infiniopHardswish( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_hardswish() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: hardswish(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_hardswish(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyHardswishDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/leakyrelu.py b/test/infiniop/leakyrelu.py new file mode 100644 index 000000000..76562ddf0 --- /dev/null +++ b/test/infiniop/leakyrelu.py @@ -0,0 +1,168 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# 
======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def leakyrelu(output, input, negative_slope): + output.copy_(torch.where(input >= 0, input, input * negative_slope)) + + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + negative_slope = 0.1 + print( + f"Testing Leakyrelu on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} negative_slope:{negative_slope} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLeakyreluDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + negative_slope + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLeakyreluWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_leakyrelu(): + check_error( + LIBINFINIOP.infiniopLeakyrelu( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None + ) + ) + + lib_leakyrelu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: leakyrelu(output.torch_tensor(), input.torch_tensor(), negative_slope), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_leakyrelu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLeakyreluDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure 
testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..86cee0424 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,290 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def exp_(lib): + lib.infiniopCreateExpDescriptor.restype = c_int32 + lib.infiniopCreateExpDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetExpWorkspaceSize.restype = c_int32 + lib.infiniopGetExpWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopExp.restype = c_int32 + lib.infiniopExp.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyExpDescriptor.restype = c_int32 + lib.infiniopDestroyExpDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sin_(lib): + lib.infiniopCreateSinDescriptor.restype = c_int32 + lib.infiniopCreateSinDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSinWorkspaceSize.restype = c_int32 + lib.infiniopGetSinWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSin.restype = c_int32 + lib.infiniopSin.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySinDescriptor.restype = c_int32 + lib.infiniopDestroySinDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cos_(lib): + lib.infiniopCreateCosDescriptor.restype = c_int32 + lib.infiniopCreateCosDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCosWorkspaceSize.restype = c_int32 + lib.infiniopGetCosWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCos.restype = c_int32 + lib.infiniopCos.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCosDescriptor.restype = c_int32 + lib.infiniopDestroyCosDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def leakyrelu_(lib): + lib.infiniopCreateLeakyreluDescriptor.restype = c_int32 + lib.infiniopCreateLeakyreluDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_float, + ] + + lib.infiniopGetLeakyreluWorkspaceSize.restype = c_int32 + lib.infiniopGetLeakyreluWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopLeakyrelu.restype = c_int32 + lib.infiniopLeakyrelu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyLeakyreluDescriptor.restype = c_int32 + lib.infiniopDestroyLeakyreluDescriptor.argtypes = [ + 
infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def tanh_(lib): + lib.infiniopCreateTanhDescriptor.restype = c_int32 + lib.infiniopCreateTanhDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetTanhWorkspaceSize.restype = c_int32 + lib.infiniopGetTanhWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopTanh.restype = c_int32 + lib.infiniopTanh.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyTanhDescriptor.restype = c_int32 + lib.infiniopDestroyTanhDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def sigmoid_backward_(lib): + lib.infiniopCreateSigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopCreateSigmoidBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetSigmoidBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetSigmoidBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopSigmoidBackward.restype = c_int32 + lib.infiniopSigmoidBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroySigmoidBackwardDescriptor.restype = c_int32 + lib.infiniopDestroySigmoidBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def hardswish_(lib): + lib.infiniopCreateHardswishDescriptor.restype = c_int32 + lib.infiniopCreateHardswishDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetHardswishWorkspaceSize.restype = c_int32 + lib.infiniopGetHardswishWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopHardswish.restype = c_int32 + lib.infiniopHardswish.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyHardswishDescriptor.restype = c_int32 + lib.infiniopDestroyHardswishDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + +@OpRegister.operator +def cast_(lib): + lib.infiniopCreateCastDescriptor.restype = c_int32 + lib.infiniopCreateCastDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetCastWorkspaceSize.restype = c_int32 + lib.infiniopGetCastWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCast.restype = c_int32 + lib.infiniopCast.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyCastDescriptor.restype = c_int32 + lib.infiniopDestroyCastDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def where_(lib): + lib.infiniopCreateWhereDescriptor.restype = c_int32 + lib.infiniopCreateWhereDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + + lib.infiniopGetWhereWorkspaceSize.restype = c_int32 + 
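# ctypes signatures mirror the C API in src/infiniop/ops/where/operator.cc: infiniopWhere passes workspace, c, a, b, condition, and the stream as raw pointers +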
lib.infiniopGetWhereWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopWhere.restype = c_int32 + lib.infiniopWhere.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyWhereDescriptor.restype = c_int32 + lib.infiniopDestroyWhereDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file diff --git a/test/infiniop/sigmoid_backward.py b/test/infiniop/sigmoid_backward.py new file mode 100644 index 000000000..813791aa8 --- /dev/null +++ b/test/infiniop/sigmoid_backward.py @@ -0,0 +1,184 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # shape, input_stride, grad_output_stride, grad_input_stride + ((13, 4), None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None), + ((13, 4, 4), None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None), + ((16, 5632), None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + INPLACE_GRAD_OUTPUT = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, + Inplace.INPLACE_GRAD_OUTPUT, +] + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sigmoid_backward(grad_input, input_tensor, grad_output): + sigmoid_input = torch.sigmoid(input_tensor) + grad_input.copy_(grad_output * sigmoid_input * (1 - sigmoid_input)) + + +def test( + handle, + device, + shape, + input_stride=None, + grad_output_stride=None, + grad_input_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input_tensor = TestTensor(shape, input_stride, dtype, device) + grad_output = TestTensor(shape, grad_output_stride, dtype, device) + + if inplace == Inplace.INPLACE_INPUT: + if input_stride != grad_input_stride: + return + grad_input = input_tensor + elif inplace == Inplace.INPLACE_GRAD_OUTPUT: + if grad_input_stride != grad_output_stride: + return + grad_input = grad_output + else: + grad_input = TestTensor(shape, grad_input_stride, dtype, device, mode="ones") + + if grad_input.is_broadcast(): + 
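# grad_input views flagged as broadcast are skipped here, as in the other elementwise tests +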
return + + print( + f"Testing SigmoidBackward on {InfiniDeviceNames[device]} with shape:{shape} " + f"input_stride:{input_stride} grad_output_stride:{grad_output_stride} grad_input_stride:{grad_input_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSigmoidBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input.descriptor, + input_tensor.descriptor, + grad_output.descriptor, + ) + ) + + for tensor in [input_tensor, grad_output, grad_input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSigmoidBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input.device) + + def lib_sigmoid_backward(): + check_error( + LIBINFINIOP.infiniopSigmoidBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_input.data(), + input_tensor.data(), + grad_output.data(), + None, + ) + ) + + lib_sigmoid_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(grad_input.actual_tensor(), grad_input.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sigmoid_backward(grad_input.torch_tensor(), input_tensor.torch_tensor(), grad_output.torch_tensor()), + device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sigmoid_backward(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroySigmoidBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/sin.py b/test/infiniop/sin.py new file mode 100644 index 000000000..613257e9c --- /dev/null +++ b/test/infiniop/sin.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + 
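# Sin is checked in F16, F32, and BF16; the tolerances below match the other unary elementwise tests. +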
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def sin(output, input): + output.copy_(torch.sin(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Sin on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + sin(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateSinDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetSinWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_sin(): + check_error( + LIBINFINIOP.infiniopSin( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_sin() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: sin(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_sin(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroySinDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/tanh.py b/test/infiniop/tanh.py new file mode 100644 index 000000000..dc6ec46e8 --- /dev/null +++ b/test/infiniop/tanh.py @@ -0,0 +1,166 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + get_sync_func, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +# ======================================================================== +# Configuration (Internal Use Only) +# ======================================================================== +_TEST_CASES_ = [ + # shape, input_stride, 
output_stride + ((13, 4), None, None), + ((13, 4), (10, 1), (10, 1)), + ((13, 4), (0, 1), None), + ((13, 4, 4), None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), None), + ((16, 5632), None, None), + ((16, 5632), (10240, 1), (10240, 1)), + ((4, 4, 5632), None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_INPUT = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_INPUT, +] + +_TEST_CASES = [ + test_case + (inplace,) + for test_case in _TEST_CASES_ + for inplace in _INPLACE +] + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def tanh(output, input): + output.copy_(torch.tanh(input)) + +def test( + handle, + device, + shape, + input_stride=None, + output_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=torch.float16, + sync=None, +): + input = TestTensor(shape, input_stride, dtype, device) + if inplace == Inplace.INPLACE_INPUT: + if input_stride != output_stride: + return + output = input + else: + output = TestTensor(shape, output_stride, dtype, device, mode="ones") + + if output.is_broadcast(): + return + + print( + f"Testing Tanh on {InfiniDeviceNames[device]} with shape:{shape} input_stride:{input_stride} output_stride:{output_stride} " + f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + tanh(output.torch_tensor(), input.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTanhDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTanhWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_tanh(): + check_error( + LIBINFINIOP.infiniopTanh( + descriptor, + workspace.data(), + workspace_size.value, + output.data(), + input.data(), + None, + ) + ) + + lib_tanh() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: tanh(output.torch_tensor(), input.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tanh(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTanhDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/where.py b/test/infiniop/where.py new file mode 100644 index 000000000..c940d4f05 --- /dev/null +++ b/test/infiniop/where.py @@ -0,0 +1,288 @@ 
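+# Tests infiniopWhere against torch.where across float and integer dtypes, strided layouts, and in-place variants.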
+import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, + to_torch_dtype, + torch_device_map, +) +from enum import Enum, auto + +# ====================================================================== +# Configuration (Internal Use Only) +# Now each test case tuple is: (shape, a_stride, b_stride, cond_stride, c_stride) +# ====================================================================== +_TEST_CASES_ = [ + ((13, 4), None, None, None, None), + ((13, 4), None, None, None, None), + ((13, 4), (10, 1), (10, 1), (10, 1), (10, 1)), + ((13, 4), (0, 1), None, None, None), + ((13, 4, 4), None, None, None, None), + ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1), (20, 4, 1)), + ((13, 4, 4), (4, 0, 1), (0, 4, 1), None, None), + ((16, 5632), None, None, None, None), + ((16, 5632), (13312, 1), (13312, 1), (13312, 1), (13312, 1)), + ((4, 4, 5632), None, None, None, None), + ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)), +] + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE_A = auto() + INPLACE_B = auto() + INPLACE_COND = auto() + +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE_A, + Inplace.INPLACE_B, + Inplace.INPLACE_COND, +] + +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + +_INTEGER_DTYPES = [ + InfiniDtype.I32, + InfiniDtype.I64, + InfiniDtype.U32, + InfiniDtype.U64, +] + +_FLOAT_DTYPES = [ + InfiniDtype.F16, + InfiniDtype.F32, + InfiniDtype.F64, + InfiniDtype.BF16, +] + +_TENSOR_DTYPES = _INTEGER_DTYPES + _FLOAT_DTYPES + +_TOLERANCE_MAP = { + InfiniDtype.I32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U32: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.U64: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6}, + InfiniDtype.F64: {"atol": 1e-7, "rtol": 1e-7}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-3}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +def is_supported_dt(inf_dt): + try: + td = to_torch_dtype(inf_dt, compatability_mode=True) + _ = torch.empty((1,), dtype=td, device="cpu") + return True + except Exception: + return False + +def _is_integer_dtype(inf_dt): + return inf_dt in _INTEGER_DTYPES + +def _is_unsigned_dtype(inf_dt): + return inf_dt in (InfiniDtype.U32, InfiniDtype.U64) + + +def make_integer_torch_tensor(shape, inf_dt, device): + use_compatibility = _is_unsigned_dtype(inf_dt) + + if inf_dt == InfiniDtype.I32: + low, high, dtype = -2000, 2000, torch.int32 + elif inf_dt == InfiniDtype.I64: + low, high, dtype = -2048, 2048, torch.int64 + elif inf_dt == InfiniDtype.U32: + low, high, dtype = 0, 2000, torch.int32 + elif inf_dt == InfiniDtype.U64: + low, high, dtype = 0, 2048, torch.int64 + else: + low, high, dtype = 0, 1, torch.int64 + + dev = torch_device_map[device] + + t = torch.randint(low=low, high=high, size=shape, dtype=dtype, device=dev) + + target_torch_dt = to_torch_dtype(inf_dt, compatability_mode=use_compatibility) + if t.dtype != target_torch_dt: + t = t.to(dtype=target_torch_dt) + + return t + +def where_ref(c, a, b, cond): + cond_bool = cond.torch_tensor().to(torch.bool) + c.torch_tensor().copy_(torch.where(cond_bool, 
a.torch_tensor(), b.torch_tensor())) + +def test( + handle, + device, + shape, + a_stride=None, + b_stride=None, + cond_stride=None, + c_stride=None, + inplace=Inplace.OUT_OF_PLACE, + dtype=InfiniDtype.F16, + sync=None, +): + inf_dt = dtype + + if not is_supported_dt(inf_dt): + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on this platform") + return + + try: + if _is_integer_dtype(inf_dt): + a_torch = make_integer_torch_tensor(shape, inf_dt, device) + b_torch = make_integer_torch_tensor(shape, inf_dt, device) + a = TestTensor.from_torch(a_torch, inf_dt, device) + b = TestTensor.from_torch(b_torch, inf_dt, device) + else: + a = TestTensor(shape, a_stride, inf_dt, device, mode="random") + b = TestTensor(shape, b_stride, inf_dt, device, mode="random") + except RuntimeError as e: + msg = str(e) + if "not implemented for 'UInt32'" in msg or "not implemented for 'UInt64'" in msg or "check_uniform_bounds" in msg: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} because platform torch can't build random tensor: {e}") + return + else: + raise + + dev = torch_device_map[device] + if _is_integer_dtype(inf_dt): + cond_torch = torch.randint(0, 2, size=shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + else: + cond_bool = (torch.rand(shape, device=dev) > 0.5) + cond_torch = cond_bool.to(dtype=to_torch_dtype(inf_dt, compatability_mode=False)) + + cond = TestTensor.from_torch(cond_torch, inf_dt, device) + + if inplace == Inplace.INPLACE_A: + if a_stride != c_stride: + return + c = a + elif inplace == Inplace.INPLACE_B: + if c_stride != b_stride: + return + c = b + elif inplace == Inplace.INPLACE_COND: + if c_stride != cond_stride: + return + c = cond + else: + if _is_integer_dtype(inf_dt): + dev = torch_device_map[device] + c_torch = torch.zeros(shape, dtype=to_torch_dtype(inf_dt, compatability_mode=False), device=dev) + c = TestTensor.from_torch(c_torch, inf_dt, device) + else: + c = TestTensor(shape, c_stride, inf_dt, device, mode="ones") + + if c.is_broadcast(): + return + + print( + f"Testing Where on {InfiniDeviceNames[device]} " + f"shape:{shape} a_stride:{a_stride} b_stride:{b_stride} cond_stride:{cond_stride} c_stride:{c_stride} " + f"dtype:{InfiniDtypeNames[inf_dt]} inplace:{inplace}" + ) + + where_ref(c, a, b, cond) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + try: + check_error( + LIBINFINIOP.infiniopCreateWhereDescriptor( + handle, + ctypes.byref(descriptor), + c.descriptor, + a.descriptor, + b.descriptor, + cond.descriptor, + ) + ) + except Exception as e: + # print(f"Skipping dtype {InfiniDtypeNames[inf_dt]} on {InfiniDeviceNames[device]}: CreateWhereDescriptor failed: {e}") + return + + for tensor in [a, b, c, cond]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetWhereWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, c.device) + + def lib_where(): + check_error( + LIBINFINIOP.infiniopWhere( + descriptor, + workspace.data(), + workspace.size(), + c.data(), + a.data(), + b.data(), + cond.data(), + None, + ) + ) + + lib_where() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, inf_dt) + if DEBUG: + debug(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(c.actual_tensor(), c.torch_tensor(), atol=atol, rtol=rtol) + + if PROFILE: + profile_operation("PyTorch", lambda: where_ref(c, a, b, cond), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: 
lib_where(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyWhereDescriptor(descriptor)) + + +def main(): + args = get_args() + global DEBUG, PROFILE, NUM_PRERUN, NUM_ITERATIONS + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + supported = [dt for dt in _TENSOR_DTYPES if is_supported_dt(dt)] + devices = get_test_devices(args) + + for device in devices: + test_operator(device, test, _TEST_CASES, supported) + + print("\033[92mTest passed!\033[0m") + + +if __name__ == "__main__": + main()
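+# As with the other operator tests, devices and options come from get_args()/get_test_devices(), and the library result must match the torch.where reference within the per-dtype tolerances above.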