diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..868f29d48 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,12 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/index_copy_inplace.h" +#include "infiniop/ops/gather.h" +#include "infiniop/ops/scatter.h" +#include "infiniop/ops/tril.h" +#include "infiniop/ops/triu.h" +#include "infiniop/ops/linear.h" +#include "infiniop/ops/linear_backward.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/gather.h b/include/infiniop/ops/gather.h new file mode 100644 index 000000000..9ffe310c9 --- /dev/null +++ b/include/infiniop/ops/gather.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_GATHER_API_H__ +#define __INFINIOP_GATHER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/index_copy_inplace.h b/include/infiniop/ops/index_copy_inplace.h new file mode 100644 index 000000000..e2266299a --- /dev/null +++ b/include/infiniop/ops/index_copy_inplace.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/linear.h b/include/infiniop/ops/linear.h new file mode 100644 index 000000000..4a81d4988 --- /dev/null +++ b/include/infiniop/ops/linear.h @@ -0,0 +1,32 @@ +#ifndef __INFINIOP_LINEAR_API_H__ +#define __INFINIOP_LINEAR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLinearDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLinearDescriptor( + infiniopHandle_t handle, + infiniopLinearDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +); + +__C __export infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, 
size_t *size); + +__C __export infiniStatus_t infiniopLinear( + infiniopLinearDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/linear_backward.h b/include/infiniop/ops/linear_backward.h new file mode 100644 index 000000000..de74668c7 --- /dev/null +++ b/include/infiniop/ops/linear_backward.h @@ -0,0 +1,35 @@ +#ifndef __INFINIOP_LINEAR_BACKWARD_API_H__ +#define __INFINIOP_LINEAR_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLinearBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLinearBackwardDescriptor( + infiniopHandle_t handle, + infiniopLinearBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +); + +__C __export infiniStatus_t infiniopGetLinearBackwardWorkspaceSize(infiniopLinearBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLinearBackward(infiniopLinearBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyLinearBackwardDescriptor(infiniopLinearBackwardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..22e0eff83 --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tril.h b/include/infiniop/ops/tril.h new file mode 100644 index 000000000..8ce215308 --- /dev/null +++ b/include/infiniop/ops/tril.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_TRIL_API_H__ +#define __INFINIOP_TRIL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTrilDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTrilDescriptor( + infiniopHandle_t handle, + infiniopTrilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +); + +__C __export infiniStatus_t infiniopGetTrilWorkspaceSize(infiniopTrilDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTril(infiniopTrilDescriptor_t desc, + void *workspace, + size_t 
workspace_size, + void * output, + const void * input, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyTrilDescriptor(infiniopTrilDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/triu.h b/include/infiniop/ops/triu.h new file mode 100644 index 000000000..bb7854767 --- /dev/null +++ b/include/infiniop/ops/triu.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_TRIU_API_H__ +#define __INFINIOP_TRIU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTriuDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTriuDescriptor( + infiniopHandle_t handle, + infiniopTriuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +); + +__C __export infiniStatus_t infiniopGetTriuWorkspaceSize(infiniopTriuDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTriu(infiniopTriuDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyTriuDescriptor(infiniopTriuDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..548a68025 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -12,18 +12,25 @@ def run_tests(args): failed = [] for test in [ - "add.py", - "attention.py", - "causal_softmax.py", - "clip.py", - "gemm.py", - "mul.py", - "random_sample.py", - "rearrange.py", - "rms_norm.py", - "rope.py", - "sub.py", - "swiglu.py", + # "add.py", + # "attention.py", + # "causal_softmax.py", + # "clip.py", + # "gemm.py", + # "mul.py", + # "random_sample.py", + # "rearrange.py", + # "rms_norm.py", + # "rope.py", + # "sub.py", + # "swiglu.py", + "index_copy_inplace.py", + "gather.py", + "scatter.py", + "tril.py", + "triu.py", + "linear.py", + "linear_backward.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..f59552709 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,13 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(index_copy_inplace) +DECLARE_INFINIOP_TEST(gather) +DECLARE_INFINIOP_TEST(scatter) +DECLARE_INFINIOP_TEST(tril) +DECLARE_INFINIOP_TEST(triu) +DECLARE_INFINIOP_TEST(linear) +DECLARE_INFINIOP_TEST(linear_backward) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +50,13 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(index_copy_inplace) \ + REGISTER_INFINIOP_TEST(gather) \ + REGISTER_INFINIOP_TEST(scatter) \ + REGISTER_INFINIOP_TEST(tril) \ + REGISTER_INFINIOP_TEST(triu) \ + REGISTER_INFINIOP_TEST(linear) \ + REGISTER_INFINIOP_TEST(linear_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/gather.cpp b/src/infiniop-test/src/ops/gather.cpp new file mode 100644 index 000000000..e1b998fe9 --- /dev/null +++ b/src/infiniop-test/src/ops/gather.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gather { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + 
std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGatherDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateGatherDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGatherWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGather(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGather( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gather diff --git a/src/infiniop-test/src/ops/index_copy_inplace.cpp b/src/infiniop-test/src/ops/index_copy_inplace.cpp new file mode 100644 index 000000000..9f5fb5be9 --- /dev/null +++ b/src/infiniop-test/src/ops/index_copy_inplace.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::index_copy_inplace { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> 
attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopIndexCopyInplaceDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateIndexCopyInplaceDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetIndexCopyInplaceWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopIndexCopyInplace(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopIndexCopyInplace( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::index_copy_inplace diff --git a/src/infiniop-test/src/ops/linear.cpp b/src/infiniop-test/src/ops/linear.cpp new file mode 100644 index 000000000..94e146b9e --- /dev/null +++ b/src/infiniop-test/src/ops/linear.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::linear { +struct Test::Attributes { + bool bias_exist; + std::shared_ptr y; + std::shared_ptr x; + std::shared_ptr w; + std::shared_ptr b; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + 
std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("y") == tensors.end() + || tensors.find("x") == tensors.end() + || tensors.find("w") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->y = tensors["y"]; + test->_attributes->x = tensors["x"]; + test->_attributes->w = tensors["w"]; + test->_attributes->b = tensors["b"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->bias_exist = *reinterpret_cast(attributes["bias_exist"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLinearDescriptor_t op_desc; + auto y = _attributes->y->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + auto w = _attributes->w->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + CHECK_OR(infiniopCreateLinearDescriptor(handle, &op_desc, + y->desc(), + x->desc(), + w->desc(), + (_attributes->bias_exist) ? (b->desc() ) : nullptr + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLinearWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLinear(op_desc, workspace, workspace_size, + y->data(), + x->data(), + w->data(), + (_attributes->bias_exist) ? b->data() : nullptr, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(y, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLinear( + op_desc, workspace, workspace_size, + y->data(), + x->data(), + w->data(), + (_attributes->bias_exist) ? b->data() : nullptr, + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"bias_exist"}; +} + +std::vector Test::tensor_names() { + return {"y", "x", "w", "b", "ans"}; +} + +std::vector Test::output_names() { + return {"y"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- y: " << _attributes->y->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- w: " << _attributes->w->info() << std::endl; + oss << "- b: " << (_attributes->bias_exist ? 
_attributes->b->info() : "null") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::linear diff --git a/src/infiniop-test/src/ops/linear_backward.cpp b/src/infiniop-test/src/ops/linear_backward.cpp new file mode 100644 index 000000000..5352b62a9 --- /dev/null +++ b/src/infiniop-test/src/ops/linear_backward.cpp @@ -0,0 +1,144 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::linear_backward { +struct Test::Attributes { + bool bias_exist; + std::shared_ptr grad_x; + std::shared_ptr grad_w; + std::shared_ptr grad_b; + std::shared_ptr grad_y; + std::shared_ptr x; + std::shared_ptr w; + std::shared_ptr ans_grad_x; + std::shared_ptr ans_grad_w; + std::shared_ptr ans_grad_b; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("grad_x") == tensors.end() + || tensors.find("grad_w") == tensors.end() + || tensors.find("grad_b") == tensors.end() + || tensors.find("grad_y") == tensors.end() + || tensors.find("x") == tensors.end() + || tensors.find("w") == tensors.end() + || tensors.find("ans_grad_x") == tensors.end() + || tensors.find("ans_grad_w") == tensors.end() + || tensors.find("ans_grad_b") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->grad_x = tensors["grad_x"]; + test->_attributes->grad_w = tensors["grad_w"]; + test->_attributes->grad_b = tensors["grad_b"]; + test->_attributes->grad_y = tensors["grad_y"]; + test->_attributes->x = tensors["x"]; + test->_attributes->w = tensors["w"]; + test->_attributes->ans_grad_x = tensors["ans_grad_x"]; + test->_attributes->ans_grad_w = tensors["ans_grad_w"]; + test->_attributes->ans_grad_b = tensors["ans_grad_b"]; + test->_attributes->bias_exist = *reinterpret_cast(attributes["bias_exist"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLinearBackwardDescriptor_t op_desc; + auto grad_x = _attributes->grad_x->to(device, device_id); + auto grad_w = _attributes->grad_w->to(device, device_id); + auto grad_b = _attributes->grad_b->to(device, device_id); + auto grad_y = _attributes->grad_y->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + auto w = _attributes->w->to(device, device_id); + CHECK_OR(infiniopCreateLinearBackwardDescriptor(handle, &op_desc, + grad_x->desc(), + grad_w->desc(), + (_attributes->bias_exist) ? grad_b->desc() : nullptr, + grad_y->desc(), + x->desc(), + w->desc() + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLinearBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLinearBackward(op_desc, workspace, workspace_size, + grad_x->data(), + grad_w->data(), + (_attributes->bias_exist) ? 
grad_b->data() : nullptr, + grad_y->data(), + x->data(), + w->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_x, _attributes->ans_grad_x, _rtol, _atol); + allClose(grad_w, _attributes->ans_grad_w, _rtol, _atol); + if (_attributes->bias_exist) + allClose(grad_b, _attributes->ans_grad_b, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLinearBackward( + op_desc, workspace, workspace_size, + grad_x->data(), + grad_w->data(), + (_attributes->bias_exist) ? grad_b->data() : nullptr, + grad_y->data(), + x->data(), + w->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"bias_exist"}; +} + +std::vector Test::tensor_names() { + return {"grad_x", "grad_w", "grad_b", "grad_y", "x", "w", "ans_grad_x", "ans_grad_w", "ans_grad_b"}; +} + +std::vector Test::output_names() { + return {"grad_x", "grad_w", "grad_b"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- grad_x: " << _attributes->grad_x->info() << std::endl; + oss << "- grad_w: " << _attributes->grad_w->info() << std::endl; + oss << "- grad_b: " << (_attributes->bias_exist ? _attributes->grad_b->info() : "null") << std::endl; + oss << "- grad_y: " << _attributes->grad_y->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- w: " << _attributes->w->info() << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::linear_backward diff --git a/src/infiniop-test/src/ops/scatter.cpp b/src/infiniop-test/src/ops/scatter.cpp new file mode 100644 index 000000000..691e338da --- /dev/null +++ b/src/infiniop-test/src/ops/scatter.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::scatter { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopScatterDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateScatterDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + 
), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetScatterWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopScatter(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopScatter( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::scatter diff --git a/src/infiniop-test/src/ops/tril.cpp b/src/infiniop-test/src/ops/tril.cpp new file mode 100644 index 000000000..4b025e6f7 --- /dev/null +++ b/src/infiniop-test/src/ops/tril.cpp @@ -0,0 +1,106 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tril { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + int diagonal; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->diagonal = *reinterpret_cast(attributes["diagonal"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTrilDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto diagonal = _attributes->diagonal; + CHECK_OR(infiniopCreateTrilDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + diagonal + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTrilWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace 
size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTril(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTril( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"diagonal"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- diagonal: " << _attributes->diagonal << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::tril diff --git a/src/infiniop-test/src/ops/triu.cpp b/src/infiniop-test/src/ops/triu.cpp new file mode 100644 index 000000000..f49240283 --- /dev/null +++ b/src/infiniop-test/src/ops/triu.cpp @@ -0,0 +1,106 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::triu { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + int diagonal; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->diagonal = *reinterpret_cast(attributes["diagonal"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTriuDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto diagonal = _attributes->diagonal; + CHECK_OR(infiniopCreateTriuDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + diagonal + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTriuWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTriu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { 
+ allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTriu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"diagonal"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- diagonal: " << _attributes->diagonal << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::triu diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.cc b/src/infiniop/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 000000000..cc1d5b740 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,106 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::gather::cpu { + +infiniStatus_t calculate_gather( + const GatherInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + int gather_dim = info.dim; + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for(size_t i = 0; i < total_size; i++) + { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = i; + for (int d = info.ndim - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + if(d != gather_dim) + input_ptr += dim_index * element_size * info.input_strides[d]; + } + int64_t gather_number = *index_ptr; + input_ptr += gather_number * element_size * info.input_strides[gather_dim]; + // *output_ptr = *input_ptr; + memcpy( + output_ptr, + input_ptr, + element_size + ); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size 
----------------------- + + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_gather(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} + diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.h b/src/infiniop/ops/gather/cpu/gather_cpu.h new file mode 100644 index 000000000..bc74fd669 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GATHER_CPU_H__ +#define __GATHER_CPU_H__ + +#include "../gather.h" + +DESCRIPTOR(cpu) + + +#endif // __GATHER_CPU_H__ diff --git a/src/infiniop/ops/gather/cuda/kernel.cuh b/src/infiniop/ops/gather/cuda/kernel.cuh new file mode 100644 index 000000000..e49ca0fd2 --- /dev/null +++ b/src/infiniop/ops/gather/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __GATHER_KERNEL_CUH__ +#define __GATHER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void gatherKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; d--) { + if (d == gather_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_gather_size; c ++) { + int64_t gather_number = *(index_ptr + c * index_strides[gather_dim]); + *(output_ptr + c * output_strides[gather_dim]) = \ + *(input_ptr + gather_number * input_strides[gather_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __GATHER_KERNEL_CUH__ diff --git a/src/infiniop/ops/gather/gather.h b/src/infiniop/ops/gather/gather.h new file mode 100644 index 000000000..70991f5db --- /dev/null +++ b/src/infiniop/ops/gather/gather.h @@ -0,0 +1,49 @@ +#ifndef __GATHER_H__ +#define __GATHER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::gather::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GatherInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + GatherInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + 
infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/gather/info.h b/src/infiniop/ops/gather/info.h new file mode 100644 index 000000000..aa7a54a16 --- /dev/null +++ b/src/infiniop/ops/gather/info.h @@ -0,0 +1,58 @@ +#ifndef __GATHER_INFO_H__ +#define __GATHER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::gather { + +class GatherInfo { +private: + GatherInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + size_t input_dim_size; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createGatherInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE(output_desc->shape(), index_desc->shape()); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++) { + if (d != dim) + CHECK_OR_RETURN(input_desc->dim(d) == output_desc->dim(d), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_OR_RETURN(ndim > dim, INFINI_STATUS_BAD_PARAM); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(GatherInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->dim(dim), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __GATHER_INFO_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.h b/src/infiniop/ops/gather/metax/gather_metax.h new file mode 100644 index 000000000..bf0a25a36 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.h @@ -0,0 +1,8 @@ +#ifndef __GATHER_METAX_H__ +#define __GATHER_METAX_H__ + +#include "../gather.h" + +DESCRIPTOR(metax) + +#endif // __GATHER_METAX_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.maca b/src/infiniop/ops/gather/metax/gather_metax.maca new file mode 100644 index 000000000..7254078c3 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "gather_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + 
index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_GATHER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + 
switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_GATHER_WITH_METAX_BLOCK + #undef CALCULATE_GATHER +} +} // namespace op::gather::metax diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cu b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu new file mode 100644 index 000000000..94741a7d5 --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu @@ -0,0 +1,189 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "gather_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = 
input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata><<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_GATHER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: 
\ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_GATHER_WITH_BLOCK_SIZE + #undef CALCULATE_GATHER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gather::nvidia diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh new file mode 100644 index 000000000..46d42fa0c --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GATHER_NVIDIA_API_H__ +#define __GATHER_NVIDIA_API_H__ +#include "../gather.h" + +DESCRIPTOR(nvidia) + +#endif // __GATHER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gather/operator.cc b/src/infiniop/ops/gather/operator.cc new file mode 100644 index 000000000..c748f811c --- /dev/null +++ b/src/infiniop/ops/gather/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gather.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gather_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gather_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gather_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gather::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + 
+#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..f45ddef7a --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,108 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../../rearrange/cpu/rearrange_cpu.h" +#include "../info.h" + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); +// ---------------------- end: check data type and calculate workspace size ----------------------- + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + size_t size_of_dtype = infiniSizeOf(_info.dtype); + auto 
index_ptr = reinterpret_cast(index); + + + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + size_of_dtype * _info.total_input_size; + + + reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream); + memset(workspace_out, 0, _info.total_output_size * size_of_dtype); + size_t copy_unit_size = _info.meta_strides[_info.dim] * size_of_dtype; + #pragma omp parallel for + for (size_t dst_index = 0; dst_index < _info.output_shape[_info.dim]; dst_index++) { + size_t src_index = _info.index_shape[0] - 1; + while (true) + { + if (*(index_ptr + src_index * _info.index_strides[0]) == int64_t(dst_index)) { + std::memcpy( + workspace_out + size_of_dtype * dst_index * _info.meta_strides[_info.dim], + workspace_in + size_of_dtype * src_index * _info.meta_strides[_info.dim], + copy_unit_size + ); + break; + } + else if (src_index == 0) + break; + src_index --; + } + } + reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream); + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..384197013 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,9 @@ +#ifndef __INDEX_COPY_INPLACE_CPU_H__ +#define __INDEX_COPY_INPLACE_CPU_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(cpu) + + +#endif // __INDEX_COPY_INPLACE_CPU_H__ diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..f99fd3116 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,55 @@ +#ifndef __INDEX_COPY_INPLACE_H__ +#define __INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define INDEX_COPY_INPLACE_DESCRIPTOR(NAMESPACE) \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + void *_rearrange_desc_in; \ + void *_rearrange_desc_out; \ + Descriptor( \ + infiniDtype_t dtype, \ + IndexCopyInplaceInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id, \ + void *rearrange_desc_in, \ + void *rearrange_desc_out \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _rearrange_desc_in(rearrange_desc_in), \ + _rearrange_desc_out(rearrange_desc_out) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/info.h b/src/infiniop/ops/index_copy_inplace/info.h new file mode 100644 index 000000000..99d6a3a1e --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/info.h @@ -0,0 +1,76 @@ +#ifndef 
__INDEX_COPY_INPLACE_INFO_H__ +#define __INDEX_COPY_INPLACE_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::index_copy_inplace { + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t total_input_size; + size_t total_output_size; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + std::vector meta_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + std::vector meta_strides(input_desc->ndim()); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + size_t total_input_size = 1; + size_t total_output_size = 1; + for (size_t d = 0; d < input_desc->ndim(); d++){ + total_input_size *= input_desc->dim(d); + total_output_size *= output_desc->dim(d); + if (d == dim) { + continue; + } + else { + meta_strides[d] = last_dim * last_stride; + last_dim = input_desc->dim(d); + last_stride = meta_strides[d]; + } + } + meta_strides[dim] = last_dim * last_stride; +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(IndexCopyInplaceInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + total_input_size, + total_output_size, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + meta_strides, + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __INDEX_COPY_INPLACE_INFO_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h new file mode 100644 index 000000000..65673c88c --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_INPLACE_METAX_H__ +#define __INDEX_COPY_INPLACE_METAX_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(metax) + +#endif // __INDEX_COPY_INPLACE_METAX_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca new file mode 100644 index 000000000..e540b6201 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca @@ -0,0 +1,147 @@ +#include "../../../devices/metax/metax_common.h" +#include "index_copy_inplace_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../../rearrange/metax/rearrange_metax.h" +#include "../info.h" + +namespace op::index_copy_inplace::metax { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + 
const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + hcStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_METAX(hcMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + hcMemcpyDeviceToHost, + stream + )); + hcStreamSynchronize(stream); + CHECK_METAX(hcMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + hcMemcpyDeviceToDevice, + stream + )); + hcStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + hcMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + + hcDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * 
_info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + hcDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; + +} +} // namespace op::index_copy_inplace::metax diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..ebf5907e0 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,140 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "index_copy_inplace_nvidia.cuh" +#include "../../rearrange/nvidia/rearrange_nvidia.cuh" +#include "../info.h" + +namespace op::index_copy_inplace::nvidia { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + cudaStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_CUDA(cudaMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + cudaMemcpyDeviceToHost, + stream + )); + cudaStreamSynchronize(stream); + CHECK_CUDA(cudaMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + cudaMemcpyDeviceToDevice, + stream + )); + cudaStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new 
Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + cudaMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + cudaDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + cudaDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..04c3c86f7 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#define __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(nvidia) + +#endif // __INDEX_COPY_INPLACE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..e8886b65f --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/index_copy_inplace_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/index_copy_inplace_metax.h" +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/linear/cpu/linear_cpu.cc b/src/infiniop/ops/linear/cpu/linear_cpu.cc new file mode 100644 index 000000000..78dd86997 --- /dev/null +++ b/src/infiniop/ops/linear/cpu/linear_cpu.cc @@ -0,0 +1,95 @@ +#include "linear_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::linear::cpu { + +template +infiniStatus_t calculate_linear( + const LinearInfo &info, + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + #pragma omp parallel for + for(size_t j = 0; j < info.out_features; j ++) + { + auto w_ptr = w + j * info.w_stride_out; + float y_sum = info.bias ? 
(utils::cast(*(b + j * info.b_stride))) : 0.; + for(size_t i = 0; i < info.in_features; i ++) + { + y_sum += utils::cast(*(x + i * info.x_stride)) * utils::cast(*(w_ptr + i * info.w_stride_in)); + } + *(y + j * info.y_stride) = utils::cast(y_sum); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = LinearInfo::createLinearInfo( + y_desc, + x_desc, + w_desc, + b_desc + ); + CHECK_RESULT(result); + const LinearInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream +) const { + #define CALCULATE_LINEAR(TDATA) \ + CHECK_STATUS(calculate_linear(_info, \ + (TDATA *)y, (const TDATA *)x, (const TDATA *)w, (const TDATA *)b)) + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_LINEAR(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_LINEAR(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_LINEAR(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + #undef CALCULATE_LINEAR + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/linear/cpu/linear_cpu.h b/src/infiniop/ops/linear/cpu/linear_cpu.h new file mode 100644 index 000000000..5caf2a8a8 --- /dev/null +++ b/src/infiniop/ops/linear/cpu/linear_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LINEAR_CPU_H__ +#define __LINEAR_CPU_H__ + +#include "../linear.h" + +DESCRIPTOR(cpu) + + +#endif // __LINEAR_CPU_H__ diff --git a/src/infiniop/ops/linear/cuda/kernel.cuh b/src/infiniop/ops/linear/cuda/kernel.cuh new file mode 100644 index 000000000..dbc2ec898 --- /dev/null +++ b/src/infiniop/ops/linear/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __LINEAR_KERNEL_CUH__ +#define __LINEAR_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void linearKernel( + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + size_t in_features, + size_t out_features, + ptrdiff_t y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + ptrdiff_t b_stride, + bool bias +) { + size_t y_index = blockIdx.x; + auto y_ptr = y + y_index * y_stride; + auto w_ptr = w + y_index * w_stride_out; + + Tcompute y_value = bias ? 
(Tcompute(*(b + y_index * b_stride))) : Tcompute(0); + for(size_t i = 0; i < in_features; i ++) + { + y_value += Tcompute(*(x + i * x_stride)) * Tcompute(*(w_ptr + i * w_stride_in)); + } + *y_ptr = y_value; +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __LINEAR_KERNEL_CUH__ diff --git a/src/infiniop/ops/linear/info.h b/src/infiniop/ops/linear/info.h new file mode 100644 index 000000000..eef5dc6ce --- /dev/null +++ b/src/infiniop/ops/linear/info.h @@ -0,0 +1,67 @@ +#ifndef __LINEAR_INFO_H__ +#define __LINEAR_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::linear { + +class LinearInfo { +private: + LinearInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t in_features; + size_t out_features; + ptrdiff_t y_stride; + ptrdiff_t x_stride; + ptrdiff_t w_stride_in; + ptrdiff_t w_stride_out; + ptrdiff_t b_stride; + bool bias; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createLinearInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + size_t in_features = x_desc->dim(0); + size_t out_features = y_desc->dim(0); + CHECK_OR_RETURN(x_desc->ndim() == 1 && y_desc->ndim() == 1 && w_desc->ndim() == 2 && \ + w_desc->dim(0) == out_features && w_desc->dim(1) == in_features, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + bool bias = (b_desc != nullptr); + if (bias) + CHECK_OR_RETURN( + b_desc->ndim() == 1 && b_desc->dim(0) == out_features, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + + +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(LinearInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + y_desc->dtype(), + x_desc->dim(0), + y_desc->dim(0), + y_desc->stride(0), + x_desc->stride(0), + w_desc->stride(1), + w_desc->stride(0), + bias ? 
b_desc->stride(0) : 0, + bias +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __LINEAR_INFO_H__ diff --git a/src/infiniop/ops/linear/linear.h b/src/infiniop/ops/linear/linear.h new file mode 100644 index 000000000..5e7e8e022 --- /dev/null +++ b/src/infiniop/ops/linear/linear.h @@ -0,0 +1,50 @@ +#ifndef __LINEAR_H__ +#define __LINEAR_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::linear::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LinearInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + LinearInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t w_desc, \ + infiniopTensorDescriptor_t b_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * y, \ + const void * x, \ + const void * w, \ + const void * b, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/linear/metax/linear_metax.h b/src/infiniop/ops/linear/metax/linear_metax.h new file mode 100644 index 000000000..736ef96b8 --- /dev/null +++ b/src/infiniop/ops/linear/metax/linear_metax.h @@ -0,0 +1,8 @@ +#ifndef __LINEAR_METAX_H__ +#define __LINEAR_METAX_H__ + +#include "../linear.h" + +DESCRIPTOR(metax) + +#endif // __LINEAR_METAX_H__ diff --git a/src/infiniop/ops/linear/metax/linear_metax.maca b/src/infiniop/ops/linear/metax/linear_metax.maca new file mode 100644 index 000000000..7b9765c79 --- /dev/null +++ b/src/infiniop/ops/linear/metax/linear_metax.maca @@ -0,0 +1,151 @@ +#include "../../../devices/metax/metax_common.h" +#include "linear_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + size_t in_features, + size_t out_features, + ptrdiff_t y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + ptrdiff_t b_stride, + bool bias +) { + linearKernel( + y, + x, + w, + b, + in_features, + out_features, + y_stride, + x_stride, + w_stride_out, + w_stride_in, + b_stride, + bias + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear( + const LinearInfo &info, + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + hcStream_t stream +) { + launchKernel<1, Tdata, float><<>>( + y, + x, + w, + b, + info.in_features, + info.out_features, + info.y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.b_stride, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel 
------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = LinearInfo::createLinearInfo( + y_desc, + x_desc, + w_desc, + b_desc + ); + CHECK_RESULT(result); + const LinearInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \ + calculate_linear(_info, (TDATA *)y, (const TDATA *)x, (const TDATA *)w, (const TDATA *)b, stream) + #define CALCULATE_LINEAR_WITH_METAX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_LINEAR_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_LINEAR_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_WITH_METAX_BLOCK + #undef CALCULATE_LINEAR +} +} // namespace op::linear::metax diff --git a/src/infiniop/ops/linear/nvidia/linear_nvidia.cu b/src/infiniop/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..98d897d12 --- /dev/null +++ b/src/infiniop/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,155 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "linear_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + size_t in_features, + size_t out_features, + ptrdiff_t y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + ptrdiff_t b_stride, + bool bias +) { + linearKernel( + y, + x, + w, + b, + in_features, + out_features, + y_stride, + x_stride, + w_stride_out, + w_stride_in, + b_stride, + bias + ); +} +// 
----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear( + const LinearInfo &info, + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + cudaStream_t stream +) { + + launchKernel<1, Tdata, float><<>>( + y, + x, + w, + b, + info.in_features, + info.out_features, + info.y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.b_stride, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; + //size_t workspace_size = reinterpret_cast(y_desc)->workspaceSize(); +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = LinearInfo::createLinearInfo( + y_desc, + x_desc, + w_desc, + b_desc + ); + CHECK_RESULT(result); + const LinearInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \ + calculate_linear(_info, (TDATA *)y, (const TDATA *)x, (const TDATA *)w, (const TDATA *)b, stream) + #define CALCULATE_LINEAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_WITH_BLOCK_SIZE + #undef CALCULATE_LINEAR +} +} // namespace op::linear::nvidia diff --git a/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh b/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh new file mode 100644 index 000000000..fdc3ddf64 --- /dev/null +++ 
b/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LINEAR_NVIDIA_API_H__ +#define __LINEAR_NVIDIA_API_H__ +#include "../linear.h" + +DESCRIPTOR(nvidia) + +#endif // __LINEAR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/linear/operator.cc b/src/infiniop/ops/linear/operator.cc new file mode 100644 index 000000000..7f741ffad --- /dev/null +++ b/src/infiniop/ops/linear/operator.cc @@ -0,0 +1,156 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/linear.h" + +#ifdef ENABLE_CPU_API +#include "cpu/linear_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/linear_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/linear_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLinearDescriptor( + infiniopHandle_t handle, + infiniopLinearDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::linear::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + w_desc, \ + b_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLinear( + infiniopLinearDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + y, \ + x, \ + w, \ + b, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.cc b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.cc new file mode 100644 index 000000000..006626a4a --- /dev/null +++ b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.cc @@ -0,0 +1,110 @@ +#include "linear_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::linear_backward::cpu { + +template +infiniStatus_t calculate_linear_backward( + const LinearBackwardInfo &info, + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + // #pragma omp parallel for + for (size_t i = 0; i < info.in_features; i ++) + { + auto w_ptr = w + i * info.w_stride_in; + auto grad_w_ptr = grad_w + i * info.grad_w_stride_in; + float grad_x_sum = 0.; + float x_value = utils::cast(*(x + i * info.x_stride)); + for (size_t j = 0; j < info.out_features; j ++) + { + float grad_y_value = utils::cast(*(grad_y + j * info.grad_y_stride)); + grad_x_sum += grad_y_value * utils::cast(*(w_ptr + j * info.w_stride_out)); + (*(grad_w_ptr + j * info.grad_w_stride_out)) = utils::cast(x_value * grad_y_value); + if (info.bias && i == 0) + (*(grad_b + j * info.grad_b_stride)) = utils::cast(grad_y_value); + } + + *(grad_x + i * info.grad_x_stride) = utils::cast(grad_x_sum); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = grad_y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = LinearBackwardInfo::createLinearBackwardInfo( + grad_x_desc, + grad_w_desc, + grad_b_desc, + grad_y_desc, + x_desc, + w_desc + ); + CHECK_RESULT(result); + const LinearBackwardInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream +) const { + + #define CALCULATE_LINEAR_BACKWARD(TDATA) \ + CHECK_STATUS(calculate_linear_backward(_info, \ + (TDATA *)grad_x, (TDATA *)grad_w, (TDATA *)grad_b, (const TDATA *)grad_y, (const TDATA *)x, (const TDATA *)w)) + + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_LINEAR_BACKWARD(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_LINEAR_BACKWARD(bf16_t); + 
} else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_LINEAR_BACKWARD(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + #undef CALCULATE_LINEAR_BACKWARD + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.h b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.h new file mode 100644 index 000000000..849638877 --- /dev/null +++ b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LINEAR_BACKWARD_CPU_H__ +#define __LINEAR_BACKWARD_CPU_H__ + +#include "../linear_backward.h" + +DESCRIPTOR(cpu) + + +#endif // __LINEAR_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/linear_backward/cuda/kernel.cuh b/src/infiniop/ops/linear_backward/cuda/kernel.cuh new file mode 100644 index 000000000..8ba6635ff --- /dev/null +++ b/src/infiniop/ops/linear_backward/cuda/kernel.cuh @@ -0,0 +1,41 @@ +#ifndef __LINEAR_BACKWARD_KERNEL_CUH__ +#define __LINEAR_BACKWARD_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void linearBackwardKernel( + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + size_t out_features, + ptrdiff_t grad_x_stride, + ptrdiff_t grad_w_stride_out, + ptrdiff_t grad_w_stride_in, + ptrdiff_t grad_b_stride, + ptrdiff_t grad_y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + bool bias +) { + size_t in_index = blockIdx.x; + + auto w_ptr = w + in_index * w_stride_in; + auto grad_w_ptr = grad_w + in_index * grad_w_stride_in; + Tcompute grad_x_sum = 0.; + Tcompute x_value = *(x + in_index * x_stride); + for (size_t j = 0; j < out_features; j ++) + { + Tcompute grad_y_value = *(grad_y + j * grad_y_stride); + grad_x_sum += grad_y_value * Tcompute(*(w_ptr + j * w_stride_out)); + (*(grad_w_ptr + j * grad_w_stride_out)) = x_value * grad_y_value; + if (bias && blockIdx.x == 0) + (*(grad_b + j * grad_b_stride)) = grad_y_value; + } + (*(grad_x + in_index * grad_x_stride)) = grad_x_sum; +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __LINEAR_BACKWARD_KERNEL_CUH__ diff --git a/src/infiniop/ops/linear_backward/info.h b/src/infiniop/ops/linear_backward/info.h new file mode 100644 index 000000000..0c474e6dc --- /dev/null +++ b/src/infiniop/ops/linear_backward/info.h @@ -0,0 +1,82 @@ +#ifndef __LINEAR_BACKWARD_INFO_H__ +#define __LINEAR_BACKWARD_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::linear_backward { + +class LinearBackwardInfo { +private: + LinearBackwardInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t in_features; + size_t out_features; + ptrdiff_t grad_x_stride; + ptrdiff_t grad_w_stride_out; + ptrdiff_t grad_w_stride_in; + ptrdiff_t grad_b_stride; + ptrdiff_t grad_y_stride; + ptrdiff_t x_stride; + ptrdiff_t w_stride_out; + ptrdiff_t w_stride_in; + bool bias; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createLinearBackwardInfo( + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc + ) { +// 
------------------------- start: check tensor shape and input validity ------------------------- + + CHECK_SAME_SHAPE(x_desc->shape(), grad_x_desc->shape()); + CHECK_SAME_SHAPE(w_desc->shape(), grad_w_desc->shape()); + CHECK_OR_RETURN( + w_desc->ndim() == 2 && x_desc->ndim() == 1 && grad_y_desc->ndim() == 1, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t out = grad_y_desc->dim(0); + size_t in = x_desc->dim(0); + CHECK_OR_RETURN( + w_desc->dim(0) == out && w_desc->dim(1) == in, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + bool bias = (grad_b_desc != nullptr); + if (bias) + CHECK_OR_RETURN( + grad_b_desc->ndim() == 1 && grad_b_desc->dim(0) == out, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + + +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(LinearBackwardInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + grad_y_desc->dtype(), + in, + out, + grad_x_desc->stride(0), + grad_w_desc->stride(0), + grad_w_desc->stride(1), + bias ? (grad_b_desc->stride(0)) : 0, + grad_y_desc->stride(0), + x_desc->stride(0), + w_desc->stride(0), + w_desc->stride(1), + bias +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __LINEAR_BACKWARD_INFO_H__ diff --git a/src/infiniop/ops/linear_backward/linear_backward.h b/src/infiniop/ops/linear_backward/linear_backward.h new file mode 100644 index 000000000..1c082fa3c --- /dev/null +++ b/src/infiniop/ops/linear_backward/linear_backward.h @@ -0,0 +1,54 @@ +#ifndef __LINEAR_BACKWARD_H__ +#define __LINEAR_BACKWARD_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::linear_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LinearBackwardInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + LinearBackwardInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_x_desc, \ + infiniopTensorDescriptor_t grad_w_desc, \ + infiniopTensorDescriptor_t grad_b_desc, \ + infiniopTensorDescriptor_t grad_y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t w_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * grad_x, \ + void * grad_w, \ + void * grad_b, \ + const void * grad_y, \ + const void * x, \ + const void * w, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/linear_backward/metax/linear_backward_metax.h b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.h new file mode 100644 index 000000000..b926cee89 --- /dev/null +++ b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __LINEAR_BACKWARD_METAX_H__ +#define __LINEAR_BACKWARD_METAX_H__ + +#include "../linear_backward.h" + +DESCRIPTOR(metax) + +#endif // __LINEAR_BACKWARD_METAX_H__ diff --git 
a/src/infiniop/ops/linear_backward/metax/linear_backward_metax.maca b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.maca new file mode 100644 index 000000000..b3a6e596f --- /dev/null +++ b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.maca @@ -0,0 +1,171 @@ +#include "../../../devices/metax/metax_common.h" +#include "linear_backward_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear_backward::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + size_t out_features, + ptrdiff_t grad_x_stride, + ptrdiff_t grad_w_stride_out, + ptrdiff_t grad_w_stride_in, + ptrdiff_t grad_b_stride, + ptrdiff_t grad_y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + bool bias +) { + linearBackwardKernel( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + out_features, + grad_x_stride, + grad_w_stride_out, + grad_w_stride_in, + grad_b_stride, + grad_y_stride, + x_stride, + w_stride_out, + w_stride_in, + bias + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear_backward( + const LinearBackwardInfo &info, + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + hcStream_t stream +) { + launchKernel<1, Tdata, float><<>>( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + info.out_features, + info.grad_x_stride, + info.grad_w_stride_out, + info.grad_w_stride_in, + info.grad_b_stride, + info.grad_y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = grad_x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = LinearBackwardInfo::createLinearBackwardInfo( + grad_x_desc, + grad_w_desc, + grad_b_desc, + grad_y_desc, + x_desc, + w_desc + ); + CHECK_RESULT(result); + const LinearBackwardInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return 
INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, TDATA) \ + calculate_linear_backward(_info, (TDATA *)grad_x, (TDATA *)grad_w, (TDATA *)grad_b, (const TDATA *)grad_y, (const TDATA *)x, (const TDATA *)w, stream) + #define CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK + #undef CALCULATE_LINEAR_BACKWARD +} +} // namespace op::linear_backward::metax diff --git a/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cu b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cu new file mode 100644 index 000000000..5102d1bb3 --- /dev/null +++ b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cu @@ -0,0 +1,172 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "linear_backward_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear_backward::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + size_t out_features, + ptrdiff_t grad_x_stride, + ptrdiff_t grad_w_stride_out, + ptrdiff_t grad_w_stride_in, + ptrdiff_t grad_b_stride, + ptrdiff_t grad_y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + bool bias +) { + linearBackwardKernel( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + out_features, + grad_x_stride, + grad_w_stride_out, + grad_w_stride_in, + grad_b_stride, + grad_y_stride, + x_stride, + w_stride_out, + w_stride_in, + bias + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear_backward( + const LinearBackwardInfo &info, + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + cudaStream_t stream +) { + launchKernel<1, Tdata, float><<>>( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + info.out_features, + info.grad_x_stride, + info.grad_w_stride_out, + info.grad_w_stride_in, + info.grad_b_stride, + info.grad_y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + 
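As an editorial aside (not part of the patch): the kernels wired up above implement the usual single-sample gradients of a fully connected layer. With W of shape out x in, x of length in, and g = dL/dy of length out, exactly as the shape checks in info.h require, the quantities being computed are:

    % Reference gradients for y = W x + b
    \frac{\partial L}{\partial x} = W^{\top} g \in \mathbb{R}^{in}, \qquad
    \frac{\partial L}{\partial W} = g\,x^{\top} \in \mathbb{R}^{out \times in}, \qquad
    \frac{\partial L}{\partial b} = g \in \mathbb{R}^{out}

grad_b is only written when the bias flag carried in LinearBackwardInfo is set.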
+Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = grad_x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = LinearBackwardInfo::createLinearBackwardInfo( + grad_x_desc, + grad_w_desc, + grad_b_desc, + grad_y_desc, + x_desc, + w_desc + ); + CHECK_RESULT(result); + const LinearBackwardInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, TDATA) \ + calculate_linear_backward(_info, (TDATA *)grad_x, (TDATA *)grad_w, (TDATA *)grad_b, (const TDATA *)grad_y, (const TDATA *)x, (const TDATA *)w, stream) + #define CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE + #undef CALCULATE_LINEAR_BACKWARD +} +} // namespace op::linear_backward::nvidia diff --git a/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cuh b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cuh new file mode 100644 index 000000000..e80ba5342 --- /dev/null +++ b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LINEAR_BACKWARD_NVIDIA_API_H__ +#define __LINEAR_BACKWARD_NVIDIA_API_H__ +#include "../linear_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __LINEAR_BACKWARD_NVIDIA_API_H__ diff --git a/src/infiniop/ops/linear_backward/operator.cc b/src/infiniop/ops/linear_backward/operator.cc new file mode 100644 index 000000000..677442d31 --- /dev/null +++ b/src/infiniop/ops/linear_backward/operator.cc @@ 
-0,0 +1,164 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/linear_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/linear_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/linear_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/linear_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLinearBackwardDescriptor( + infiniopHandle_t handle, + infiniopLinearBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::linear_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_x_desc, \ + grad_w_desc, \ + grad_b_desc, \ + grad_y_desc, \ + x_desc, \ + w_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLinearBackwardWorkspaceSize(infiniopLinearBackwardDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLinearBackward( + infiniopLinearBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + grad_x, \ + grad_w, \ + grad_b, \ + grad_y, \ + x, \ + w, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLinearBackwardDescriptor(infiniopLinearBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); 
+#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..c47a38ef9 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,108 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::scatter::cpu { + +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < info.ndim; d ++) + { + if (d == info.dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + int scatter_dim = int(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for (size_t n = 0; n < batch_size; n ++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = n; + for(int d = info.ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + input_ptr += dim_index * element_size * info.input_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + } + for (size_t c = 0; c < info.index_shape[scatter_dim]; c ++) { + int64_t scatter_number = *(index_ptr + c * info.index_strides[scatter_dim]); + memcpy( + output_ptr + scatter_number * element_size * info.output_strides[scatter_dim], + input_ptr + c * element_size * info.input_strides[scatter_dim], + element_size + ); + } + } + +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_scatter(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..bf2fcb7a1 --- /dev/null +++ 
b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + + +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..6c4de6ce5 --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __SCATTER_KERNEL_CUH__ +#define __SCATTER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template <unsigned int BLOCK_SIZE, typename Tdata> +__device__ void scatterKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for(int d = ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_scatter_size; c += BLOCK_SIZE) { + int64_t scatter_number = *(index_ptr + c * index_strides[scatter_dim]); + *(output_ptr + scatter_number * output_strides[scatter_dim]) = \ + *(input_ptr + c * input_strides[scatter_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __SCATTER_KERNEL_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..9f21e435c --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,67 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::scatter { + +class ScatterInfo { +private: + ScatterInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector<size_t> output_shape; + std::vector<size_t> input_shape; + std::vector<size_t> index_shape; + std::vector<ptrdiff_t> output_strides; + std::vector<ptrdiff_t> input_strides; + std::vector<ptrdiff_t> index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result<ScatterInfo> createScatterInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == output_desc->ndim() && output_desc->ndim() == index_desc->ndim(), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++){ + if(d != dim) { + CHECK_OR_RETURN( + index_desc->dim(d) <= input_desc->dim(d) && index_desc->dim(d) <= output_desc->dim(d), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + } + } + CHECK_OR_RETURN(index_desc->dim(dim) <= input_desc->dim(dim), INFINI_STATUS_BAD_TENSOR_SHAPE); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result<ScatterInfo>(ScatterInfo{ +// ------------------------------ start: create an instance of Info 
------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __SCATTER_INFO_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..d5ce0ef16 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_H__ +#define __SCATTER_METAX_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..1c742f60d --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "scatter_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + 
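    // Editorial note, not part of the original patch: blockIdx.x selects one slice with
    // every dimension except scatter_dim fixed (decoded through contiguous_strides), and
    // threads then stride over scatter_dim, so the launch above realizes
    //     output[..., index[..., c, ...], ...] = input[..., c, ...]
    // i.e. a PyTorch-style scatter along `dim`. Also note that `contiguous_strides` is
    // pageable host memory; hcMemcpyAsync from pageable memory is expected to return only
    // after the source has been staged (as its CUDA counterpart does), which is what makes
    // the delete[] below safe. If this buffer is ever switched to pinned memory, the stream
    // must be synchronized before freeing it.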
delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_SCATTER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_SCATTER_WITH_METAX_BLOCK + #undef CALCULATE_SCATTER +} +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..7d6e1a1a1 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,190 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" 
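Before the CUDA port of the same kernel, a minimal host-side reference of the intended semantics may help (editorial sketch, not part of the patch; plain contiguous 2-D case with dim == 1, arbitrary names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Reference: scatter along dim == 1 for a dense rows x cols problem.
// output[i][index[i][j]] = input[i][j], which is what scatterKernel computes
// once the stride bookkeeping is folded away.
int main() {
    const int rows = 2, cols = 3, out_cols = 4;
    std::vector<float> input = {1, 2, 3, 4, 5, 6};       // 2 x 3
    std::vector<int64_t> index = {0, 2, 3, 1, 0, 2};     // 2 x 3, every value < out_cols
    std::vector<float> output(rows * out_cols, 0.0f);    // 2 x 4, untouched slots keep 0

    for (int i = 0; i < rows; ++i)
        for (int j = 0; j < cols; ++j)
            output[i * out_cols + index[i * cols + j]] = input[i * cols + j];

    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < out_cols; ++j)
            std::printf("%g ", output[i * out_cols + j]);
        std::printf("\n");
    }
    return 0;
}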
+#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "scatter_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = ScatterInfo::createScatterInfo( + output_desc, + 
input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_SCATTER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_SCATTER_WITH_BLOCK_SIZE + #undef CALCULATE_SCATTER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..a199edb6e --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __SCATTER_NVIDIA_API_H__ +#define __SCATTER_NVIDIA_API_H__ +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..7c7de71b5 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/scatter_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..7d2deefc1 --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,49 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + ScatterInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t 
device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/tril/cpu/tril_cpu.cc b/src/infiniop/ops/tril/cpu/tril_cpu.cc new file mode 100644 index 000000000..3b2d2f312 --- /dev/null +++ b/src/infiniop/ops/tril/cpu/tril_cpu.cc @@ -0,0 +1,81 @@ +#include "tril_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::tril::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = TrilInfo::createTrilInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TrilInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) const { + + #pragma omp parallel for + for (int row = 0; row < _info.row_size; row ++) + { + auto output_ptr = reinterpret_cast(output) + row * _info.column_size * _info.elem_size; + auto input_ptr = reinterpret_cast(input) + row * _info.column_size * _info.elem_size; + if (0 > row + _info.diagonal) + memset( + output_ptr, + 0, + _info.elem_size * _info.column_size + ); + else if (_info.column_size - 1 <= row + _info.diagonal) + memcpy( + output_ptr, + input_ptr, + _info.elem_size * _info.column_size + ); + else { + memset( + output_ptr + _info.elem_size * (row + _info.diagonal + 1), + 0, + _info.elem_size * (_info.column_size - row - _info.diagonal - 1) + ); + memcpy( + output_ptr, + input_ptr, + _info.elem_size * (row + _info.diagonal + 1) + ); + } + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/tril/cpu/tril_cpu.h b/src/infiniop/ops/tril/cpu/tril_cpu.h new file mode 100644 index 000000000..1410af08b --- /dev/null +++ b/src/infiniop/ops/tril/cpu/tril_cpu.h @@ -0,0 +1,9 @@ +#ifndef __TRIL_CPU_H__ +#define __TRIL_CPU_H__ + +#include "../tril.h" + +DESCRIPTOR(cpu) + + +#endif // __TRIL_CPU_H__ diff --git a/src/infiniop/ops/tril/cuda/kernel.cuh b/src/infiniop/ops/tril/cuda/kernel.cuh new file mode 100644 index 000000000..07b163524 --- /dev/null +++ b/src/infiniop/ops/tril/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __TRIL_KERNEL_CUH__ +#define __TRIL_KERNEL_CUH__ +// ------------------------------- start: perform 
operator on CUDA -------------------------------- +template +__device__ void trilKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + int row = blockIdx.x; + auto output_ptr = output + row * column_size; + auto input_ptr = input + row * column_size; + for (int i = threadIdx.x; i < column_size; i += BLOCK_SIZE) + *(output_ptr + i) = (i >= row + diagonal + 1) ? (Tdata)0 : *(input_ptr + i); +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __TRIL_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/tril/info.h b/src/infiniop/ops/tril/info.h new file mode 100644 index 000000000..6e0c1d87b --- /dev/null +++ b/src/infiniop/ops/tril/info.h @@ -0,0 +1,52 @@ +#ifndef __TRIL_INFO_H__ +#define __TRIL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::tril { + +class TrilInfo { +private: + TrilInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t elem_size; + int row_size; + int column_size; + int diagonal; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createTrilInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE( + input_desc->shape(), output_desc->shape() + ); + CHECK_OR_RETURN( + input_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t elem_size = infiniSizeOf(input_desc->dtype()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(TrilInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + input_desc->dtype(), + elem_size, + int(input_desc->dim(0)), + int(input_desc->dim(1)), + diagonal +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __TRIL_INFO_H__ diff --git a/src/infiniop/ops/tril/metax/tril_metax.h b/src/infiniop/ops/tril/metax/tril_metax.h new file mode 100644 index 000000000..35c4a0f72 --- /dev/null +++ b/src/infiniop/ops/tril/metax/tril_metax.h @@ -0,0 +1,8 @@ +#ifndef __TRIL_METAX_H__ +#define __TRIL_METAX_H__ + +#include "../tril.h" + +DESCRIPTOR(metax) + +#endif // __TRIL_METAX_H__ diff --git a/src/infiniop/ops/tril/metax/tril_metax.maca b/src/infiniop/ops/tril/metax/tril_metax.maca new file mode 100644 index 000000000..e2c59b634 --- /dev/null +++ b/src/infiniop/ops/tril/metax/tril_metax.maca @@ -0,0 +1,128 @@ +#include "../../../devices/metax/metax_common.h" +#include "tril_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::tril::metax { +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + trilKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + 
infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = TrilInfo::createTrilInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TrilInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_TRIL(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) + + switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIL(bool); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIL(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIL(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIL(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIL(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIL(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIL(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIL(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIL(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIL(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIL(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIL(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_TRIL +} +} // namespace op::tril::metax diff --git a/src/infiniop/ops/tril/nvidia/tril_nvidia.cu b/src/infiniop/ops/tril/nvidia/tril_nvidia.cu new file mode 100644 index 000000000..2038a1e4a --- /dev/null +++ b/src/infiniop/ops/tril/nvidia/tril_nvidia.cu @@ -0,0 +1,122 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "tril_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::tril::nvidia { +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + trilKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = TrilInfo::createTrilInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TrilInfo 
&info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_TRIL(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) + switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIL(uint8_t); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIL(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIL(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIL(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIL(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIL(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIL(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIL(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIL(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIL(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIL(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIL(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tril::nvidia diff --git a/src/infiniop/ops/tril/nvidia/tril_nvidia.cuh b/src/infiniop/ops/tril/nvidia/tril_nvidia.cuh new file mode 100644 index 000000000..4db2fbb45 --- /dev/null +++ b/src/infiniop/ops/tril/nvidia/tril_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __TRIL_NVIDIA_API_H__ +#define __TRIL_NVIDIA_API_H__ +#include "../tril.h" + +DESCRIPTOR(nvidia) + +#endif // __TRIL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/tril/operator.cc b/src/infiniop/ops/tril/operator.cc new file mode 100644 index 000000000..f7f519bde --- /dev/null +++ b/src/infiniop/ops/tril/operator.cc @@ -0,0 +1,150 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tril.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tril_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tril_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tril_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTrilDescriptor( + infiniopHandle_t handle, + infiniopTrilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tril::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + diagonal \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTrilWorkspaceSize(infiniopTrilDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch 
(desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTril( + infiniopTrilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTrilDescriptor(infiniopTrilDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tril/tril.h b/src/infiniop/ops/tril/tril.h new file mode 100644 index 000000000..7d36709a6 --- /dev/null +++ b/src/infiniop/ops/tril/tril.h @@ -0,0 +1,47 @@ +#ifndef __TRIL_H__ +#define __TRIL_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::tril::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TrilInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + TrilInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + int diagonal \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/triu/cpu/triu_cpu.cc b/src/infiniop/ops/triu/cpu/triu_cpu.cc new file mode 100644 index 000000000..0650c5147 --- /dev/null +++ b/src/infiniop/ops/triu/cpu/triu_cpu.cc @@ -0,0 +1,82 @@ +#include "triu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::triu::cpu { + +Descriptor::~Descriptor() = 
default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = TriuInfo::createTriuInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TriuInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) const { + + #pragma omp parallel for + for (int row = 0; row < _info.row_size; row ++) + { + auto output_ptr = reinterpret_cast(output) + row * _info.column_size * _info.elem_size; + auto input_ptr = reinterpret_cast(input) + row * _info.column_size * _info.elem_size; + if (_info.column_size - 1 < row + _info.diagonal) + memset( + output_ptr, + 0, + _info.elem_size * _info.column_size + ); + else if (0 >= row + _info.diagonal) + memcpy( + output_ptr, + input_ptr, + _info.elem_size * _info.column_size + ); + else { + memset( + output_ptr, + 0, + _info.elem_size * (row + _info.diagonal) + ); + memcpy( + output_ptr + _info.elem_size * (row + _info.diagonal), + input_ptr + _info.elem_size * (row + _info.diagonal), + _info.elem_size * (_info.column_size - row - _info.diagonal) + ); + } + + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/triu/cpu/triu_cpu.h b/src/infiniop/ops/triu/cpu/triu_cpu.h new file mode 100644 index 000000000..f58acfe7e --- /dev/null +++ b/src/infiniop/ops/triu/cpu/triu_cpu.h @@ -0,0 +1,9 @@ +#ifndef __TRIU_CPU_H__ +#define __TRIU_CPU_H__ + +#include "../triu.h" + +DESCRIPTOR(cpu) + + +#endif // __TRIU_CPU_H__ diff --git a/src/infiniop/ops/triu/cuda/kernel.cuh b/src/infiniop/ops/triu/cuda/kernel.cuh new file mode 100644 index 000000000..934d2d213 --- /dev/null +++ b/src/infiniop/ops/triu/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __TRIU_KERNEL_CUH__ +#define __TRIU_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void triuKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + int row = blockIdx.x; + auto output_ptr = output + row * column_size; + auto input_ptr = input + row * column_size; + for (int i = threadIdx.x; i < column_size; i += BLOCK_SIZE) + *(output_ptr + i) = (i < row + diagonal) ? 
(Tdata)0 : *(input_ptr + i); +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __TRIU_KERNEL_CUH__ diff --git a/src/infiniop/ops/triu/info.h b/src/infiniop/ops/triu/info.h new file mode 100644 index 000000000..135e05c97 --- /dev/null +++ b/src/infiniop/ops/triu/info.h @@ -0,0 +1,52 @@ +#ifndef __TRIU_INFO_H__ +#define __TRIU_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::triu { + +class TriuInfo { +private: + TriuInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t elem_size; + int row_size; + int column_size; + int diagonal; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createTriuInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE( + input_desc->shape(), output_desc->shape() + ); + CHECK_OR_RETURN( + input_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t elem_size = infiniSizeOf(input_desc->dtype()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(TriuInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + input_desc->dtype(), + elem_size, + int(input_desc->dim(0)), + int(input_desc->dim(1)), + diagonal +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __TRIU_INFO_H__ diff --git a/src/infiniop/ops/triu/metax/triu_metax.h b/src/infiniop/ops/triu/metax/triu_metax.h new file mode 100644 index 000000000..631d9188c --- /dev/null +++ b/src/infiniop/ops/triu/metax/triu_metax.h @@ -0,0 +1,8 @@ +#ifndef __TRIU_METAX_H__ +#define __TRIU_METAX_H__ + +#include "../triu.h" + +DESCRIPTOR(metax) + +#endif // __TRIU_METAX_H__ diff --git a/src/infiniop/ops/triu/metax/triu_metax.maca b/src/infiniop/ops/triu/metax/triu_metax.maca new file mode 100644 index 000000000..59e3275e1 --- /dev/null +++ b/src/infiniop/ops/triu/metax/triu_metax.maca @@ -0,0 +1,128 @@ +#include "../../../devices/metax/metax_common.h" +#include "triu_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::triu::metax { +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + triuKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = TriuInfo::createTriuInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TriuInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// 
---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_TRIU(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) + + switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIU(bool); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIU(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIU(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIU(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIU(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIU(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIU(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIU(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIU(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIU(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIU(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIU(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_TRIU +} +} // namespace op::triu::metax diff --git a/src/infiniop/ops/triu/nvidia/triu_nvidia.cu b/src/infiniop/ops/triu/nvidia/triu_nvidia.cu new file mode 100644 index 000000000..51761cf95 --- /dev/null +++ b/src/infiniop/ops/triu/nvidia/triu_nvidia.cu @@ -0,0 +1,123 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "triu_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::triu::nvidia { + +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + triuKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = TriuInfo::createTriuInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TriuInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return 
INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_TRIU(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) +switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIU(bool); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIU(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIU(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIU(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIU(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIU(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIU(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIU(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIU(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIU(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIU(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIU(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::triu::nvidia diff --git a/src/infiniop/ops/triu/nvidia/triu_nvidia.cuh b/src/infiniop/ops/triu/nvidia/triu_nvidia.cuh new file mode 100644 index 000000000..1f86ebbd3 --- /dev/null +++ b/src/infiniop/ops/triu/nvidia/triu_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __TRIU_NVIDIA_API_H__ +#define __TRIU_NVIDIA_API_H__ +#include "../triu.h" + +DESCRIPTOR(nvidia) + +#endif // __TRIU_NVIDIA_API_H__ diff --git a/src/infiniop/ops/triu/operator.cc b/src/infiniop/ops/triu/operator.cc new file mode 100644 index 000000000..c7a27a4ee --- /dev/null +++ b/src/infiniop/ops/triu/operator.cc @@ -0,0 +1,150 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/triu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/triu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/triu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/triu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTriuDescriptor( + infiniopHandle_t handle, + infiniopTriuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::triu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + diagonal \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTriuWorkspaceSize(infiniopTriuDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} 
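For reference, the public Triu entry points dispatched above are meant to be driven in the usual create → query workspace → execute → destroy sequence. The following is a minimal host-side sketch of that flow using only the functions declared in this patch; it assumes the handle and the two tensor descriptors were created elsewhere, and it allocates the workspace with plain malloc and passes a NULL stream, which is only appropriate for the CPU backend (device backends would allocate the workspace on the device and pass a real stream).

#include <stdlib.h>
#include "infiniop.h"

/* Sketch only: `handle`, `output_desc`, and `input_desc` are assumed to have
 * been created by the caller; the malloc'd workspace and NULL stream are
 * CPU-backend assumptions. */
static infiniStatus_t run_triu(infiniopHandle_t handle,
                               infiniopTensorDescriptor_t output_desc,
                               infiniopTensorDescriptor_t input_desc,
                               void *output, const void *input, int diagonal) {
    infiniopTriuDescriptor_t desc;
    infiniStatus_t status = infiniopCreateTriuDescriptor(
        handle, &desc, output_desc, input_desc, diagonal);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    /* Ask the operator how much scratch memory it needs (0 for this CPU path). */
    size_t workspace_size = 0;
    status = infiniopGetTriuWorkspaceSize(desc, &workspace_size);
    if (status != INFINI_STATUS_SUCCESS) {
        infiniopDestroyTriuDescriptor(desc);
        return status;
    }

    void *workspace = workspace_size ? malloc(workspace_size) : NULL;
    status = infiniopTriu(desc, workspace, workspace_size, output, input,
                          /*stream=*/NULL);

    free(workspace);
    infiniopDestroyTriuDescriptor(desc);
    return status;
}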
+ +__C infiniStatus_t infiniopTriu( + infiniopTriuDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTriuDescriptor(infiniopTriuDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/triu/triu.h b/src/infiniop/ops/triu/triu.h new file mode 100644 index 000000000..c519e14f6 --- /dev/null +++ b/src/infiniop/ops/triu/triu.h @@ -0,0 +1,47 @@ +#ifndef __TRIU_H__ +#define __TRIU_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::triu::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TriuInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + TriuInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + int diagonal \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/gather.py b/test/infiniop-test/test_generate/testcases/gather.py new file mode 100644 index 000000000..88e2abec6 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/gather.py @@ -0,0 +1,118 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def gather( + output: np.ndarray, + input: np.ndarray, + index: np.ndarray, + dim: int, +): + torch_output = torch.from_numpy(output) + torch_input = torch.from_numpy(input) + torch_index = torch.from_numpy(index) + + torch.gather(torch_input, dim, torch_index, out=torch_output) + return torch_output.detach().numpy() + + +class GatherTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + index: np.ndarray, + index_shape: List[int], + index_strides: List[int], + dim: int, + ): + super().__init__("gather") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.index = index + self.index_shape = index_shape + self.index_strides = index_strides + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_array(test_writer.gguf_key("index.shape"), self.index_shape) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + if self.index_strides is not None: + test_writer.add_array(test_writer.gguf_key("index.strides"), gguf_strides(*self.index_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("index"), self.index, raw_dtype=np_dtype_to_ggml(self.index.dtype) + ) + + ans = gather( + self.output, + self.input, + self.index, + self.dim, + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("gather.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # input_shape,output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [107, 10, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int32, np.int64, np.bool_] + for dtype in _TENSOR_DTYPES_: + for input_shape, output_shape, dim, input_strides, output_strides, index_strides in _TEST_CASES_: + input = (np.random.rand(*input_shape) * 100).astype(dtype) + index = np.random.randint(low=0, 
high=input_shape[dim], size=output_shape, dtype=np.int64) + output = np.empty(output_shape, dtype=dtype) + + test_case = GatherTestCase( + output=output, + output_shape=output_shape, + output_strides=output_strides, + input=input, + input_shape=input_shape, + input_strides=input_strides, + index=index, + index_shape=output_shape, + index_strides=index_strides, + dim=dim, + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/index_copy_inplace.py b/test/infiniop-test/test_generate/testcases/index_copy_inplace.py new file mode 100644 index 000000000..7bb314cbc --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/index_copy_inplace.py @@ -0,0 +1,121 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch +import random + +def index_copy_inplace( + output: np.ndarray, + input: np.ndarray, + index: np.ndarray, + dim: int, +): + torch_output = torch.from_numpy(output) + torch_input = torch.from_numpy(input) + torch_index = torch.from_numpy(index) + torch_output.index_copy_(dim, torch_index, torch_input) + return torch_output.detach().numpy() + + +class IndexCopyInplaceTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + index: np.ndarray, + index_shape: List[int], + index_strides: List[int], + dim: int, + ): + super().__init__("index_copy_inplace") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.index = index + self.index_shape = index_shape + self.index_strides = index_strides + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_array(test_writer.gguf_key("index.shape"), self.index_shape) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + if self.index_strides is not None: + test_writer.add_array(test_writer.gguf_key("index.strides"), gguf_strides(*self.index_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("index"), self.index, raw_dtype=np_dtype_to_ggml(self.index.dtype) + ) + + ans = index_copy_inplace( + self.output, + self.input, + self.index, + self.dim, + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = 
InfiniopTestWriter("index_copy_inplace.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # input_shape, output_shape, dim, output_strides, input_strides, index_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], None), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], [2]), + ([133, 23, 53], [133, 23, 53], 1, None, None, None), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, None), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int16, np.int32, np.bool_] + for dtype in _TENSOR_DTYPES_: + for input_shape, output_shape, dim, output_strides, input_strides, index_strides in _TEST_CASES_: + input = np.random.rand(*input_shape).astype(dtype) + + index_list = list(range(output_shape[dim])) + random.shuffle(index_list) + index = np.array(index_list[:input_shape[dim]], dtype=np.int64) + output = np.zeros(output_shape, dtype=dtype) + + test_case = IndexCopyInplaceTestCase( + output=output, + output_shape=output_shape, + output_strides=output_strides, + input=input, + input_shape=input_shape, + input_strides=input_strides, + index=index, + index_shape=[input_shape[dim]], + index_strides=index_strides, + dim=dim + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/linear.py b/test/infiniop-test/test_generate/testcases/linear.py new file mode 100644 index 000000000..8bf842d9d --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/linear.py @@ -0,0 +1,138 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def linear( + x: np.ndarray, + w: np.ndarray, + b: np.ndarray, + bias_exist: bool +): + torch_x = torch.from_numpy(x) + torch_w = torch.from_numpy(w) + if bias_exist: + torch_b = torch.from_numpy(b) + return torch.nn.functional.linear(torch_x, torch_w, bias=(torch_b if bias_exist else None)).detach().numpy() + + +class LinearTestCase(InfiniopTestCase): + def __init__( + self, + y: np.ndarray, + y_shape: List[int], + y_strides: List[int], + x: np.ndarray, + x_shape: List[int], + x_strides: List[int], + w: np.ndarray, + w_shape: List[int], + w_strides: List[int], + b: np.ndarray, + b_shape: List[int], + b_strides: List[int], + bias_exist: bool + ): + super().__init__("linear") + self.y = y + self.y_shape = y_shape + self.y_strides = y_strides + self.x = x + self.x_shape = x_shape + self.x_strides = x_strides + self.w = w + self.w_shape = w_shape + self.w_strides = w_strides + self.b = b + self.b_shape = b_shape + self.b_strides = b_strides + self.bias_exist = bias_exist + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) + test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + test_writer.add_array(test_writer.gguf_key("w.shape"), self.w_shape) + test_writer.add_array(test_writer.gguf_key("b.shape"), self.b_shape) + test_writer.add_bool(test_writer.gguf_key("bias_exist"), self.bias_exist) + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + if self.w_strides is not None: + test_writer.add_array(test_writer.gguf_key("w.strides"), gguf_strides(*self.w_strides)) + if self.b_strides is not None: + test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.b_strides)) + test_writer.add_array( + test_writer.gguf_key("y.strides"), + gguf_strides(*(self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.y_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("w"), self.w, raw_dtype=np_dtype_to_ggml(self.w.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.y.dtype) + ) + + ans = linear( + self.x.astype(np.float64), + self.w.astype(np.float64), + self.b.astype(np.float64), + self.bias_exist + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("linear.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # in_features, out_features, bias_exist, x_strides, y_strides, w_strides + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: 
+ for in_features, out_features, bias_exist, x_strides, y_strides, w_strides in _TEST_CASES_: + x = np.random.rand(in_features).astype(dtype) + w = np.random.rand(out_features, in_features).astype(dtype) + if bias_exist: + b = np.random.rand(out_features).astype(dtype) + else: + b = np.empty(shape=[], dtype=dtype) + y = np.empty(out_features, dtype=dtype) + + test_case = LinearTestCase( + y=y, + y_shape=[out_features], + y_strides=y_strides, + x=x, + x_shape=[in_features], + x_strides=x_strides, + w=w, + w_shape=[out_features, in_features], + w_strides=w_strides, + b=b, + b_shape=b.shape, + b_strides=None, + bias_exist=bias_exist + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/linear_backward.py b/test/infiniop-test/test_generate/testcases/linear_backward.py new file mode 100644 index 000000000..551f077da --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/linear_backward.py @@ -0,0 +1,196 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def linear_backward( + grad_y: np.ndarray, + x: np.ndarray, + w: np.ndarray, + b: np.ndarray, + bias_exist: bool +): + torch_grad_y = torch.from_numpy(grad_y) + torch_x = torch.from_numpy(x) + torch_w = torch.from_numpy(w) + + torch_x.requires_grad_(True) + torch_w.requires_grad_(True) + if bias_exist: + torch_b = torch.from_numpy(b) + torch_b.requires_grad_(True) + + torch_y = torch.nn.functional.linear(torch_x, torch_w, bias=(torch_b if bias_exist else None)) + torch_y.backward(torch_grad_y) + return torch_x.grad.detach().numpy(), \ + torch_w.grad.detach().numpy(), \ + torch_b.grad.detach().numpy() if bias_exist else np.empty([], dtype=np.float64) + + +class LinearBackwardTestCase(InfiniopTestCase): + def __init__( + self, + grad_x: np.ndarray, + grad_x_shape: List[int], + grad_x_strides: List[int], + grad_w: np.ndarray, + grad_w_shape: List[int], + grad_w_strides: List[int], + grad_b: np.ndarray, + grad_b_shape: List[int], + grad_b_strides: List[int], + grad_y: np.ndarray, + grad_y_shape: List[int], + grad_y_strides: List[int], + x: np.ndarray, + x_shape: List[int], + x_strides: List[int], + w: np.ndarray, + w_shape: List[int], + w_strides: List[int], + b: np.ndarray, + bias_exist: bool + ): + super().__init__("linear_backward") + self.grad_x = grad_x + self.grad_x_shape = grad_x_shape + self.grad_x_strides = grad_x_strides + self.grad_w = grad_w + self.grad_w_shape = grad_w_shape + self.grad_w_strides = grad_w_strides + self.grad_b = grad_b + self.grad_b_shape = grad_b_shape + self.grad_b_strides = grad_b_strides + self.grad_y = grad_y + self.grad_y_shape = grad_y_shape + self.grad_y_strides = grad_y_strides + self.x = x + self.x_shape = x_shape + self.x_strides = x_strides + self.w = w + self.w_shape = w_shape + self.w_strides = w_strides + + self.b = b + self.bias_exist = bias_exist + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("grad_x.shape"), self.grad_x_shape) + test_writer.add_array(test_writer.gguf_key("grad_w.shape"), self.grad_w_shape) + test_writer.add_array(test_writer.gguf_key("grad_b.shape"), self.grad_b_shape) + test_writer.add_array(test_writer.gguf_key("grad_y.shape"), self.grad_y_shape) + 
test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + test_writer.add_array(test_writer.gguf_key("w.shape"), self.w_shape) + test_writer.add_bool(test_writer.gguf_key("bias_exist"), self.bias_exist) + if self.grad_y_strides is not None: + test_writer.add_array(test_writer.gguf_key("grad_y.strides"), gguf_strides(*self.grad_y_strides)) + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + if self.w_strides is not None: + test_writer.add_array(test_writer.gguf_key("w.strides"), gguf_strides(*self.w_strides)) + test_writer.add_array( + test_writer.gguf_key("grad_x.strides"), + gguf_strides(*(self.grad_x_strides if self.grad_x_strides is not None else contiguous_gguf_strides(self.grad_x_shape))) + ) + test_writer.add_array( + test_writer.gguf_key("grad_w.strides"), + gguf_strides(*(self.grad_w_strides if self.grad_w_strides is not None else contiguous_gguf_strides(self.grad_w_shape))) + ) + test_writer.add_array( + test_writer.gguf_key("grad_b.strides"), + gguf_strides(*(self.grad_b_strides if self.grad_b_strides is not None else contiguous_gguf_strides(self.grad_b_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_x"), self.grad_x, raw_dtype=np_dtype_to_ggml(self.grad_x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_w"), self.grad_w, raw_dtype=np_dtype_to_ggml(self.grad_w.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_b"), self.grad_b, raw_dtype=np_dtype_to_ggml(self.grad_b.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_y"), self.grad_y, raw_dtype=np_dtype_to_ggml(self.grad_y.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("w"), self.w, raw_dtype=np_dtype_to_ggml(self.w.dtype) + ) + + ans_grad_x, ans_grad_w, ans_grad_b = linear_backward( + self.grad_y.astype(np.float64), + self.x.astype(np.float64), + self.w.astype(np.float64), + self.b.astype(np.float64), + self.bias_exist + ) + + test_writer.add_tensor( + test_writer.gguf_key("ans_grad_x"), ans_grad_x, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + test_writer.add_tensor( + test_writer.gguf_key("ans_grad_w"), ans_grad_w, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + test_writer.add_tensor( + test_writer.gguf_key("ans_grad_b"), ans_grad_b, raw_dtype=gguf.GGMLQuantizationType.F64 + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("linear_backward.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # in_features, out_features, bias_exist, grad_x_strides, grad_y_strides, grad_w_strides + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for in_features, out_features, bias_exist, grad_x_strides, grad_y_strides, grad_w_strides in _TEST_CASES_: + grad_y = np.random.rand(out_features).astype(dtype) + x = np.random.rand(in_features).astype(dtype) + w = np.random.rand(out_features, in_features).astype(dtype) + b = 
np.random.rand(out_features).astype(dtype) if bias_exist else \ + np.empty([], dtype=dtype) + grad_x = np.empty([in_features], dtype=dtype) + grad_w = np.empty([out_features, in_features], dtype=dtype) + grad_b = np.empty([out_features], dtype=dtype) + + test_case = LinearBackwardTestCase( + grad_x=grad_x, + grad_x_shape=[in_features], + grad_x_strides=grad_x_strides, + grad_w=grad_w, + grad_w_shape=[out_features, in_features], + grad_w_strides=grad_w_strides, + grad_b=grad_b, + grad_b_shape=[out_features], + grad_b_strides=None, + grad_y=grad_y, + grad_y_shape=[out_features], + grad_y_strides=grad_y_strides, + x=x, + x_shape=[in_features], + x_strides=None, + w=w, + w_shape=[out_features, in_features], + w_strides=None, + b=b, + bias_exist=bias_exist + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/scatter.py b/test/infiniop-test/test_generate/testcases/scatter.py new file mode 100644 index 000000000..ea383c7fb --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/scatter.py @@ -0,0 +1,148 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided +import random +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def scatter( + output: np.ndarray, + input: np.ndarray, + index: np.ndarray, + dim: int, +): + torch_output = torch.from_numpy(output) + torch_input = torch.from_numpy(input) + torch_index = torch.from_numpy(index) + + torch_output.scatter_(dim, torch_index, src=torch_input) + return torch_output.detach().numpy() + + +class ScatterTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + index: np.ndarray, + index_shape: List[int], + index_strides: List[int], + dim: int, + ): + super().__init__("scatter") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.index = index + self.index_shape = index_shape + self.index_strides = index_strides + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_array(test_writer.gguf_key("index.shape"), self.index_shape) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + if self.index_strides is not None: + test_writer.add_array(test_writer.gguf_key("index.strides"), gguf_strides(*self.index_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("index"), self.index, 
raw_dtype=np_dtype_to_ggml(self.index.dtype) + ) + + ans = scatter( + self.output, + self.input, + self.index, + self.dim, + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + # Build an index tensor whose entries are unique along `scatter_dim`, so the scatter result is deterministic. + index = np.empty(index_shape, dtype=np.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input_shape[0]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[i, :] = row[:index_shape[scatter_dim]] + elif ndim == 3 and scatter_dim == 2: + for i in range(input_shape[0]): + for j in range(input_shape[1]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[i, j, :] = row[:index_shape[scatter_dim]] + elif ndim == 3 and scatter_dim == 1: + for i in range(input_shape[0]): + for j in range(input_shape[2]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[i, :, j] = row[:index_shape[scatter_dim]] + elif ndim == 4 and scatter_dim == 0: + for i in range(input_shape[1]): + for j in range(input_shape[2]): + for k in range(input_shape[3]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[:, i, j, k] = row[:index_shape[scatter_dim]] + return index + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("scatter.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (55, 2), (55, 2), (55, 2)), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, (30, 5, 1), None, [107, 10, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int32, np.int64, np.bool_] + for dtype in _TENSOR_DTYPES_: + for input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides in _TEST_CASES_: + input = (np.random.rand(*input_shape) * 100).astype(dtype) + index = get_test_index_tensor(input_shape, index_shape, output_shape, dim) + + output = np.zeros(output_shape, dtype=dtype) + + test_case = ScatterTestCase( + output=output, + output_shape=output_shape, + output_strides=output_strides, + input=input, + input_shape=input_shape, + input_strides=input_strides, + index=index, + index_shape=index_shape, + index_strides=index_strides, + dim=dim, + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/tril.py b/test/infiniop-test/test_generate/testcases/tril.py new file mode 100644 index 000000000..bfd606473 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/tril.py @@ -0,0 +1,118 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from ..
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch +from enum import Enum, auto +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +def tril( + output: np.ndarray, + input: np.ndarray, + diagonal: int, + inplace: Inplace +): + torch_input = torch.from_numpy(input) + if inplace == Inplace.INPLACE: + torch_output = torch_input + else: + torch_output = torch.from_numpy(output) + + torch.tril(torch_input, diagonal, out=torch_output) + + return torch_output.detach().numpy() + + +class TrilTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + diagonal: int, + inplace: Inplace + ): + super().__init__("tril") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.diagonal = diagonal + self.inplace = inplace + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_int64(test_writer.gguf_key("diagonal"), self.diagonal) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + ans = tril( + self.output, + self.input, + self.diagonal, + self.inplace + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("tril.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + ((5, 6), 0), + ((4, 5), -1), + ((61, 71), 2), + ((111, 222), 33), + ((1001, 1001), -101) + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int64, np.int32, np.bool_] + for dtype in _TENSOR_DTYPES_: + for shape, diagonal in _TEST_CASES_: + for inplace in _INPLACE: + input = np.random.rand(*shape).astype(dtype) + if inplace == Inplace.INPLACE: + output = input + else: + output = np.empty(shape, dtype=dtype) + test_case = TrilTestCase( + output=output, + output_shape=shape, + output_strides=None, + input=input, + input_shape=shape, + input_strides=None, + diagonal=diagonal, + inplace=inplace + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/triu.py b/test/infiniop-test/test_generate/testcases/triu.py new file mode 100644 index 000000000..3c0b965a2 --- /dev/null +++ 
b/test/infiniop-test/test_generate/testcases/triu.py @@ -0,0 +1,120 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch +from enum import Enum, auto +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +def triu( + output: np.ndarray, + input: np.ndarray, + diagonal: int, + inplace: Inplace +): + torch_input = torch.from_numpy(input) + if inplace == Inplace.INPLACE: + torch_output = torch_input + else: + torch_output = torch.from_numpy(output) + + torch.triu(torch_input, diagonal, out=torch_output) + + return torch_output.detach().numpy() + + +class TriuTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + diagonal: int, + inplace: Inplace + ): + super().__init__("triu") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.diagonal = diagonal + self.inplace = inplace + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_int64(test_writer.gguf_key("diagonal"), self.diagonal) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + + ans = triu( + self.output, + self.input, + self.diagonal, + self.inplace + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("triu.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # shape, diagonal + ((5, 6), 0), + ((4, 5), -1), + ((61, 71), 2), + ((111, 222), 33), + ((1001, 1001), -101) + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int64, np.int32, np.bool_] + for dtype in _TENSOR_DTYPES_: + for shape, diagonal in _TEST_CASES_: + for inplace in _INPLACE: + input = np.random.rand(*shape).astype(dtype) + if inplace == Inplace.INPLACE: + output = input + else: + output = np.empty(shape, dtype=dtype) + test_case = TriuTestCase( + output=output, + output_shape=shape, + output_strides=None, + input=input, + input_shape=shape, + input_strides=None, + diagonal=diagonal, + inplace=inplace + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + 
test_writer.save() diff --git a/test/infiniop/gather.py new file mode 100644 index 000000000..d22a17811 --- /dev/null +++ b/test/infiniop/gather.py @@ -0,0 +1,161 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + # input_shape, output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None) +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_gather(output, input, dim, index): + torch.gather(input, dim, index, out=output) + +def test( + handle, + device, + input_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing Gather on {InfiniDeviceNames[device]} with input shape:{input_shape}, dim:{dim}, output_shape:{output_shape}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device + ) + torch_index = torch.randint(low=0, high=input_shape[dim], size=output_shape, dtype=torch.int64) + if index_strides: + torch_index = torch_index.as_strided(output_shape, index_strides) + index = TestTensor( + output_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + ) + + torch_gather(output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGatherWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_gather(): + check_error( + LIBINFINIOP.infiniopGather( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_gather() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt:
off + profile_operation("PyTorch", lambda: torch_gather( + output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gather(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGatherDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my Gather passed!\033[0m") diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..97dbd8266 --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,180 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_TEST_CASES = [ + # input_shape, output_shape, dim, output_strides, input_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], Inplace.OUT_OF_PLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.INPLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.OUT_OF_PLACE), + ([133, 23, 53], [133, 23, 53], 1, None, None, Inplace.OUT_OF_PLACE), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, Inplace.OUT_OF_PLACE), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_index_copy_inplace(output, input, index, dim): + output.index_copy_(dim, index, input.clone()) + + +def test( + handle, + device, + input_shape, output_shape, dim, output_strides, input_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing index_copy_inplace on {InfiniDeviceNames[device]} with shape:{input_shape}," + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + assert output_shape == input_shape + output = input + else: + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + index_list = list(range(output_shape[dim])) + + random.shuffle(index_list) + torch_index = torch.tensor(index_list[:input_shape[dim]], dtype=torch.int64) + index = TestTensor( + [input_shape[dim]], + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_index_copy_inplace(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the 
descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetIndexCopyInplaceWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_index_copy_inplace(): + check_error( + LIBINFINIOP.infiniopIndexCopyInplace( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_index_copy_inplace() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_index_copy_inplace( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_index_copy_inplace(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my index_copy_inplace passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..a75300a76 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,209 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def index_copy_inplace_(lib): + lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopCreateIndexCopyInplaceDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetIndexCopyInplaceWorkspaceSize.restype = c_int32 + lib.infiniopGetIndexCopyInplaceWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIndexCopyInplace.restype = c_int32 + lib.infiniopIndexCopyInplace.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopDestroyIndexCopyInplaceDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def gather_(lib): + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetGatherWorkspaceSize.restype = c_int32 + lib.infiniopGetGatherWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = 
[ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def scatter_(lib): + lib.infiniopCreateScatterDescriptor.restype = c_int32 + lib.infiniopCreateScatterDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetScatterWorkspaceSize.restype = c_int32 + lib.infiniopGetScatterWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopScatter.restype = c_int32 + lib.infiniopScatter.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyScatterDescriptor.restype = c_int32 + lib.infiniopDestroyScatterDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def tril_(lib): + lib.infiniopCreateTrilDescriptor.restype = c_int32 + lib.infiniopCreateTrilDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGetTrilWorkspaceSize.restype = c_int32 + lib.infiniopGetTrilWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTril.restype = c_int32 + lib.infiniopTril.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTrilDescriptor.restype = c_int32 + lib.infiniopDestroyTrilDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def triu_(lib): + lib.infiniopCreateTriuDescriptor.restype = c_int32 + lib.infiniopCreateTriuDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGetTriuWorkspaceSize.restype = c_int32 + lib.infiniopGetTriuWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTriu.restype = c_int32 + lib.infiniopTriu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTriuDescriptor.restype = c_int32 + lib.infiniopDestroyTriuDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def linear_(lib): + lib.infiniopCreateLinearDescriptor.restype = c_int32 + lib.infiniopCreateLinearDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLinearWorkspaceSize.restype = c_int32 + lib.infiniopGetLinearWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLinear.restype = c_int32 + lib.infiniopLinear.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLinearDescriptor.restype = c_int32 + lib.infiniopDestroyLinearDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def linear_backward_(lib): + lib.infiniopCreateLinearBackwardDescriptor.restype = c_int32 + lib.infiniopCreateLinearBackwardDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLinearBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetLinearBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLinearBackward.restype = c_int32 + lib.infiniopLinearBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLinearBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyLinearBackwardDescriptor.argtypes = [infiniopOperatorDescriptor_t] + diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index de397a69e..093e93ec2 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -122,6 +122,8 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): + if dt == InfiniDtype.BOOL: # support torch.bool input dtype + return torch.bool if dt == InfiniDtype.I8: return torch.int8 elif dt == InfiniDtype.I16: @@ -185,6 +187,8 @@ def rearrange_tensor(tensor, new_strides): Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides. """ import torch + if list(tensor.stride()) == list(new_strides):# support self-define strides as the input of TestTensor + return tensor shape = tensor.shape diff --git a/test/infiniop/linear.py b/test/infiniop/linear.py new file mode 100644 index 000000000..120853572 --- /dev/null +++ b/test/infiniop/linear.py @@ -0,0 +1,169 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), +] + + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-1, "rtol": 1e-1}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_linear(x, w, b, bias): + return torch.nn.functional.linear(x, w, bias=(b if bias else None)) + + +def test( + handle, + device, + in_features, out_features, bias_exist, x_strides, y_strides, w_strides, + dtype, + sync=None, +): + y = TestTensor( + [out_features], + y_strides, + dtype, + device, + ) + + x = TestTensor( + [in_features], + x_strides, + dtype, + device, + ) + + w = TestTensor( + [out_features, in_features], + w_strides, + dtype, + device, + ) + + b = TestTensor( + [out_features], + None, + dtype, + device, + ) if bias_exist else None + + print( + f"Testing linear on {InfiniDeviceNames[device]} with in_features:{in_features}, out_features:{out_features}, bias:{bias_exist}," + f"dtype:{InfiniDtypeNames[dtype]}" + 
) + + y._torch_tensor = torch_linear(x.torch_tensor(), w.torch_tensor(), (b.torch_tensor() if bias_exist else None), bias_exist) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLinearDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + w.descriptor, + (b.descriptor if bias_exist else None), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, x, w] + ([b] if bias_exist else []): + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLinearWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_linear(): + check_error( + LIBINFINIOP.infiniopLinear( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + w.data(), + b.data() if bias_exist else None, + None, + ) + ) + + lib_linear() + + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_linear( + x.torch_tensor(), w.torch_tensor(), (b.torch_tensor() if bias_exist else None), bias_exist + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_linear(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLinearDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my linear passed!\033[0m") diff --git a/test/infiniop/linear_backward.py b/test/infiniop/linear_backward.py new file mode 100644 index 000000000..b9ece88ff --- /dev/null +++ b/test/infiniop/linear_backward.py @@ -0,0 +1,233 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_linear_backward( + grad_x: torch.Tensor, + grad_w: torch.Tensor, + grad_b: torch.Tensor, + grad_y: torch.Tensor, + x: torch.Tensor, + w: torch.Tensor, + b: torch.Tensor, + bias_exist:bool + ): + x.requires_grad_(True) + w.requires_grad_(True) + if bias_exist: + b.requires_grad_(True) + y = 
torch.nn.functional.linear(x, w, bias=(b if bias_exist else None)) + y.backward(grad_y) + grad_x.copy_(x.grad) + grad_w.copy_(w.grad) + if bias_exist: + grad_b.copy_(b.grad) + +def test( + handle, + device, + in_features, out_features, bias_exist, grad_x_strides, grad_y_strides, grad_w_strides, + dtype, + sync=None, +): + print( + f"Testing linear_backward on {InfiniDeviceNames[device]} with in_features:{in_features}, out_features:{out_features}, " + f"bias:{bias_exist}, " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + grad_x = TestTensor( + [in_features], + grad_x_strides, + dtype, + device, + ) + + grad_w = TestTensor( + [out_features, in_features], + grad_w_strides, + dtype, + device, + ) + + grad_b = TestTensor( + [out_features], + None, + dtype, + device, + ) if bias_exist else None + + grad_y = TestTensor( + [out_features], + grad_y_strides, + dtype, + device, + ) + + x = TestTensor( + [in_features], + None, + dtype, + device, + ) + + w = TestTensor( + [out_features, in_features], + None, + dtype, + device, + ) + + b = TestTensor( + [out_features], + None, + dtype, + device, + ) if bias_exist else None + + torch_linear_backward( + grad_x.torch_tensor(), grad_w.torch_tensor(), + grad_b.torch_tensor() if bias_exist else None, + grad_y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), + b.torch_tensor() if bias_exist else None, + bias_exist + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLinearBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_x.descriptor, + grad_w.descriptor, + (grad_b.descriptor if bias_exist else None), + grad_y.descriptor, + x.descriptor, + w.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [grad_x, grad_w, grad_y, x, w] + ([grad_b, b] if bias_exist else []): + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLinearBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_x.device) + + def lib_linear_backward(): + check_error( + LIBINFINIOP.infiniopLinearBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_x.data(), + grad_w.data(), + grad_b.data() if bias_exist else None, + grad_y.data(), + x.data(), + w.data(), + None, + ) + ) + + lib_linear_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_x.actual_tensor(), grad_x.torch_tensor(), atol=atol, rtol=rtol) + debug(grad_w.actual_tensor(), grad_w.torch_tensor(), atol=atol, rtol=rtol) + if bias_exist: + debug(grad_b.actual_tensor(), grad_b.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(grad_x.actual_tensor(), grad_x.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(grad_w.actual_tensor(), grad_w.torch_tensor(), atol=atol, rtol=rtol) + if bias_exist: + assert torch.allclose(grad_b.actual_tensor(), grad_b.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_linear_backward( + grad_x.torch_tensor(), + grad_w.torch_tensor(), + grad_b.torch_tensor() if bias_exist else None, + grad_y.torch_tensor(), + x.torch_tensor(), + w.torch_tensor(), + b.torch_tensor() if bias_exist else None, + bias_exist + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_linear_backward(), device, NUM_PRERUN,
NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLinearBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my linear_backward passed!\033[0m") diff --git a/test/infiniop/scatter.py b/test/infiniop/scatter.py new file mode 100644 index 000000000..86ccdcdeb --- /dev/null +++ b/test/infiniop/scatter.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + +_TEST_CASES = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (7, 1), (1, 7), None), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (1, 2, 6), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, None, None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, [16, 8, 4, 1], None), +] + + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_scatter(output: torch.Tensor, input, index, dim): + output.scatter_(dim, index, src=input) + + +def test( + handle, + device, + input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing scatter on {InfiniDeviceNames[device]} with input_shape:{input_shape}, index_shape:{index_shape}, output_shape:{output_shape}, dim:{dim}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + + def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + index = torch.empty(index_shape, dtype=torch.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input.shape[0]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 2: + for i in range(input.shape[0]): + for j in range(input.shape[1]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, j, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 1: + for i in range(input.shape[0]): + for j in range(input.shape[2]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :, j] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 4 and scatter_dim == 0: + for i in range(input.shape[1]): + for j in range(input.shape[2]): + for k in range(input.shape[3]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[:, i, j, k] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + return index + + torch_index = 
get_test_index_tensor(input_shape, index_shape, output_shape, dim).type(torch.int64) + if index_strides: + torch_index = torch_index.as_strided(index_shape, index_strides) + index = TestTensor( + index_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_scatter(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScatterDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScatterWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_scatter(): + check_error( + LIBINFINIOP.infiniopScatter( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_scatter() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_scatter( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scatter(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyScatterDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my scatter passed!\033[0m") diff --git a/test/infiniop/tril.py b/test/infiniop/tril.py new file mode 100644 index 000000000..d4c40eff7 --- /dev/null +++ b/test/infiniop/tril.py @@ -0,0 +1,188 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + ((5, 6), 0), + ((4, 5), -1), + ((5, 2), 3), + ((89, 80), -20), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I64, InfiniDtype.I32] 
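+# tril only copies or zeroes input elements, so integer dtypes can be tested as well and the library output is compared bit-exactly against the PyTorch reference (zero tolerances below).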
+ +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_tril(output, input, diagonal): + torch.tril(input, diagonal, out=output) + + + +def test( + handle, + device, + input_shape, diagonal, + inplace, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing tril on {InfiniDeviceNames[device]} with shape:{input_shape}, diagonal:{diagonal}, " + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_input = (torch.rand(size=input_shape) * 100 - 50).type(torch_dtype) + input = TestTensor( + input_shape, + torch_input.stride(), + dtype, + device, + "manual", + set_tensor=torch_input + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + input_shape, + None, + dtype, + device, + "zeros" + ) + + + torch_tril(output.torch_tensor(), input.torch_tensor(), diagonal) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTrilDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + diagonal, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTrilWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_tril(): + check_error( + LIBINFINIOP.infiniopTril( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_tril() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_tril( + output.torch_tensor(), input.torch_tensor(), diagonal + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tril(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTrilDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my tril passed!\033[0m") diff --git a/test/infiniop/triu.py b/test/infiniop/triu.py new file mode 100644 index 000000000..5c3ec63d1 --- /dev/null +++ b/test/infiniop/triu.py @@ -0,0 +1,186 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + 
infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + ((5, 6), 0), + ((4, 5), -1), + ((5, 2), 3), + ((89, 80), -20), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I64, InfiniDtype.I32] + +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_triu(output, input, diagonal): + torch.triu(input, diagonal, out=output) + + +def test( + handle, + device, + input_shape, diagonal, + inplace, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing triu on {InfiniDeviceNames[device]} with shape:{input_shape}, diagonal:{diagonal}, " + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_input = (torch.rand(size=input_shape) * 100 - 50).type(torch_dtype) + input = TestTensor( + input_shape, + torch_input.stride(), + dtype, + device, + "manual", + set_tensor=torch_input + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + input_shape, + None, + dtype, + device, + "zeros" + ) + + torch_triu(output.torch_tensor(), input.torch_tensor(), diagonal) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTriuDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + diagonal, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTriuWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_triu(): + check_error( + LIBINFINIOP.infiniopTriu( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_triu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_triu( + output.torch_tensor(), input.torch_tensor(), diagonal + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_triu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTriuDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = 
args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my triu passed!\033[0m")
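
Every test added in this patch drives its operator through the same descriptor lifecycle against the ctypes bindings registered in op_register.py: create the descriptor from tensor descriptors, query the workspace size, launch the kernel, and destroy the descriptor. The sketch below condenses that flow for the tril binding as an illustration only; it reuses the helpers imported by the tests above (LIBINFINIOP, TestTensor, TestWorkspace, check_error, infiniopOperatorDescriptor_t), and assumes `handle` and `device` are supplied by the same harness (get_test_devices / test_operator) that the tests use.

```python
import ctypes
from ctypes import c_uint64

from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    TestWorkspace,
    check_error,
    infiniopOperatorDescriptor_t,
    InfiniDtype,
)


def run_tril(handle, device, shape=(5, 6), diagonal=0, dtype=InfiniDtype.F32):
    """Minimal descriptor lifecycle for the new tril binding (illustrative sketch)."""
    # Input tensor plus an all-zero output, mirroring the tests above.
    inp = TestTensor(list(shape), None, dtype, device)
    out = TestTensor(list(shape), None, dtype, device, "zeros")

    # 1. Create the operator descriptor from the tensor descriptors.
    desc = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateTrilDescriptor(
            handle, ctypes.byref(desc), out.descriptor, inp.descriptor, diagonal
        )
    )

    # 2. Query and allocate whatever workspace the kernel reports it needs.
    size = c_uint64(0)
    check_error(LIBINFINIOP.infiniopGetTrilWorkspaceSize(desc, ctypes.byref(size)))
    workspace = TestWorkspace(size.value, inp.device)

    # 3. Launch: workspace, workspace size, output, input, stream (None = default stream).
    check_error(
        LIBINFINIOP.infiniopTril(
            desc, workspace.data(), workspace.size(), out.data(), inp.data(), None
        )
    )

    # 4. Release the descriptor once the call has completed.
    check_error(LIBINFINIOP.infiniopDestroyTrilDescriptor(desc))
    return out.actual_tensor()
```

The other operators added here (gather, scatter, index_copy_inplace, triu, linear, linear_backward) follow the same four steps; only the descriptor-creation arguments and the launch argument list differ, exactly as declared in their op_register.py entries.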