diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..868f29d48 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,12 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/index_copy_inplace.h" +#include "infiniop/ops/gather.h" +#include "infiniop/ops/scatter.h" +#include "infiniop/ops/tril.h" +#include "infiniop/ops/triu.h" +#include "infiniop/ops/linear.h" +#include "infiniop/ops/linear_backward.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/gather.h b/include/infiniop/ops/gather.h new file mode 100644 index 000000000..9ffe310c9 --- /dev/null +++ b/include/infiniop/ops/gather.h @@ -0,0 +1,31 @@ +#ifndef __INFINIOP_GATHER_API_H__ +#define __INFINIOP_GATHER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopGatherDescriptor_t; + +__C __export infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/index_copy_inplace.h b/include/infiniop/ops/index_copy_inplace.h new file mode 100644 index 000000000..e2266299a --- /dev/null +++ b/include/infiniop/ops/index_copy_inplace.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_INDEX_COPY_INPLACE_API_H__ +#define __INFINIOP_INDEX_COPY_INPLACE_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopIndexCopyInplaceDescriptor_t; + +__C __export infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopIndexCopyInplace(infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/linear.h b/include/infiniop/ops/linear.h new file mode 100644 index 000000000..4a81d4988 --- /dev/null +++ b/include/infiniop/ops/linear.h @@ -0,0 +1,32 @@ +#ifndef __INFINIOP_LINEAR_API_H__ +#define __INFINIOP_LINEAR_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLinearDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLinearDescriptor( + infiniopHandle_t handle, + infiniopLinearDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +); + +__C __export infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, 
size_t *size); + +__C __export infiniStatus_t infiniopLinear( + infiniopLinearDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc); + +#endif \ No newline at end of file diff --git a/include/infiniop/ops/linear_backward.h b/include/infiniop/ops/linear_backward.h new file mode 100644 index 000000000..de74668c7 --- /dev/null +++ b/include/infiniop/ops/linear_backward.h @@ -0,0 +1,35 @@ +#ifndef __INFINIOP_LINEAR_BACKWARD_API_H__ +#define __INFINIOP_LINEAR_BACKWARD_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopLinearBackwardDescriptor_t; + +__C __export infiniStatus_t infiniopCreateLinearBackwardDescriptor( + infiniopHandle_t handle, + infiniopLinearBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +); + +__C __export infiniStatus_t infiniopGetLinearBackwardWorkspaceSize(infiniopLinearBackwardDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopLinearBackward(infiniopLinearBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyLinearBackwardDescriptor(infiniopLinearBackwardDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/scatter.h b/include/infiniop/ops/scatter.h new file mode 100644 index 000000000..22e0eff83 --- /dev/null +++ b/include/infiniop/ops/scatter.h @@ -0,0 +1,30 @@ +#ifndef __INFINIOP_SCATTER_API_H__ +#define __INFINIOP_SCATTER_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopScatterDescriptor_t; + +__C __export infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +); + +__C __export infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopScatter(infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/tril.h b/include/infiniop/ops/tril.h new file mode 100644 index 000000000..8ce215308 --- /dev/null +++ b/include/infiniop/ops/tril.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_TRIL_API_H__ +#define __INFINIOP_TRIL_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTrilDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTrilDescriptor( + infiniopHandle_t handle, + infiniopTrilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +); + +__C __export infiniStatus_t infiniopGetTrilWorkspaceSize(infiniopTrilDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTril(infiniopTrilDescriptor_t desc, + void *workspace, + size_t 
workspace_size, + void * output, + const void * input, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyTrilDescriptor(infiniopTrilDescriptor_t desc); + +#endif diff --git a/include/infiniop/ops/triu.h b/include/infiniop/ops/triu.h new file mode 100644 index 000000000..bb7854767 --- /dev/null +++ b/include/infiniop/ops/triu.h @@ -0,0 +1,28 @@ +#ifndef __INFINIOP_TRIU_API_H__ +#define __INFINIOP_TRIU_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopTriuDescriptor_t; + +__C __export infiniStatus_t infiniopCreateTriuDescriptor( + infiniopHandle_t handle, + infiniopTriuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +); + +__C __export infiniStatus_t infiniopGetTriuWorkspaceSize(infiniopTriuDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopTriu(infiniopTriuDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +); + +__C __export infiniStatus_t infiniopDestroyTriuDescriptor(infiniopTriuDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..548a68025 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -12,18 +12,25 @@ def run_tests(args): failed = [] for test in [ - "add.py", - "attention.py", - "causal_softmax.py", - "clip.py", - "gemm.py", - "mul.py", - "random_sample.py", - "rearrange.py", - "rms_norm.py", - "rope.py", - "sub.py", - "swiglu.py", + # "add.py", + # "attention.py", + # "causal_softmax.py", + # "clip.py", + # "gemm.py", + # "mul.py", + # "random_sample.py", + # "rearrange.py", + # "rms_norm.py", + # "rope.py", + # "sub.py", + # "swiglu.py", + "index_copy_inplace.py", + "gather.py", + "scatter.py", + "tril.py", + "triu.py", + "linear.py", + "linear_backward.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..f59552709 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,13 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(index_copy_inplace) +DECLARE_INFINIOP_TEST(gather) +DECLARE_INFINIOP_TEST(scatter) +DECLARE_INFINIOP_TEST(tril) +DECLARE_INFINIOP_TEST(triu) +DECLARE_INFINIOP_TEST(linear) +DECLARE_INFINIOP_TEST(linear_backward) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +50,13 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(index_copy_inplace) \ + REGISTER_INFINIOP_TEST(gather) \ + REGISTER_INFINIOP_TEST(scatter) \ + REGISTER_INFINIOP_TEST(tril) \ + REGISTER_INFINIOP_TEST(triu) \ + REGISTER_INFINIOP_TEST(linear) \ + REGISTER_INFINIOP_TEST(linear_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/gather.cpp b/src/infiniop-test/src/ops/gather.cpp new file mode 100644 index 000000000..e1b998fe9 --- /dev/null +++ b/src/infiniop-test/src/ops/gather.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::gather { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + 
std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopGatherDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateGatherDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetGatherWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopGather(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopGather( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::gather diff --git a/src/infiniop-test/src/ops/index_copy_inplace.cpp b/src/infiniop-test/src/ops/index_copy_inplace.cpp new file mode 100644 index 000000000..9f5fb5be9 --- /dev/null +++ b/src/infiniop-test/src/ops/index_copy_inplace.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::index_copy_inplace { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> 
attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopIndexCopyInplaceDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateIndexCopyInplaceDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetIndexCopyInplaceWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopIndexCopyInplace(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopIndexCopyInplace( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::index_copy_inplace diff --git a/src/infiniop-test/src/ops/linear.cpp b/src/infiniop-test/src/ops/linear.cpp new file mode 100644 index 000000000..94e146b9e --- /dev/null +++ b/src/infiniop-test/src/ops/linear.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::linear { +struct Test::Attributes { + bool bias_exist; + std::shared_ptr y; + std::shared_ptr x; + std::shared_ptr w; + std::shared_ptr b; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + 
std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("y") == tensors.end() + || tensors.find("x") == tensors.end() + || tensors.find("w") == tensors.end() + || tensors.find("b") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->y = tensors["y"]; + test->_attributes->x = tensors["x"]; + test->_attributes->w = tensors["w"]; + test->_attributes->b = tensors["b"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->bias_exist = *reinterpret_cast(attributes["bias_exist"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLinearDescriptor_t op_desc; + auto y = _attributes->y->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + auto w = _attributes->w->to(device, device_id); + auto b = _attributes->b->to(device, device_id); + CHECK_OR(infiniopCreateLinearDescriptor(handle, &op_desc, + y->desc(), + x->desc(), + w->desc(), + (_attributes->bias_exist) ? (b->desc() ) : nullptr + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLinearWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLinear(op_desc, workspace, workspace_size, + y->data(), + x->data(), + w->data(), + (_attributes->bias_exist) ? b->data() : nullptr, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(y, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLinear( + op_desc, workspace, workspace_size, + y->data(), + x->data(), + w->data(), + (_attributes->bias_exist) ? b->data() : nullptr, + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"bias_exist"}; +} + +std::vector Test::tensor_names() { + return {"y", "x", "w", "b", "ans"}; +} + +std::vector Test::output_names() { + return {"y"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- y: " << _attributes->y->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- w: " << _attributes->w->info() << std::endl; + oss << "- b: " << (_attributes->bias_exist ? 
_attributes->b->info() : "null") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::linear diff --git a/src/infiniop-test/src/ops/linear_backward.cpp b/src/infiniop-test/src/ops/linear_backward.cpp new file mode 100644 index 000000000..5352b62a9 --- /dev/null +++ b/src/infiniop-test/src/ops/linear_backward.cpp @@ -0,0 +1,144 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::linear_backward { +struct Test::Attributes { + bool bias_exist; + std::shared_ptr grad_x; + std::shared_ptr grad_w; + std::shared_ptr grad_b; + std::shared_ptr grad_y; + std::shared_ptr x; + std::shared_ptr w; + std::shared_ptr ans_grad_x; + std::shared_ptr ans_grad_w; + std::shared_ptr ans_grad_b; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("grad_x") == tensors.end() + || tensors.find("grad_w") == tensors.end() + || tensors.find("grad_b") == tensors.end() + || tensors.find("grad_y") == tensors.end() + || tensors.find("x") == tensors.end() + || tensors.find("w") == tensors.end() + || tensors.find("ans_grad_x") == tensors.end() + || tensors.find("ans_grad_w") == tensors.end() + || tensors.find("ans_grad_b") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->grad_x = tensors["grad_x"]; + test->_attributes->grad_w = tensors["grad_w"]; + test->_attributes->grad_b = tensors["grad_b"]; + test->_attributes->grad_y = tensors["grad_y"]; + test->_attributes->x = tensors["x"]; + test->_attributes->w = tensors["w"]; + test->_attributes->ans_grad_x = tensors["ans_grad_x"]; + test->_attributes->ans_grad_w = tensors["ans_grad_w"]; + test->_attributes->ans_grad_b = tensors["ans_grad_b"]; + test->_attributes->bias_exist = *reinterpret_cast(attributes["bias_exist"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopLinearBackwardDescriptor_t op_desc; + auto grad_x = _attributes->grad_x->to(device, device_id); + auto grad_w = _attributes->grad_w->to(device, device_id); + auto grad_b = _attributes->grad_b->to(device, device_id); + auto grad_y = _attributes->grad_y->to(device, device_id); + auto x = _attributes->x->to(device, device_id); + auto w = _attributes->w->to(device, device_id); + CHECK_OR(infiniopCreateLinearBackwardDescriptor(handle, &op_desc, + grad_x->desc(), + grad_w->desc(), + (_attributes->bias_exist) ? grad_b->desc() : nullptr, + grad_y->desc(), + x->desc(), + w->desc() + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetLinearBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopLinearBackward(op_desc, workspace, workspace_size, + grad_x->data(), + grad_w->data(), + (_attributes->bias_exist) ? 
grad_b->data() : nullptr, + grad_y->data(), + x->data(), + w->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(grad_x, _attributes->ans_grad_x, _rtol, _atol); + allClose(grad_w, _attributes->ans_grad_w, _rtol, _atol); + if (_attributes->bias_exist) + allClose(grad_b, _attributes->ans_grad_b, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopLinearBackward( + op_desc, workspace, workspace_size, + grad_x->data(), + grad_w->data(), + (_attributes->bias_exist) ? grad_b->data() : nullptr, + grad_y->data(), + x->data(), + w->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"bias_exist"}; +} + +std::vector Test::tensor_names() { + return {"grad_x", "grad_w", "grad_b", "grad_y", "x", "w", "ans_grad_x", "ans_grad_w", "ans_grad_b"}; +} + +std::vector Test::output_names() { + return {"grad_x", "grad_w", "grad_b"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- grad_x: " << _attributes->grad_x->info() << std::endl; + oss << "- grad_w: " << _attributes->grad_w->info() << std::endl; + oss << "- grad_b: " << (_attributes->bias_exist ? _attributes->grad_b->info() : "null") << std::endl; + oss << "- grad_y: " << _attributes->grad_y->info() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- w: " << _attributes->w->info() << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::linear_backward diff --git a/src/infiniop-test/src/ops/scatter.cpp b/src/infiniop-test/src/ops/scatter.cpp new file mode 100644 index 000000000..691e338da --- /dev/null +++ b/src/infiniop-test/src/ops/scatter.cpp @@ -0,0 +1,114 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::scatter { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + std::shared_ptr index; + size_t dim; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("index") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->index = tensors["index"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->dim = *reinterpret_cast(attributes["dim"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopScatterDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto index = _attributes->index->to(device, device_id); + auto dim = _attributes->dim; + CHECK_OR(infiniopCreateScatterDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + index->desc(), + dim + 
), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetScatterWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopScatter(op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopScatter( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + index->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "index", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- index: " << _attributes->index->info() << std::endl; + oss << "- dim: " << _attributes->dim << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::scatter diff --git a/src/infiniop-test/src/ops/tril.cpp b/src/infiniop-test/src/ops/tril.cpp new file mode 100644 index 000000000..4b025e6f7 --- /dev/null +++ b/src/infiniop-test/src/ops/tril.cpp @@ -0,0 +1,106 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::tril { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + int diagonal; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->diagonal = *reinterpret_cast(attributes["diagonal"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTrilDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto diagonal = _attributes->diagonal; + CHECK_OR(infiniopCreateTrilDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + diagonal + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTrilWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace 
size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTril(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTril( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"diagonal"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- diagonal: " << _attributes->diagonal << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::tril diff --git a/src/infiniop-test/src/ops/triu.cpp b/src/infiniop-test/src/ops/triu.cpp new file mode 100644 index 000000000..f49240283 --- /dev/null +++ b/src/infiniop-test/src/ops/triu.cpp @@ -0,0 +1,106 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::triu { +struct Test::Attributes { + std::shared_ptr output; + std::shared_ptr input; + int diagonal; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("output") == tensors.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + test->_attributes->output = tensors["output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->ans = tensors["ans"]; + test->_attributes->diagonal = *reinterpret_cast(attributes["diagonal"].data()); + + return test; +} +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopTriuDescriptor_t op_desc; + auto output = _attributes->output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto diagonal = _attributes->diagonal; + CHECK_OR(infiniopCreateTriuDescriptor(handle, &op_desc, + output->desc(), + input->desc(), + diagonal + ), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + size_t workspace_size; + CHECK_OR(infiniopGetTriuWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + CHECK_OR(infiniopTriu(op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { 
+ allClose(output, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopTriu( + op_desc, workspace, workspace_size, + output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"diagonal"}; +} + +std::vector Test::tensor_names() { + return {"output", "input", "ans"}; +} + +std::vector Test::output_names() { + return {"output"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- output: " << _attributes->output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- diagonal: " << _attributes->diagonal << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::triu diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.cc b/src/infiniop/ops/gather/cpu/gather_cpu.cc new file mode 100644 index 000000000..cc1d5b740 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.cc @@ -0,0 +1,106 @@ +#include "gather_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::gather::cpu { + +infiniStatus_t calculate_gather( + const GatherInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < info.ndim; d ++) + { + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t total_size = last_dim * last_stride; + + int gather_dim = info.dim; + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for(size_t i = 0; i < total_size; i++) + { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = i; + for (int d = info.ndim - 1; d >= 0; d--) { + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + if(d != gather_dim) + input_ptr += dim_index * element_size * info.input_strides[d]; + } + int64_t gather_number = *index_ptr; + input_ptr += gather_number * element_size * info.input_strides[gather_dim]; + // *output_ptr = *input_ptr; + memcpy( + output_ptr, + input_ptr, + element_size + ); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size 
----------------------- + + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_gather(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} + diff --git a/src/infiniop/ops/gather/cpu/gather_cpu.h b/src/infiniop/ops/gather/cpu/gather_cpu.h new file mode 100644 index 000000000..bc74fd669 --- /dev/null +++ b/src/infiniop/ops/gather/cpu/gather_cpu.h @@ -0,0 +1,9 @@ +#ifndef __GATHER_CPU_H__ +#define __GATHER_CPU_H__ + +#include "../gather.h" + +DESCRIPTOR(cpu) + + +#endif // __GATHER_CPU_H__ diff --git a/src/infiniop/ops/gather/cuda/kernel.cuh b/src/infiniop/ops/gather/cuda/kernel.cuh new file mode 100644 index 000000000..e49ca0fd2 --- /dev/null +++ b/src/infiniop/ops/gather/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __GATHER_KERNEL_CUH__ +#define __GATHER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void gatherKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for (int d = ndim - 1; d >= 0; d--) { + if (d == gather_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_gather_size; c ++) { + int64_t gather_number = *(index_ptr + c * index_strides[gather_dim]); + *(output_ptr + c * output_strides[gather_dim]) = \ + *(input_ptr + gather_number * input_strides[gather_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __GATHER_KERNEL_CUH__ diff --git a/src/infiniop/ops/gather/gather.h b/src/infiniop/ops/gather/gather.h new file mode 100644 index 000000000..70991f5db --- /dev/null +++ b/src/infiniop/ops/gather/gather.h @@ -0,0 +1,49 @@ +#ifndef __GATHER_H__ +#define __GATHER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::gather::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + GatherInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + GatherInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + 
infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/gather/info.h b/src/infiniop/ops/gather/info.h new file mode 100644 index 000000000..aa7a54a16 --- /dev/null +++ b/src/infiniop/ops/gather/info.h @@ -0,0 +1,58 @@ +#ifndef __GATHER_INFO_H__ +#define __GATHER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::gather { + +class GatherInfo { +private: + GatherInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector output_shape; + size_t input_dim_size; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createGatherInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE(output_desc->shape(), index_desc->shape()); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++) { + if (d != dim) + CHECK_OR_RETURN(input_desc->dim(d) == output_desc->dim(d), INFINI_STATUS_BAD_TENSOR_SHAPE); + } + CHECK_OR_RETURN(ndim > dim, INFINI_STATUS_BAD_PARAM); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(GatherInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->dim(dim), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __GATHER_INFO_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.h b/src/infiniop/ops/gather/metax/gather_metax.h new file mode 100644 index 000000000..bf0a25a36 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.h @@ -0,0 +1,8 @@ +#ifndef __GATHER_METAX_H__ +#define __GATHER_METAX_H__ + +#include "../gather.h" + +DESCRIPTOR(metax) + +#endif // __GATHER_METAX_H__ diff --git a/src/infiniop/ops/gather/metax/gather_metax.maca b/src/infiniop/ops/gather/metax/gather_metax.maca new file mode 100644 index 000000000..7254078c3 --- /dev/null +++ b/src/infiniop/ops/gather/metax/gather_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "gather_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + 
index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_GATHER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + 
switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_GATHER_WITH_METAX_BLOCK + #undef CALCULATE_GATHER +} +} // namespace op::gather::metax diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cu b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu new file mode 100644 index 000000000..94741a7d5 --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cu @@ -0,0 +1,189 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "gather_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::gather::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_gather_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int gather_dim +) { + gatherKernel( + output, + input, + index, + ndim, + index_gather_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + gather_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_gather( + const GatherInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t gather_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == gather_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.output_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = 
input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<1, Tdata><<>>( + output, + input, + index, + ndim, + info.output_shape[gather_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + info.dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = GatherInfo::createGatherInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const GatherInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_GATHER(BLOCK_SIZE, TDATA) \ + calculate_gather(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_GATHER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_GATHER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_GATHER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_GATHER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_GATHER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_GATHER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_GATHER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_GATHER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_GATHER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_GATHER(BLOCK_SIZE, cuda_bfloat16); \ + default: 
\ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_GATHER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_GATHER_WITH_BLOCK_SIZE + #undef CALCULATE_GATHER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::gather::nvidia diff --git a/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh new file mode 100644 index 000000000..46d42fa0c --- /dev/null +++ b/src/infiniop/ops/gather/nvidia/gather_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __GATHER_NVIDIA_API_H__ +#define __GATHER_NVIDIA_API_H__ +#include "../gather.h" + +DESCRIPTOR(nvidia) + +#endif // __GATHER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/gather/operator.cc b/src/infiniop/ops/gather/operator.cc new file mode 100644 index 000000000..c748f811c --- /dev/null +++ b/src/infiniop/ops/gather/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/gather.h" + +#ifdef ENABLE_CPU_API +#include "cpu/gather_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/gather_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/gather_metax.h" +#endif + +__C infiniStatus_t infiniopCreateGatherDescriptor( + infiniopHandle_t handle, + infiniopGatherDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::gather::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetGatherWorkspaceSize(infiniopGatherDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGather( + infiniopGatherDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + 
+#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyGatherDescriptor(infiniopGatherDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc new file mode 100644 index 000000000..f45ddef7a --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.cc @@ -0,0 +1,108 @@ +#include "index_copy_inplace_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../../rearrange/cpu/rearrange_cpu.h" +#include "../info.h" + +namespace op::index_copy_inplace::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); +// ---------------------- end: check data type and calculate workspace size ----------------------- + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::cpu::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + size_t size_of_dtype = infiniSizeOf(_info.dtype); + auto 
index_ptr = reinterpret_cast(index); + + + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + size_of_dtype * _info.total_input_size; + + + reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream); + memset(workspace_out, 0, _info.total_output_size * size_of_dtype); + size_t copy_unit_size = _info.meta_strides[_info.dim] * size_of_dtype; + #pragma omp parallel for + for (size_t dst_index = 0; dst_index < _info.output_shape[_info.dim]; dst_index++) { + size_t src_index = _info.index_shape[0] - 1; + while (true) + { + if (*(index_ptr + src_index * _info.index_strides[0]) == int64_t(dst_index)) { + std::memcpy( + workspace_out + size_of_dtype * dst_index * _info.meta_strides[_info.dim], + workspace_in + size_of_dtype * src_index * _info.meta_strides[_info.dim], + copy_unit_size + ); + break; + } + else if (src_index == 0) + break; + src_index --; + } + } + reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream); + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h new file mode 100644 index 000000000..384197013 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/cpu/index_copy_inplace_cpu.h @@ -0,0 +1,9 @@ +#ifndef __INDEX_COPY_INPLACE_CPU_H__ +#define __INDEX_COPY_INPLACE_CPU_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(cpu) + + +#endif // __INDEX_COPY_INPLACE_CPU_H__ diff --git a/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h new file mode 100644 index 000000000..f99fd3116 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/index_copy_inplace.h @@ -0,0 +1,55 @@ +#ifndef __INDEX_COPY_INPLACE_H__ +#define __INDEX_COPY_INPLACE_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define INDEX_COPY_INPLACE_DESCRIPTOR(NAMESPACE) \ + namespace op::index_copy_inplace::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + IndexCopyInplaceInfo _info; \ + size_t _workspace_size; \ + void *_rearrange_desc_in; \ + void *_rearrange_desc_out; \ + Descriptor( \ + infiniDtype_t dtype, \ + IndexCopyInplaceInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id, \ + void *rearrange_desc_in, \ + void *rearrange_desc_out \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _rearrange_desc_in(rearrange_desc_in), \ + _rearrange_desc_out(rearrange_desc_out) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/index_copy_inplace/info.h b/src/infiniop/ops/index_copy_inplace/info.h new file mode 100644 index 000000000..99d6a3a1e --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/info.h @@ -0,0 +1,76 @@ +#ifndef 
__INDEX_COPY_INPLACE_INFO_H__ +#define __INDEX_COPY_INPLACE_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::index_copy_inplace { + +class IndexCopyInplaceInfo { +private: + IndexCopyInplaceInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t total_input_size; + size_t total_output_size; + std::vector output_shape; + std::vector input_shape; + std::vector index_shape; + std::vector output_strides; + std::vector input_strides; + std::vector index_strides; + std::vector meta_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createIndexCopyInplaceInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN(output_desc->ndim() == input_desc->ndim(), INFINI_STATUS_BAD_TENSOR_STRIDES); + std::vector meta_strides(input_desc->ndim()); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + size_t total_input_size = 1; + size_t total_output_size = 1; + for (size_t d = 0; d < input_desc->ndim(); d++){ + total_input_size *= input_desc->dim(d); + total_output_size *= output_desc->dim(d); + if (d == dim) { + continue; + } + else { + meta_strides[d] = last_dim * last_stride; + last_dim = input_desc->dim(d); + last_stride = meta_strides[d]; + } + } + meta_strides[dim] = last_dim * last_stride; +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(IndexCopyInplaceInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + output_desc->dtype(), + total_input_size, + total_output_size, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + meta_strides, + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __INDEX_COPY_INPLACE_INFO_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h new file mode 100644 index 000000000..65673c88c --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.h @@ -0,0 +1,8 @@ +#ifndef __INDEX_COPY_INPLACE_METAX_H__ +#define __INDEX_COPY_INPLACE_METAX_H__ + +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(metax) + +#endif // __INDEX_COPY_INPLACE_METAX_H__ diff --git a/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca new file mode 100644 index 000000000..e540b6201 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/metax/index_copy_inplace_metax.maca @@ -0,0 +1,147 @@ +#include "../../../devices/metax/metax_common.h" +#include "index_copy_inplace_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../../rearrange/metax/rearrange_metax.h" +#include "../info.h" + +namespace op::index_copy_inplace::metax { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + 
const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + hcStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_METAX(hcMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + hcMemcpyDeviceToHost, + stream + )); + hcStreamSynchronize(stream); + CHECK_METAX(hcMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + hcMemcpyDeviceToDevice, + stream + )); + hcStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::metax::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + hcMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + + hcDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * 
_info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + hcDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; + +} +} // namespace op::index_copy_inplace::metax diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu new file mode 100644 index 000000000..ebf5907e0 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cu @@ -0,0 +1,140 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "index_copy_inplace_nvidia.cuh" +#include "../../rearrange/nvidia/rearrange_nvidia.cuh" +#include "../info.h" + +namespace op::index_copy_inplace::nvidia { + +infiniStatus_t calculate_index_copy_inplace( + char * output, + const char * input, + const int64_t * index, + size_t copy_unit_size, + size_t output_len, + size_t index_len, + ptrdiff_t index_stride, + cudaStream_t stream +) { + int64_t* dst_index = new int64_t; + size_t sizeof_int64_t = sizeof(int64_t); + for (size_t src_index = 0; src_index < index_len; src_index ++) { + CHECK_CUDA(cudaMemcpyAsync( + dst_index, + index + src_index * index_stride, + sizeof_int64_t, + cudaMemcpyDeviceToHost, + stream + )); + cudaStreamSynchronize(stream); + CHECK_CUDA(cudaMemcpyAsync( + output + (size_t)(*dst_index) * copy_unit_size, + input + src_index * copy_unit_size, + copy_unit_size, + cudaMemcpyDeviceToDevice, + stream + )); + cudaStreamSynchronize(stream); + } + delete dst_index; + return INFINI_STATUS_SUCCESS; +} + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete reinterpret_cast(_rearrange_desc_in); + delete reinterpret_cast(_rearrange_desc_out); + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = IndexCopyInplaceInfo::createIndexCopyInplaceInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const IndexCopyInplaceInfo &info = result.take(); + size_t WorkSpaceSize = (info.total_input_size + info.total_output_size) * infiniSizeOf(dtype); + + InfiniopTensorDescriptor * rearrange_in_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), input_desc->shape().data(), info.meta_strides.data() + ); + InfiniopTensorDescriptor * rearrange_out_desc = new InfiniopTensorDescriptor( + dtype, input_desc->ndim(), output_desc->shape().data(), info.meta_strides.data() + ); + + void * in_rearrange_descriptor = nullptr; + void * out_rearrange_descriptor = nullptr; + + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&in_rearrange_descriptor), + rearrange_in_desc, input_desc + ); + op::rearrange::nvidia::Descriptor::create( + handle_, reinterpret_cast(&out_rearrange_descriptor), + output_desc, rearrange_out_desc + ); + + *desc_ptr = new 
Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id, + in_rearrange_descriptor, + out_rearrange_descriptor + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + size_t elem_size = infiniSizeOf(_info.dtype); + char* workspace_in = reinterpret_cast(workspace); + char* workspace_out = workspace_in + elem_size * _info.total_input_size; + CHECK_STATUS(reinterpret_cast(_rearrange_desc_in)->calculate(workspace_in, input, stream)); + cudaMemsetAsync(workspace_out, 0, _info.total_output_size * elem_size, stream); + cudaDeviceSynchronize(); + CHECK_STATUS(calculate_index_copy_inplace( + reinterpret_cast(workspace_out), + reinterpret_cast(workspace_in), + reinterpret_cast(index), + elem_size * _info.meta_strides[_info.dim], + _info.output_shape[_info.dim], + _info.index_shape[0], + _info.index_strides[0], + stream + )); + cudaDeviceSynchronize(); + + CHECK_STATUS(reinterpret_cast(_rearrange_desc_out)->calculate(output, workspace_out, stream)); + return INFINI_STATUS_SUCCESS; +} +} // namespace op::index_copy_inplace::nvidia diff --git a/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh new file mode 100644 index 000000000..04c3c86f7 --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/nvidia/index_copy_inplace_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#define __INDEX_COPY_INPLACE_NVIDIA_API_H__ +#include "../index_copy_inplace.h" + +INDEX_COPY_INPLACE_DESCRIPTOR(nvidia) + +#endif // __INDEX_COPY_INPLACE_NVIDIA_API_H__ diff --git a/src/infiniop/ops/index_copy_inplace/operator.cc b/src/infiniop/ops/index_copy_inplace/operator.cc new file mode 100644 index 000000000..e8886b65f --- /dev/null +++ b/src/infiniop/ops/index_copy_inplace/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/index_copy_inplace.h" + +#ifdef ENABLE_CPU_API +#include "cpu/index_copy_inplace_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/index_copy_inplace_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/index_copy_inplace_metax.h" +#endif + +__C infiniStatus_t infiniopCreateIndexCopyInplaceDescriptor( + infiniopHandle_t handle, + infiniopIndexCopyInplaceDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::index_copy_inplace::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t 
infiniopGetIndexCopyInplaceWorkspaceSize(infiniopIndexCopyInplaceDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopIndexCopyInplace( + infiniopIndexCopyInplaceDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyIndexCopyInplaceDescriptor(infiniopIndexCopyInplaceDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/linear/cpu/linear_cpu.cc b/src/infiniop/ops/linear/cpu/linear_cpu.cc new file mode 100644 index 000000000..78dd86997 --- /dev/null +++ b/src/infiniop/ops/linear/cpu/linear_cpu.cc @@ -0,0 +1,95 @@ +#include "linear_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::linear::cpu { + +template +infiniStatus_t calculate_linear( + const LinearInfo &info, + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + #pragma omp parallel for + for(size_t j = 0; j < info.out_features; j ++) + { + auto w_ptr = w + j * info.w_stride_out; + float y_sum = info.bias ? 
(utils::cast(*(b + j * info.b_stride))) : 0.; + for(size_t i = 0; i < info.in_features; i ++) + { + y_sum += utils::cast(*(x + i * info.x_stride)) * utils::cast(*(w_ptr + i * info.w_stride_in)); + } + *(y + j * info.y_stride) = utils::cast(y_sum); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = LinearInfo::createLinearInfo( + y_desc, + x_desc, + w_desc, + b_desc + ); + CHECK_RESULT(result); + const LinearInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream +) const { + #define CALCULATE_LINEAR(TDATA) \ + CHECK_STATUS(calculate_linear(_info, \ + (TDATA *)y, (const TDATA *)x, (const TDATA *)w, (const TDATA *)b)) + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_LINEAR(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_LINEAR(bf16_t); + } else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_LINEAR(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + #undef CALCULATE_LINEAR + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/linear/cpu/linear_cpu.h b/src/infiniop/ops/linear/cpu/linear_cpu.h new file mode 100644 index 000000000..5caf2a8a8 --- /dev/null +++ b/src/infiniop/ops/linear/cpu/linear_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LINEAR_CPU_H__ +#define __LINEAR_CPU_H__ + +#include "../linear.h" + +DESCRIPTOR(cpu) + + +#endif // __LINEAR_CPU_H__ diff --git a/src/infiniop/ops/linear/cuda/kernel.cuh b/src/infiniop/ops/linear/cuda/kernel.cuh new file mode 100644 index 000000000..dbc2ec898 --- /dev/null +++ b/src/infiniop/ops/linear/cuda/kernel.cuh @@ -0,0 +1,32 @@ +#ifndef __LINEAR_KERNEL_CUH__ +#define __LINEAR_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void linearKernel( + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + size_t in_features, + size_t out_features, + ptrdiff_t y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + ptrdiff_t b_stride, + bool bias +) { + size_t y_index = blockIdx.x; + auto y_ptr = y + y_index * y_stride; + auto w_ptr = w + y_index * w_stride_out; + + Tcompute y_value = bias ? 
(Tcompute(*(b + y_index * b_stride))) : Tcompute(0); + for(size_t i = 0; i < in_features; i ++) + { + y_value += Tcompute(*(x + i * x_stride)) * Tcompute(*(w_ptr + i * w_stride_in)); + } + *y_ptr = y_value; +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __LINEAR_KERNEL_CUH__ diff --git a/src/infiniop/ops/linear/info.h b/src/infiniop/ops/linear/info.h new file mode 100644 index 000000000..eef5dc6ce --- /dev/null +++ b/src/infiniop/ops/linear/info.h @@ -0,0 +1,67 @@ +#ifndef __LINEAR_INFO_H__ +#define __LINEAR_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::linear { + +class LinearInfo { +private: + LinearInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t in_features; + size_t out_features; + ptrdiff_t y_stride; + ptrdiff_t x_stride; + ptrdiff_t w_stride_in; + ptrdiff_t w_stride_out; + ptrdiff_t b_stride; + bool bias; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createLinearInfo( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + size_t in_features = x_desc->dim(0); + size_t out_features = y_desc->dim(0); + CHECK_OR_RETURN(x_desc->ndim() == 1 && y_desc->ndim() == 1 && w_desc->ndim() == 2 && \ + w_desc->dim(0) == out_features && w_desc->dim(1) == in_features, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + bool bias = (b_desc != nullptr); + if (bias) + CHECK_OR_RETURN( + b_desc->ndim() == 1 && b_desc->dim(0) == out_features, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + + +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(LinearInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + y_desc->dtype(), + x_desc->dim(0), + y_desc->dim(0), + y_desc->stride(0), + x_desc->stride(0), + w_desc->stride(1), + w_desc->stride(0), + bias ? 
b_desc->stride(0) : 0, + bias +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __LINEAR_INFO_H__ diff --git a/src/infiniop/ops/linear/linear.h b/src/infiniop/ops/linear/linear.h new file mode 100644 index 000000000..5e7e8e022 --- /dev/null +++ b/src/infiniop/ops/linear/linear.h @@ -0,0 +1,50 @@ +#ifndef __LINEAR_H__ +#define __LINEAR_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::linear::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LinearInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + LinearInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t w_desc, \ + infiniopTensorDescriptor_t b_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * y, \ + const void * x, \ + const void * w, \ + const void * b, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/linear/metax/linear_metax.h b/src/infiniop/ops/linear/metax/linear_metax.h new file mode 100644 index 000000000..736ef96b8 --- /dev/null +++ b/src/infiniop/ops/linear/metax/linear_metax.h @@ -0,0 +1,8 @@ +#ifndef __LINEAR_METAX_H__ +#define __LINEAR_METAX_H__ + +#include "../linear.h" + +DESCRIPTOR(metax) + +#endif // __LINEAR_METAX_H__ diff --git a/src/infiniop/ops/linear/metax/linear_metax.maca b/src/infiniop/ops/linear/metax/linear_metax.maca new file mode 100644 index 000000000..7b9765c79 --- /dev/null +++ b/src/infiniop/ops/linear/metax/linear_metax.maca @@ -0,0 +1,151 @@ +#include "../../../devices/metax/metax_common.h" +#include "linear_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + size_t in_features, + size_t out_features, + ptrdiff_t y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + ptrdiff_t b_stride, + bool bias +) { + linearKernel( + y, + x, + w, + b, + in_features, + out_features, + y_stride, + x_stride, + w_stride_out, + w_stride_in, + b_stride, + bias + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear( + const LinearInfo &info, + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + hcStream_t stream +) { + launchKernel<1, Tdata, float><<>>( + y, + x, + w, + b, + info.in_features, + info.out_features, + info.y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.b_stride, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel 
------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = LinearInfo::createLinearInfo( + y_desc, + x_desc, + w_desc, + b_desc + ); + CHECK_RESULT(result); + const LinearInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \ + calculate_linear(_info, (TDATA *)y, (const TDATA *)x, (const TDATA *)w, (const TDATA *)b, stream) + #define CALCULATE_LINEAR_WITH_METAX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_LINEAR_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_LINEAR_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_WITH_METAX_BLOCK + #undef CALCULATE_LINEAR +} +} // namespace op::linear::metax diff --git a/src/infiniop/ops/linear/nvidia/linear_nvidia.cu b/src/infiniop/ops/linear/nvidia/linear_nvidia.cu new file mode 100644 index 000000000..98d897d12 --- /dev/null +++ b/src/infiniop/ops/linear/nvidia/linear_nvidia.cu @@ -0,0 +1,155 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "linear_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + size_t in_features, + size_t out_features, + ptrdiff_t y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + ptrdiff_t b_stride, + bool bias +) { + linearKernel( + y, + x, + w, + b, + in_features, + out_features, + y_stride, + x_stride, + w_stride_out, + w_stride_in, + b_stride, + bias + ); +} +// 
----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear( + const LinearInfo &info, + Tdata * y, + const Tdata * x, + const Tdata * w, + const Tdata * b, + cudaStream_t stream +) { + + launchKernel<1, Tdata, float><<>>( + y, + x, + w, + b, + info.in_features, + info.out_features, + info.y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.b_stride, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; + //size_t workspace_size = reinterpret_cast(y_desc)->workspaceSize(); +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = LinearInfo::createLinearInfo( + y_desc, + x_desc, + w_desc, + b_desc + ); + CHECK_RESULT(result); + const LinearInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \ + calculate_linear(_info, (TDATA *)y, (const TDATA *)x, (const TDATA *)w, (const TDATA *)b, stream) + #define CALCULATE_LINEAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_WITH_BLOCK_SIZE + #undef CALCULATE_LINEAR +} +} // namespace op::linear::nvidia diff --git a/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh b/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh new file mode 100644 index 000000000..fdc3ddf64 --- /dev/null +++ 
b/src/infiniop/ops/linear/nvidia/linear_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LINEAR_NVIDIA_API_H__ +#define __LINEAR_NVIDIA_API_H__ +#include "../linear.h" + +DESCRIPTOR(nvidia) + +#endif // __LINEAR_NVIDIA_API_H__ diff --git a/src/infiniop/ops/linear/operator.cc b/src/infiniop/ops/linear/operator.cc new file mode 100644 index 000000000..7f741ffad --- /dev/null +++ b/src/infiniop/ops/linear/operator.cc @@ -0,0 +1,156 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/linear.h" + +#ifdef ENABLE_CPU_API +#include "cpu/linear_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/linear_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/linear_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLinearDescriptor( + infiniopHandle_t handle, + infiniopLinearDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc, + infiniopTensorDescriptor_t b_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::linear::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + x_desc, \ + w_desc, \ + b_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLinearWorkspaceSize(infiniopLinearDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLinear( + infiniopLinearDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * y, + const void * x, + const void * w, + const void * b, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + y, \ + x, \ + w, \ + b, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLinearDescriptor(infiniopLinearDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, 
nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.cc b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.cc new file mode 100644 index 000000000..006626a4a --- /dev/null +++ b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.cc @@ -0,0 +1,110 @@ +#include "linear_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::linear_backward::cpu { + +template +infiniStatus_t calculate_linear_backward( + const LinearBackwardInfo &info, + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + // #pragma omp parallel for + for (size_t i = 0; i < info.in_features; i ++) + { + auto w_ptr = w + i * info.w_stride_in; + auto grad_w_ptr = grad_w + i * info.grad_w_stride_in; + float grad_x_sum = 0.; + float x_value = utils::cast(*(x + i * info.x_stride)); + for (size_t j = 0; j < info.out_features; j ++) + { + float grad_y_value = utils::cast(*(grad_y + j * info.grad_y_stride)); + grad_x_sum += grad_y_value * utils::cast(*(w_ptr + j * info.w_stride_out)); + (*(grad_w_ptr + j * info.grad_w_stride_out)) = utils::cast(x_value * grad_y_value); + if (info.bias && i == 0) + (*(grad_b + j * info.grad_b_stride)) = utils::cast(grad_y_value); + } + + *(grad_x + i * info.grad_x_stride) = utils::cast(grad_x_sum); + } +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = grad_y_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = LinearBackwardInfo::createLinearBackwardInfo( + grad_x_desc, + grad_w_desc, + grad_b_desc, + grad_y_desc, + x_desc, + w_desc + ); + CHECK_RESULT(result); + const LinearBackwardInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream +) const { + + #define CALCULATE_LINEAR_BACKWARD(TDATA) \ + CHECK_STATUS(calculate_linear_backward(_info, \ + (TDATA *)grad_x, (TDATA *)grad_w, (TDATA *)grad_b, (const TDATA *)grad_y, (const TDATA *)x, (const TDATA *)w)) + + if (_info.dtype == INFINI_DTYPE_F16) { + CALCULATE_LINEAR_BACKWARD(fp16_t); + } else if (_info.dtype == INFINI_DTYPE_BF16) { + CALCULATE_LINEAR_BACKWARD(bf16_t); + 
} else if (_info.dtype == INFINI_DTYPE_F32) { + CALCULATE_LINEAR_BACKWARD(float); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + #undef CALCULATE_LINEAR_BACKWARD + + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.h b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.h new file mode 100644 index 000000000..849638877 --- /dev/null +++ b/src/infiniop/ops/linear_backward/cpu/linear_backward_cpu.h @@ -0,0 +1,9 @@ +#ifndef __LINEAR_BACKWARD_CPU_H__ +#define __LINEAR_BACKWARD_CPU_H__ + +#include "../linear_backward.h" + +DESCRIPTOR(cpu) + + +#endif // __LINEAR_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/linear_backward/cuda/kernel.cuh b/src/infiniop/ops/linear_backward/cuda/kernel.cuh new file mode 100644 index 000000000..8ba6635ff --- /dev/null +++ b/src/infiniop/ops/linear_backward/cuda/kernel.cuh @@ -0,0 +1,41 @@ +#ifndef __LINEAR_BACKWARD_KERNEL_CUH__ +#define __LINEAR_BACKWARD_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void linearBackwardKernel( + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + size_t out_features, + ptrdiff_t grad_x_stride, + ptrdiff_t grad_w_stride_out, + ptrdiff_t grad_w_stride_in, + ptrdiff_t grad_b_stride, + ptrdiff_t grad_y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + bool bias +) { + size_t in_index = blockIdx.x; + + auto w_ptr = w + in_index * w_stride_in; + auto grad_w_ptr = grad_w + in_index * grad_w_stride_in; + Tcompute grad_x_sum = 0.; + Tcompute x_value = *(x + in_index * x_stride); + for (size_t j = 0; j < out_features; j ++) + { + Tcompute grad_y_value = *(grad_y + j * grad_y_stride); + grad_x_sum += grad_y_value * Tcompute(*(w_ptr + j * w_stride_out)); + (*(grad_w_ptr + j * grad_w_stride_out)) = x_value * grad_y_value; + if (bias && blockIdx.x == 0) + (*(grad_b + j * grad_b_stride)) = grad_y_value; + } + (*(grad_x + in_index * grad_x_stride)) = grad_x_sum; +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __LINEAR_BACKWARD_KERNEL_CUH__ diff --git a/src/infiniop/ops/linear_backward/info.h b/src/infiniop/ops/linear_backward/info.h new file mode 100644 index 000000000..0c474e6dc --- /dev/null +++ b/src/infiniop/ops/linear_backward/info.h @@ -0,0 +1,82 @@ +#ifndef __LINEAR_BACKWARD_INFO_H__ +#define __LINEAR_BACKWARD_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::linear_backward { + +class LinearBackwardInfo { +private: + LinearBackwardInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t in_features; + size_t out_features; + ptrdiff_t grad_x_stride; + ptrdiff_t grad_w_stride_out; + ptrdiff_t grad_w_stride_in; + ptrdiff_t grad_b_stride; + ptrdiff_t grad_y_stride; + ptrdiff_t x_stride; + ptrdiff_t w_stride_out; + ptrdiff_t w_stride_in; + bool bias; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createLinearBackwardInfo( + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc + ) { +// 
------------------------- start: check tensor shape and input validity ------------------------- + + CHECK_SAME_SHAPE(x_desc->shape(), grad_x_desc->shape()); + CHECK_SAME_SHAPE(w_desc->shape(), grad_w_desc->shape()); + CHECK_OR_RETURN( + w_desc->ndim() == 2 && x_desc->ndim() == 1 && grad_y_desc->ndim() == 1, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t out = grad_y_desc->dim(0); + size_t in = x_desc->dim(0); + CHECK_OR_RETURN( + w_desc->dim(0) == out && w_desc->dim(1) == in, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + bool bias = (grad_b_desc != nullptr); + if (bias) + CHECK_OR_RETURN( + grad_b_desc->ndim() == 1 && grad_b_desc->dim(0) == out, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + + +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(LinearBackwardInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + grad_y_desc->dtype(), + in, + out, + grad_x_desc->stride(0), + grad_w_desc->stride(0), + grad_w_desc->stride(1), + bias ? (grad_b_desc->stride(0)) : 0, + grad_y_desc->stride(0), + x_desc->stride(0), + w_desc->stride(0), + w_desc->stride(1), + bias +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __LINEAR_BACKWARD_INFO_H__ diff --git a/src/infiniop/ops/linear_backward/linear_backward.h b/src/infiniop/ops/linear_backward/linear_backward.h new file mode 100644 index 000000000..1c082fa3c --- /dev/null +++ b/src/infiniop/ops/linear_backward/linear_backward.h @@ -0,0 +1,54 @@ +#ifndef __LINEAR_BACKWARD_H__ +#define __LINEAR_BACKWARD_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::linear_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + LinearBackwardInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + LinearBackwardInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_x_desc, \ + infiniopTensorDescriptor_t grad_w_desc, \ + infiniopTensorDescriptor_t grad_b_desc, \ + infiniopTensorDescriptor_t grad_y_desc, \ + infiniopTensorDescriptor_t x_desc, \ + infiniopTensorDescriptor_t w_desc \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * grad_x, \ + void * grad_w, \ + void * grad_b, \ + const void * grad_y, \ + const void * x, \ + const void * w, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/linear_backward/metax/linear_backward_metax.h b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.h new file mode 100644 index 000000000..b926cee89 --- /dev/null +++ b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __LINEAR_BACKWARD_METAX_H__ +#define __LINEAR_BACKWARD_METAX_H__ + +#include "../linear_backward.h" + +DESCRIPTOR(metax) + +#endif // __LINEAR_BACKWARD_METAX_H__ diff --git 
a/src/infiniop/ops/linear_backward/metax/linear_backward_metax.maca b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.maca new file mode 100644 index 000000000..b3a6e596f --- /dev/null +++ b/src/infiniop/ops/linear_backward/metax/linear_backward_metax.maca @@ -0,0 +1,171 @@ +#include "../../../devices/metax/metax_common.h" +#include "linear_backward_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear_backward::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + size_t out_features, + ptrdiff_t grad_x_stride, + ptrdiff_t grad_w_stride_out, + ptrdiff_t grad_w_stride_in, + ptrdiff_t grad_b_stride, + ptrdiff_t grad_y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + bool bias +) { + linearBackwardKernel( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + out_features, + grad_x_stride, + grad_w_stride_out, + grad_w_stride_in, + grad_b_stride, + grad_y_stride, + x_stride, + w_stride_out, + w_stride_in, + bias + ); +} + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear_backward( + const LinearBackwardInfo &info, + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + hcStream_t stream +) { + launchKernel<1, Tdata, float><<>>( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + info.out_features, + info.grad_x_stride, + info.grad_w_stride_out, + info.grad_w_stride_in, + info.grad_b_stride, + info.grad_y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = grad_x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = LinearBackwardInfo::createLinearBackwardInfo( + grad_x_desc, + grad_w_desc, + grad_b_desc, + grad_y_desc, + x_desc, + w_desc + ); + CHECK_RESULT(result); + const LinearBackwardInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return 
INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, TDATA) \ + calculate_linear_backward(_info, (TDATA *)grad_x, (TDATA *)grad_w, (TDATA *)grad_b, (const TDATA *)grad_y, (const TDATA *)x, (const TDATA *)w, stream) + #define CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, cuda_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_BACKWARD_WITH_METAX_BLOCK + #undef CALCULATE_LINEAR_BACKWARD +} +} // namespace op::linear_backward::metax diff --git a/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cu b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cu new file mode 100644 index 000000000..5102d1bb3 --- /dev/null +++ b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cu @@ -0,0 +1,172 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "linear_backward_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::linear_backward::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + size_t out_features, + ptrdiff_t grad_x_stride, + ptrdiff_t grad_w_stride_out, + ptrdiff_t grad_w_stride_in, + ptrdiff_t grad_b_stride, + ptrdiff_t grad_y_stride, + ptrdiff_t x_stride, + ptrdiff_t w_stride_out, + ptrdiff_t w_stride_in, + bool bias +) { + linearBackwardKernel( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + out_features, + grad_x_stride, + grad_w_stride_out, + grad_w_stride_in, + grad_b_stride, + grad_y_stride, + x_stride, + w_stride_out, + w_stride_in, + bias + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_linear_backward( + const LinearBackwardInfo &info, + Tdata * grad_x, + Tdata * grad_w, + Tdata * grad_b, + const Tdata * grad_y, + const Tdata * x, + const Tdata * w, + cudaStream_t stream +) { + launchKernel<1, Tdata, float><<>>( + grad_x, + grad_w, + grad_b, + grad_y, + x, + w, + info.out_features, + info.grad_x_stride, + info.grad_w_stride_out, + info.grad_w_stride_in, + info.grad_b_stride, + info.grad_y_stride, + info.x_stride, + info.w_stride_out, + info.w_stride_in, + info.bias + ); + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + 
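As an editorial aside (not part of the patch): the kernels wired up above implement the usual single-sample gradients of a fully connected layer. With W of shape out x in, x of length in, and g = dL/dy of length out, exactly as the shape checks in info.h require, the quantities being computed are:

    % Reference gradients for y = W x + b
    \frac{\partial L}{\partial x} = W^{\top} g \in \mathbb{R}^{in}, \qquad
    \frac{\partial L}{\partial W} = g\,x^{\top} \in \mathbb{R}^{out \times in}, \qquad
    \frac{\partial L}{\partial b} = g \in \mathbb{R}^{out}

grad_b is only written when the bias flag carried in LinearBackwardInfo is set.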
+Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = grad_x_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = LinearBackwardInfo::createLinearBackwardInfo( + grad_x_desc, + grad_w_desc, + grad_b_desc, + grad_y_desc, + x_desc, + w_desc + ); + CHECK_RESULT(result); + const LinearBackwardInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, TDATA) \ + calculate_linear_backward(_info, (TDATA *)grad_x, (TDATA *)grad_w, (TDATA *)grad_b, (const TDATA *)grad_y, (const TDATA *)x, (const TDATA *)w, stream) + #define CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + { \ + if (_info.dtype == INFINI_DTYPE_F16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, half); \ + else if (_info.dtype == INFINI_DTYPE_F32) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, float); \ + else if (_info.dtype == INFINI_DTYPE_BF16) \ + return CALCULATE_LINEAR_BACKWARD(BLOCK_SIZE, __nv_bfloat16); \ + else \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_LINEAR_BACKWARD_WITH_BLOCK_SIZE + #undef CALCULATE_LINEAR_BACKWARD +} +} // namespace op::linear_backward::nvidia diff --git a/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cuh b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cuh new file mode 100644 index 000000000..e80ba5342 --- /dev/null +++ b/src/infiniop/ops/linear_backward/nvidia/linear_backward_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __LINEAR_BACKWARD_NVIDIA_API_H__ +#define __LINEAR_BACKWARD_NVIDIA_API_H__ +#include "../linear_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __LINEAR_BACKWARD_NVIDIA_API_H__ diff --git a/src/infiniop/ops/linear_backward/operator.cc b/src/infiniop/ops/linear_backward/operator.cc new file mode 100644 index 000000000..677442d31 --- /dev/null +++ b/src/infiniop/ops/linear_backward/operator.cc @@ 
-0,0 +1,164 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/linear_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/linear_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/linear_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/linear_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateLinearBackwardDescriptor( + infiniopHandle_t handle, + infiniopLinearBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_x_desc, + infiniopTensorDescriptor_t grad_w_desc, + infiniopTensorDescriptor_t grad_b_desc, + infiniopTensorDescriptor_t grad_y_desc, + infiniopTensorDescriptor_t x_desc, + infiniopTensorDescriptor_t w_desc +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::linear_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_x_desc, \ + grad_w_desc, \ + grad_b_desc, \ + grad_y_desc, \ + x_desc, \ + w_desc \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetLinearBackwardWorkspaceSize(infiniopLinearBackwardDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopLinearBackward( + infiniopLinearBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * grad_x, + void * grad_w, + void * grad_b, + const void * grad_y, + const void * x, + const void * w, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + grad_x, \ + grad_w, \ + grad_b, \ + grad_y, \ + x, \ + w, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyLinearBackwardDescriptor(infiniopLinearBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); 
+#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.cc b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc new file mode 100644 index 000000000..c47a38ef9 --- /dev/null +++ b/src/infiniop/ops/scatter/cpu/scatter_cpu.cc @@ -0,0 +1,108 @@ +#include "scatter_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::scatter::cpu { + +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + char * output, + const char * input, + const int64_t * index +) { +// -------------------------------- start: perform operator on CPU -------------------------------- + std::vector contiguous_strides(info.ndim); + ptrdiff_t last_dim = 1; + ptrdiff_t last_stride = 1; + for(size_t d = 0; d < info.ndim; d ++) + { + if (d == info.dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + size_t batch_size = last_dim * last_stride; + int scatter_dim = int(info.dim); + size_t element_size = infiniSizeOf(info.dtype); + + #pragma omp parallel for + for (size_t n = 0; n < batch_size; n ++) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = n; + for(int d = info.ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * element_size * info.output_strides[d]; + input_ptr += dim_index * element_size * info.input_strides[d]; + index_ptr += dim_index * info.index_strides[d]; + } + for (size_t c = 0; c < info.index_shape[scatter_dim]; c ++) { + int64_t scatter_number = *(index_ptr + c * info.index_strides[scatter_dim]); + memcpy( + output_ptr + scatter_number * element_size * info.output_strides[scatter_dim], + input_ptr + c * element_size * info.input_strides[scatter_dim], + element_size + ); + } + } + +// --------------------------------- end: perform operator on CPU --------------------------------- + return INFINI_STATUS_SUCCESS; +} + + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) const { + + return calculate_scatter(_info, (char *)output, (const char *)input, (const int64_t *)index); +} +} diff --git a/src/infiniop/ops/scatter/cpu/scatter_cpu.h b/src/infiniop/ops/scatter/cpu/scatter_cpu.h new file mode 100644 index 000000000..bf2fcb7a1 --- /dev/null +++ 
b/src/infiniop/ops/scatter/cpu/scatter_cpu.h @@ -0,0 +1,9 @@ +#ifndef __SCATTER_CPU_H__ +#define __SCATTER_CPU_H__ + +#include "../scatter.h" + +DESCRIPTOR(cpu) + + +#endif // __SCATTER_CPU_H__ diff --git a/src/infiniop/ops/scatter/cuda/kernel.cuh b/src/infiniop/ops/scatter/cuda/kernel.cuh new file mode 100644 index 000000000..6c4de6ce5 --- /dev/null +++ b/src/infiniop/ops/scatter/cuda/kernel.cuh @@ -0,0 +1,38 @@ +#ifndef __SCATTER_KERNEL_CUH__ +#define __SCATTER_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template <unsigned int BLOCK_SIZE, typename Tdata> +__device__ void scatterKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + auto output_ptr = output; + auto input_ptr = input; + auto index_ptr = index; + size_t rem = blockIdx.x; + for(int d = ndim - 1; d >= 0; d --) { + if (d == scatter_dim) + continue; + size_t dim_index = rem / contiguous_strides[d]; + rem = rem % contiguous_strides[d]; + output_ptr += dim_index * output_strides[d]; + input_ptr += dim_index * input_strides[d]; + index_ptr += dim_index * index_strides[d]; + } + for (size_t c = threadIdx.x; c < index_scatter_size; c += BLOCK_SIZE) { + int64_t scatter_number = *(index_ptr + c * index_strides[scatter_dim]); + *(output_ptr + scatter_number * output_strides[scatter_dim]) = \ + *(input_ptr + c * input_strides[scatter_dim]); + } +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __SCATTER_KERNEL_CUH__ diff --git a/src/infiniop/ops/scatter/info.h b/src/infiniop/ops/scatter/info.h new file mode 100644 index 000000000..9f21e435c --- /dev/null +++ b/src/infiniop/ops/scatter/info.h @@ -0,0 +1,67 @@ +#ifndef __SCATTER_INFO_H__ +#define __SCATTER_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::scatter { + +class ScatterInfo { +private: + ScatterInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t ndim; + std::vector<size_t> output_shape; + std::vector<size_t> input_shape; + std::vector<size_t> index_shape; + std::vector<ptrdiff_t> output_strides; + std::vector<ptrdiff_t> input_strides; + std::vector<ptrdiff_t> index_strides; + size_t dim; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result<ScatterInfo> createScatterInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_OR_RETURN( + input_desc->ndim() == output_desc->ndim() && output_desc->ndim() == index_desc->ndim(), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t ndim = output_desc->ndim(); + for (size_t d = 0; d < ndim; d ++){ + if(d != dim) { + CHECK_OR_RETURN( + index_desc->dim(d) <= input_desc->dim(d) && index_desc->dim(d) <= output_desc->dim(d), + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + } + } + CHECK_OR_RETURN(index_desc->dim(dim) <= input_desc->dim(dim), INFINI_STATUS_BAD_TENSOR_SHAPE); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result<ScatterInfo>(ScatterInfo{ +// ------------------------------ start: create an instance of Info 
------------------------------- + output_desc->dtype(), + ndim, + output_desc->shape(), + input_desc->shape(), + index_desc->shape(), + output_desc->strides(), + input_desc->strides(), + index_desc->strides(), + dim +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __SCATTER_INFO_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.h b/src/infiniop/ops/scatter/metax/scatter_metax.h new file mode 100644 index 000000000..d5ce0ef16 --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.h @@ -0,0 +1,8 @@ +#ifndef __SCATTER_METAX_H__ +#define __SCATTER_METAX_H__ + +#include "../scatter.h" + +DESCRIPTOR(metax) + +#endif // __SCATTER_METAX_H__ diff --git a/src/infiniop/ops/scatter/metax/scatter_metax.maca b/src/infiniop/ops/scatter/metax/scatter_metax.maca new file mode 100644 index 000000000..1c742f60d --- /dev/null +++ b/src/infiniop/ops/scatter/metax/scatter_metax.maca @@ -0,0 +1,190 @@ +#include "../../../devices/metax/metax_common.h" +#include "scatter_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::metax { + +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + hcStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_METAX(hcMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + CHECK_METAX(hcMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, hcMemcpyHostToDevice, stream)); + + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + 
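    // Editorial note, not part of the original patch: blockIdx.x selects one slice with
    // every dimension except scatter_dim fixed (decoded through contiguous_strides), and
    // threads then stride over scatter_dim, so the launch above realizes
    //     output[..., index[..., c, ...], ...] = input[..., c, ...]
    // i.e. a PyTorch-style scatter along `dim`. Also note that `contiguous_strides` is
    // pageable host memory; hcMemcpyAsync from pageable memory is expected to return only
    // after the source has been staged (as its CUDA counterpart does), which is what makes
    // the delete[] below safe. If this buffer is ever switched to pinned memory, the stream
    // must be synchronized before freeing it.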
delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = ScatterInfo::createScatterInfo( + output_desc, + input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + + #define CALCULATE_SCATTER_WITH_METAX_BLOCK(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_METAX_BLOCK(METAX_BLOCK_SIZE_512) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_SCATTER_WITH_METAX_BLOCK + #undef CALCULATE_SCATTER +} +} // namespace op::scatter::metax diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu new file mode 100644 index 000000000..7d6e1a1a1 --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cu @@ -0,0 +1,190 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" 
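Before the CUDA port of the same kernel, a minimal host-side reference of the intended semantics may help (editorial sketch, not part of the patch; plain contiguous 2-D case with dim == 1, arbitrary names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Reference: scatter along dim == 1 for a dense rows x cols problem.
// output[i][index[i][j]] = input[i][j], which is what scatterKernel computes
// once the stride bookkeeping is folded away.
int main() {
    const int rows = 2, cols = 3, out_cols = 4;
    std::vector<float> input = {1, 2, 3, 4, 5, 6};       // 2 x 3
    std::vector<int64_t> index = {0, 2, 3, 1, 0, 2};     // 2 x 3, every value < out_cols
    std::vector<float> output(rows * out_cols, 0.0f);    // 2 x 4, untouched slots keep 0

    for (int i = 0; i < rows; ++i)
        for (int j = 0; j < cols; ++j)
            output[i * out_cols + index[i * cols + j]] = input[i * cols + j];

    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < out_cols; ++j)
            std::printf("%g ", output[i * out_cols + j]);
        std::printf("\n");
    }
    return 0;
}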
+#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" + +#include "scatter_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::scatter::nvidia { + +// ---------------------- start: launchKernel: call kernel function of CUDA ----------------------- +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + const int64_t * index, + size_t ndim, + size_t index_scatter_size, + ptrdiff_t * output_strides, + ptrdiff_t * input_strides, + ptrdiff_t * index_strides, + ptrdiff_t * contiguous_strides, + int scatter_dim +) { + scatterKernel( + output, + input, + index, + ndim, + index_scatter_size, + output_strides, + input_strides, + index_strides, + contiguous_strides, + scatter_dim + ); +} +// ----------------------- end: launchKernel: call kernel function of CUDA ------------------------ + +// ----------------------------------- start: call launchKernel ----------------------------------- +template +infiniStatus_t calculate_scatter( + const ScatterInfo &info, + Tdata * output, + const Tdata * input, + const int64_t * index, + cudaStream_t stream, + void * workspace +) { + size_t ndim = info.ndim; + ptrdiff_t * contiguous_strides = new ptrdiff_t[ndim]; + size_t last_dim = 1, last_stride = 1; + size_t scatter_dim = info.dim; + for(size_t d = 0; d < ndim; d ++) + { + if (d == scatter_dim) + continue; + contiguous_strides[d] = last_dim * last_stride; + last_dim = info.index_shape[d]; + last_stride = contiguous_strides[d]; + } + + size_t batch_size = last_dim * last_stride; + + ptrdiff_t * contiguous_strides_cuda = reinterpret_cast(workspace); + ptrdiff_t * input_strides_cuda = contiguous_strides_cuda + ndim; + ptrdiff_t * output_strides_cuda = input_strides_cuda + ndim; + ptrdiff_t * index_strides_cuda = output_strides_cuda + ndim; + + CHECK_CUDA(cudaMemcpyAsync(contiguous_strides_cuda, contiguous_strides, sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(input_strides_cuda, info.input_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(output_strides_cuda, info.output_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + CHECK_CUDA(cudaMemcpyAsync(index_strides_cuda, info.index_strides.data(), sizeof(ptrdiff_t) * ndim, cudaMemcpyHostToDevice, stream)); + + launchKernel<<>>( + output, + input, + index, + ndim, + info.index_shape[scatter_dim], + output_strides_cuda, + input_strides_cuda, + index_strides_cuda, + contiguous_strides_cuda, + scatter_dim + ); + delete[] contiguous_strides; + return INFINI_STATUS_SUCCESS; +} +// ------------------------------------ end: call launchKernel ------------------------------------ + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = sizeof(ptrdiff_t) * input_desc->ndim() * 4; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = ScatterInfo::createScatterInfo( + output_desc, + 
input_desc, + index_desc, + dim + ); + CHECK_RESULT(result); + const ScatterInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + #define CALCULATE_SCATTER(BLOCK_SIZE, TDATA) \ + calculate_scatter(_info, (TDATA *)output, (const TDATA *)input, (const int64_t *)index, stream, workspace) + #define CALCULATE_SCATTER_WITH_BLOCK_SIZE(BLOCK_SIZE) \ + switch (_info.dtype) { \ + case INFINI_DTYPE_BOOL: \ + return CALCULATE_SCATTER(BLOCK_SIZE, bool); \ + case INFINI_DTYPE_U8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint8_t); \ + case INFINI_DTYPE_U16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint16_t); \ + case INFINI_DTYPE_U32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint32_t); \ + case INFINI_DTYPE_U64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, uint64_t); \ + case INFINI_DTYPE_I8: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int8_t); \ + case INFINI_DTYPE_I16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int16_t); \ + case INFINI_DTYPE_I32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int32_t); \ + case INFINI_DTYPE_I64: \ + return CALCULATE_SCATTER(BLOCK_SIZE, int64_t); \ + case INFINI_DTYPE_F16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, half); \ + case INFINI_DTYPE_F32: \ + return CALCULATE_SCATTER(BLOCK_SIZE, float); \ + case INFINI_DTYPE_BF16: \ + return CALCULATE_SCATTER(BLOCK_SIZE, cuda_bfloat16); \ + default: \ + return INFINI_STATUS_BAD_TENSOR_DTYPE; \ + } + + + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512) + else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) + CALCULATE_SCATTER_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096) + else + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + + #undef CALCULATE_SCATTER_WITH_BLOCK_SIZE + #undef CALCULATE_SCATTER + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::scatter::nvidia diff --git a/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh new file mode 100644 index 000000000..a199edb6e --- /dev/null +++ b/src/infiniop/ops/scatter/nvidia/scatter_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __SCATTER_NVIDIA_API_H__ +#define __SCATTER_NVIDIA_API_H__ +#include "../scatter.h" + +DESCRIPTOR(nvidia) + +#endif // __SCATTER_NVIDIA_API_H__ diff --git a/src/infiniop/ops/scatter/operator.cc b/src/infiniop/ops/scatter/operator.cc new file mode 100644 index 000000000..7c7de71b5 --- /dev/null +++ b/src/infiniop/ops/scatter/operator.cc @@ -0,0 +1,154 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/scatter.h" + +#ifdef ENABLE_CPU_API +#include "cpu/scatter_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/scatter_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/scatter_metax.h" +#endif + +__C infiniStatus_t infiniopCreateScatterDescriptor( + infiniopHandle_t handle, + infiniopScatterDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t 
output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t index_desc, + size_t dim +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::scatter::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + index_desc, \ + dim \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetScatterWorkspaceSize(infiniopScatterDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopScatter( + infiniopScatterDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + const void * index, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + index, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyScatterDescriptor(infiniopScatterDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/scatter/scatter.h b/src/infiniop/ops/scatter/scatter.h new file mode 100644 index 000000000..7d2deefc1 --- /dev/null +++ b/src/infiniop/ops/scatter/scatter.h @@ -0,0 +1,49 @@ +#ifndef __SCATTER_H__ +#define __SCATTER_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::scatter::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ScatterInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + ScatterInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t 
device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t index_desc, \ + size_t dim \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + const void * index, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/tril/cpu/tril_cpu.cc b/src/infiniop/ops/tril/cpu/tril_cpu.cc new file mode 100644 index 000000000..3b2d2f312 --- /dev/null +++ b/src/infiniop/ops/tril/cpu/tril_cpu.cc @@ -0,0 +1,81 @@ +#include "tril_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::tril::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = TrilInfo::createTrilInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TrilInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) const { + + #pragma omp parallel for + for (int row = 0; row < _info.row_size; row ++) + { + auto output_ptr = reinterpret_cast(output) + row * _info.column_size * _info.elem_size; + auto input_ptr = reinterpret_cast(input) + row * _info.column_size * _info.elem_size; + if (0 > row + _info.diagonal) + memset( + output_ptr, + 0, + _info.elem_size * _info.column_size + ); + else if (_info.column_size - 1 <= row + _info.diagonal) + memcpy( + output_ptr, + input_ptr, + _info.elem_size * _info.column_size + ); + else { + memset( + output_ptr + _info.elem_size * (row + _info.diagonal + 1), + 0, + _info.elem_size * (_info.column_size - row - _info.diagonal - 1) + ); + memcpy( + output_ptr, + input_ptr, + _info.elem_size * (row + _info.diagonal + 1) + ); + } + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/tril/cpu/tril_cpu.h b/src/infiniop/ops/tril/cpu/tril_cpu.h new file mode 100644 index 000000000..1410af08b --- /dev/null +++ b/src/infiniop/ops/tril/cpu/tril_cpu.h @@ -0,0 +1,9 @@ +#ifndef __TRIL_CPU_H__ +#define __TRIL_CPU_H__ + +#include "../tril.h" + +DESCRIPTOR(cpu) + + +#endif // __TRIL_CPU_H__ diff --git a/src/infiniop/ops/tril/cuda/kernel.cuh b/src/infiniop/ops/tril/cuda/kernel.cuh new file mode 100644 index 000000000..07b163524 --- /dev/null +++ b/src/infiniop/ops/tril/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __TRIL_KERNEL_CUH__ +#define __TRIL_KERNEL_CUH__ +// ------------------------------- start: perform 
operator on CUDA -------------------------------- +template +__device__ void trilKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + int row = blockIdx.x; + auto output_ptr = output + row * column_size; + auto input_ptr = input + row * column_size; + for (int i = threadIdx.x; i < column_size; i += BLOCK_SIZE) + *(output_ptr + i) = (i >= row + diagonal + 1) ? (Tdata)0 : *(input_ptr + i); +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __TRIL_KERNEL_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/tril/info.h b/src/infiniop/ops/tril/info.h new file mode 100644 index 000000000..6e0c1d87b --- /dev/null +++ b/src/infiniop/ops/tril/info.h @@ -0,0 +1,52 @@ +#ifndef __TRIL_INFO_H__ +#define __TRIL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::tril { + +class TrilInfo { +private: + TrilInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t elem_size; + int row_size; + int column_size; + int diagonal; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createTrilInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE( + input_desc->shape(), output_desc->shape() + ); + CHECK_OR_RETURN( + input_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t elem_size = infiniSizeOf(input_desc->dtype()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(TrilInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + input_desc->dtype(), + elem_size, + int(input_desc->dim(0)), + int(input_desc->dim(1)), + diagonal +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __TRIL_INFO_H__ diff --git a/src/infiniop/ops/tril/metax/tril_metax.h b/src/infiniop/ops/tril/metax/tril_metax.h new file mode 100644 index 000000000..35c4a0f72 --- /dev/null +++ b/src/infiniop/ops/tril/metax/tril_metax.h @@ -0,0 +1,8 @@ +#ifndef __TRIL_METAX_H__ +#define __TRIL_METAX_H__ + +#include "../tril.h" + +DESCRIPTOR(metax) + +#endif // __TRIL_METAX_H__ diff --git a/src/infiniop/ops/tril/metax/tril_metax.maca b/src/infiniop/ops/tril/metax/tril_metax.maca new file mode 100644 index 000000000..e2c59b634 --- /dev/null +++ b/src/infiniop/ops/tril/metax/tril_metax.maca @@ -0,0 +1,128 @@ +#include "../../../devices/metax/metax_common.h" +#include "tril_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::tril::metax { +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + trilKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + 
infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = TrilInfo::createTrilInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TrilInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_TRIL(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) + + switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIL(bool); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIL(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIL(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIL(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIL(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIL(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIL(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIL(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIL(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIL(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIL(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIL(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_TRIL +} +} // namespace op::tril::metax diff --git a/src/infiniop/ops/tril/nvidia/tril_nvidia.cu b/src/infiniop/ops/tril/nvidia/tril_nvidia.cu new file mode 100644 index 000000000..2038a1e4a --- /dev/null +++ b/src/infiniop/ops/tril/nvidia/tril_nvidia.cu @@ -0,0 +1,122 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "tril_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::tril::nvidia { +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + trilKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = TrilInfo::createTrilInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TrilInfo 
&info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_TRIL(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) + switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIL(uint8_t); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIL(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIL(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIL(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIL(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIL(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIL(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIL(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIL(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIL(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIL(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIL(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::tril::nvidia diff --git a/src/infiniop/ops/tril/nvidia/tril_nvidia.cuh b/src/infiniop/ops/tril/nvidia/tril_nvidia.cuh new file mode 100644 index 000000000..4db2fbb45 --- /dev/null +++ b/src/infiniop/ops/tril/nvidia/tril_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __TRIL_NVIDIA_API_H__ +#define __TRIL_NVIDIA_API_H__ +#include "../tril.h" + +DESCRIPTOR(nvidia) + +#endif // __TRIL_NVIDIA_API_H__ diff --git a/src/infiniop/ops/tril/operator.cc b/src/infiniop/ops/tril/operator.cc new file mode 100644 index 000000000..f7f519bde --- /dev/null +++ b/src/infiniop/ops/tril/operator.cc @@ -0,0 +1,150 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/tril.h" + +#ifdef ENABLE_CPU_API +#include "cpu/tril_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/tril_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/tril_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTrilDescriptor( + infiniopHandle_t handle, + infiniopTrilDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::tril::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + diagonal \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTrilWorkspaceSize(infiniopTrilDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch 
(desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopTril( + infiniopTrilDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTrilDescriptor(infiniopTrilDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/tril/tril.h b/src/infiniop/ops/tril/tril.h new file mode 100644 index 000000000..7d36709a6 --- /dev/null +++ b/src/infiniop/ops/tril/tril.h @@ -0,0 +1,47 @@ +#ifndef __TRIL_H__ +#define __TRIL_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::tril::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TrilInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + TrilInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + int diagonal \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/src/infiniop/ops/triu/cpu/triu_cpu.cc b/src/infiniop/ops/triu/cpu/triu_cpu.cc new file mode 100644 index 000000000..0650c5147 --- /dev/null +++ b/src/infiniop/ops/triu/cpu/triu_cpu.cc @@ -0,0 +1,82 @@ +#include "triu_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" +#include "../info.h" + +namespace op::triu::cpu { + +Descriptor::~Descriptor() = 
default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); + +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = input_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + + auto result = TriuInfo::createTriuInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TriuInfo &info = result.take(); + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + nullptr, + handle->device, handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) const { + + #pragma omp parallel for + for (int row = 0; row < _info.row_size; row ++) + { + auto output_ptr = reinterpret_cast(output) + row * _info.column_size * _info.elem_size; + auto input_ptr = reinterpret_cast(input) + row * _info.column_size * _info.elem_size; + if (_info.column_size - 1 < row + _info.diagonal) + memset( + output_ptr, + 0, + _info.elem_size * _info.column_size + ); + else if (0 >= row + _info.diagonal) + memcpy( + output_ptr, + input_ptr, + _info.elem_size * _info.column_size + ); + else { + memset( + output_ptr, + 0, + _info.elem_size * (row + _info.diagonal) + ); + memcpy( + output_ptr + _info.elem_size * (row + _info.diagonal), + input_ptr + _info.elem_size * (row + _info.diagonal), + _info.elem_size * (_info.column_size - row - _info.diagonal) + ); + } + + } + return INFINI_STATUS_SUCCESS; +} +} diff --git a/src/infiniop/ops/triu/cpu/triu_cpu.h b/src/infiniop/ops/triu/cpu/triu_cpu.h new file mode 100644 index 000000000..f58acfe7e --- /dev/null +++ b/src/infiniop/ops/triu/cpu/triu_cpu.h @@ -0,0 +1,9 @@ +#ifndef __TRIU_CPU_H__ +#define __TRIU_CPU_H__ + +#include "../triu.h" + +DESCRIPTOR(cpu) + + +#endif // __TRIU_CPU_H__ diff --git a/src/infiniop/ops/triu/cuda/kernel.cuh b/src/infiniop/ops/triu/cuda/kernel.cuh new file mode 100644 index 000000000..934d2d213 --- /dev/null +++ b/src/infiniop/ops/triu/cuda/kernel.cuh @@ -0,0 +1,19 @@ +#ifndef __TRIU_KERNEL_CUH__ +#define __TRIU_KERNEL_CUH__ +// ------------------------------- start: perform operator on CUDA -------------------------------- +template +__device__ void triuKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + int row = blockIdx.x; + auto output_ptr = output + row * column_size; + auto input_ptr = input + row * column_size; + for (int i = threadIdx.x; i < column_size; i += BLOCK_SIZE) + *(output_ptr + i) = (i < row + diagonal) ? 
(Tdata)0 : *(input_ptr + i); +} +// -------------------------------- end: perform operator on CUDA --------------------------------- + +#endif // __TRIU_KERNEL_CUH__ diff --git a/src/infiniop/ops/triu/info.h b/src/infiniop/ops/triu/info.h new file mode 100644 index 000000000..135e05c97 --- /dev/null +++ b/src/infiniop/ops/triu/info.h @@ -0,0 +1,52 @@ +#ifndef __TRIU_INFO_H__ +#define __TRIU_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::triu { + +class TriuInfo { +private: + TriuInfo() = default; + +public: +// ---------------------------- start: define member variables of Info ---------------------------- + infiniDtype_t dtype; + size_t elem_size; + int row_size; + int column_size; + int diagonal; + +// ----------------------------- end: define member variables of Info ----------------------------- + + static utils::Result createTriuInfo( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal + ) { +// ------------------------- start: check tensor shape and input validity ------------------------- + CHECK_SAME_SHAPE( + input_desc->shape(), output_desc->shape() + ); + CHECK_OR_RETURN( + input_desc->ndim() == 2, + INFINI_STATUS_BAD_TENSOR_SHAPE + ); + size_t elem_size = infiniSizeOf(input_desc->dtype()); +// -------------------------- end: check tensor shape and input validity -------------------------- + return utils::Result(TriuInfo{ +// ------------------------------ start: create an instance of Info ------------------------------- + input_desc->dtype(), + elem_size, + int(input_desc->dim(0)), + int(input_desc->dim(1)), + diagonal +// ------------------------------- end: create an instance of Info -------------------------------- + }); + } +}; +} + +#endif // __TRIU_INFO_H__ diff --git a/src/infiniop/ops/triu/metax/triu_metax.h b/src/infiniop/ops/triu/metax/triu_metax.h new file mode 100644 index 000000000..631d9188c --- /dev/null +++ b/src/infiniop/ops/triu/metax/triu_metax.h @@ -0,0 +1,8 @@ +#ifndef __TRIU_METAX_H__ +#define __TRIU_METAX_H__ + +#include "../triu.h" + +DESCRIPTOR(metax) + +#endif // __TRIU_METAX_H__ diff --git a/src/infiniop/ops/triu/metax/triu_metax.maca b/src/infiniop/ops/triu/metax/triu_metax.maca new file mode 100644 index 000000000..59e3275e1 --- /dev/null +++ b/src/infiniop/ops/triu/metax/triu_metax.maca @@ -0,0 +1,128 @@ +#include "../../../devices/metax/metax_common.h" +#include "triu_metax.h" +#include +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../reduce/cuda/reduce.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::triu::metax { +template +INFINIOP_METAX_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + triuKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + auto result = TriuInfo::createTriuInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TriuInfo &info = result.take(); + size_t WorkSpaceSize = 0; +// 
---------------------- end: check data type and calculate workspace size ----------------------- + + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + + hcStream_t stream = (hcStream_t)stream_; + + #define CALCULATE_TRIU(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) + + switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIU(bool); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIU(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIU(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIU(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIU(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIU(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIU(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIU(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIU(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIU(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIU(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIU(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + + #undef CALCULATE_TRIU +} +} // namespace op::triu::metax diff --git a/src/infiniop/ops/triu/nvidia/triu_nvidia.cu b/src/infiniop/ops/triu/nvidia/triu_nvidia.cu new file mode 100644 index 000000000..51761cf95 --- /dev/null +++ b/src/infiniop/ops/triu/nvidia/triu_nvidia.cu @@ -0,0 +1,123 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "triu_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../info.h" + +namespace op::triu::nvidia { + +template +INFINIOP_CUDA_KERNEL launchKernel( + Tdata * output, + const Tdata * input, + int column_size, + int diagonal +) { + triuKernel( + output, + input, + column_size, + diagonal + ); +} + + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { + auto handle = reinterpret_cast(handle_); +// --------------------- start: check data type and calculate workspace size ---------------------- + auto dtype = output_desc->dtype(); + size_t WorkSpaceSize = 0; +// ---------------------- end: check data type and calculate workspace size ----------------------- + auto result = TriuInfo::createTriuInfo( + output_desc, + input_desc, + diagonal + ); + CHECK_RESULT(result); + const TriuInfo &info = result.take(); + *desc_ptr = new Descriptor( + dtype, std::move(info), WorkSpaceSize, + new Opaque{handle->internal()}, + handle->device, handle->device_id + ); + return INFINI_STATUS_SUCCESS; +} + + +infiniStatus_t Descriptor::calculate( + void * workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream_ +) const { + if (workspace_size < _workspace_size) { + return 
INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + cudaStream_t stream = (cudaStream_t)stream_; + + #define CALCULATE_TRIU(TDATA) \ + launchKernel<256, TDATA><<<_info.row_size, 256, 0, stream>>>( \ + (TDATA *)output, \ + (const TDATA *)input, \ + _info.column_size, \ + _info.diagonal \ + ) +switch (_info.dtype) { + case INFINI_DTYPE_BOOL: + CALCULATE_TRIU(bool); + break; + case INFINI_DTYPE_U8: + CALCULATE_TRIU(uint8_t); + break; + case INFINI_DTYPE_U16: + CALCULATE_TRIU(uint16_t); + break; + case INFINI_DTYPE_U32: + CALCULATE_TRIU(uint32_t); + break; + case INFINI_DTYPE_U64: + CALCULATE_TRIU(uint64_t); + break; + case INFINI_DTYPE_I8: + CALCULATE_TRIU(int8_t); + break; + case INFINI_DTYPE_I16: + CALCULATE_TRIU(int16_t); + break; + case INFINI_DTYPE_I32: + CALCULATE_TRIU(int32_t); + break; + case INFINI_DTYPE_I64: + CALCULATE_TRIU(int64_t); + break; + case INFINI_DTYPE_F16: + CALCULATE_TRIU(half); + break; + case INFINI_DTYPE_F32: + CALCULATE_TRIU(float); + break; + case INFINI_DTYPE_BF16: + CALCULATE_TRIU(cuda_bfloat16); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} +} // namespace op::triu::nvidia diff --git a/src/infiniop/ops/triu/nvidia/triu_nvidia.cuh b/src/infiniop/ops/triu/nvidia/triu_nvidia.cuh new file mode 100644 index 000000000..1f86ebbd3 --- /dev/null +++ b/src/infiniop/ops/triu/nvidia/triu_nvidia.cuh @@ -0,0 +1,7 @@ +#ifndef __TRIU_NVIDIA_API_H__ +#define __TRIU_NVIDIA_API_H__ +#include "../triu.h" + +DESCRIPTOR(nvidia) + +#endif // __TRIU_NVIDIA_API_H__ diff --git a/src/infiniop/ops/triu/operator.cc b/src/infiniop/ops/triu/operator.cc new file mode 100644 index 000000000..c7a27a4ee --- /dev/null +++ b/src/infiniop/ops/triu/operator.cc @@ -0,0 +1,150 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/triu.h" + +#ifdef ENABLE_CPU_API +#include "cpu/triu_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/triu_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/triu_metax.h" +#endif + +__C infiniStatus_t infiniopCreateTriuDescriptor( + infiniopHandle_t handle, + infiniopTriuDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + int diagonal +) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::triu::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + diagonal \ + ) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetTriuWorkspaceSize(infiniopTriuDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET + + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} 
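For reference, the public Triu entry points dispatched above are meant to be driven in the usual create → query workspace → execute → destroy sequence. The following is a minimal host-side sketch of that flow using only the functions declared in this patch; it assumes the handle and the two tensor descriptors were created elsewhere, and it allocates the workspace with plain malloc and passes a NULL stream, which is only appropriate for the CPU backend (device backends would allocate the workspace on the device and pass a real stream).

#include <stdlib.h>
#include "infiniop.h"

/* Sketch only: `handle`, `output_desc`, and `input_desc` are assumed to have
 * been created by the caller; the malloc'd workspace and NULL stream are
 * CPU-backend assumptions. */
static infiniStatus_t run_triu(infiniopHandle_t handle,
                               infiniopTensorDescriptor_t output_desc,
                               infiniopTensorDescriptor_t input_desc,
                               void *output, const void *input, int diagonal) {
    infiniopTriuDescriptor_t desc;
    infiniStatus_t status = infiniopCreateTriuDescriptor(
        handle, &desc, output_desc, input_desc, diagonal);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    /* Ask the operator how much scratch memory it needs (0 for this CPU path). */
    size_t workspace_size = 0;
    status = infiniopGetTriuWorkspaceSize(desc, &workspace_size);
    if (status != INFINI_STATUS_SUCCESS) {
        infiniopDestroyTriuDescriptor(desc);
        return status;
    }

    void *workspace = workspace_size ? malloc(workspace_size) : NULL;
    status = infiniopTriu(desc, workspace, workspace_size, output, input,
                          /*stream=*/NULL);

    free(workspace);
    infiniopDestroyTriuDescriptor(desc);
    return status;
}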
+ +__C infiniStatus_t infiniopTriu( + infiniopTriuDescriptor_t desc, + void *workspace, + size_t workspace_size, + void * output, + const void * input, + void *stream +) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, \ + workspace_size, \ + output, \ + input, \ + stream \ + ) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyTriuDescriptor(infiniopTriuDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/triu/triu.h b/src/infiniop/ops/triu/triu.h new file mode 100644 index 000000000..c519e14f6 --- /dev/null +++ b/src/infiniop/ops/triu/triu.h @@ -0,0 +1,47 @@ +#ifndef __TRIU_H__ +#define __TRIU_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::triu::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + TriuInfo _info; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + TriuInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id \ + ) : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + int diagonal \ + ); \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void * output, \ + const void * input, \ + void *stream \ + ) const; \ + }; \ + } + +#endif \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/gather.py b/test/infiniop-test/test_generate/testcases/gather.py new file mode 100644 index 000000000..88e2abec6 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/gather.py @@ -0,0 +1,118 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def gather( + output: np.ndarray, + input: np.ndarray, + index: np.ndarray, + dim: int, +): + torch_output = torch.from_numpy(output) + torch_input = torch.from_numpy(input) + torch_index = torch.from_numpy(index) + + torch.gather(torch_input, dim, torch_index, out=torch_output) + return torch_output.detach().numpy() + + +class GatherTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + index: np.ndarray, + index_shape: List[int], + index_strides: List[int], + dim: int, + ): + super().__init__("gather") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.index = index + self.index_shape = index_shape + self.index_strides = index_strides + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_array(test_writer.gguf_key("index.shape"), self.index_shape) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + if self.index_strides is not None: + test_writer.add_array(test_writer.gguf_key("index.strides"), gguf_strides(*self.index_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("index"), self.index, raw_dtype=np_dtype_to_ggml(self.index.dtype) + ) + + ans = gather( + self.output, + self.input, + self.index, + self.dim, + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("gather.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # input_shape,output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [107, 10, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int32, np.int64, np.bool_] + for dtype in _TENSOR_DTYPES_: + for input_shape, output_shape, dim, input_strides, output_strides, index_strides in _TEST_CASES_: + input = (np.random.rand(*input_shape) * 100).astype(dtype) + index = np.random.randint(low=0, 
high=input_shape[dim], size=output_shape, dtype=np.int64) + output = np.empty(output_shape, dtype=dtype) + + test_case = GatherTestCase( + output=output, + output_shape=output_shape, + output_strides=output_strides, + input=input, + input_shape=input_shape, + input_strides=input_strides, + index=index, + index_shape=output_shape, + index_strides=index_strides, + dim=dim, + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/index_copy_inplace.py b/test/infiniop-test/test_generate/testcases/index_copy_inplace.py new file mode 100644 index 000000000..7bb314cbc --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/index_copy_inplace.py @@ -0,0 +1,121 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch +import random + +def index_copy_inplace( + output: np.ndarray, + input: np.ndarray, + index: np.ndarray, + dim: int, +): + torch_output = torch.from_numpy(output) + torch_input = torch.from_numpy(input) + torch_index = torch.from_numpy(index) + torch_output.index_copy_(dim, torch_index, torch_input) + return torch_output.detach().numpy() + + +class IndexCopyInplaceTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + index: np.ndarray, + index_shape: List[int], + index_strides: List[int], + dim: int, + ): + super().__init__("index_copy_inplace") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.index = index + self.index_shape = index_shape + self.index_strides = index_strides + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_array(test_writer.gguf_key("index.shape"), self.index_shape) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + if self.index_strides is not None: + test_writer.add_array(test_writer.gguf_key("index.strides"), gguf_strides(*self.index_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("index"), self.index, raw_dtype=np_dtype_to_ggml(self.index.dtype) + ) + + ans = index_copy_inplace( + self.output, + self.input, + self.index, + self.dim, + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = 
InfiniopTestWriter("index_copy_inplace.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # input_shape, output_shape, dim, output_strides, input_strides, index_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], None), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], [2]), + ([133, 23, 53], [133, 23, 53], 1, None, None, None), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, None), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int16, np.int32, np.bool_] + for dtype in _TENSOR_DTYPES_: + for input_shape, output_shape, dim, output_strides, input_strides, index_strides in _TEST_CASES_: + input = np.random.rand(*input_shape).astype(dtype) + + index_list = list(range(output_shape[dim])) + random.shuffle(index_list) + index = np.array(index_list[:input_shape[dim]], dtype=np.int64) + output = np.zeros(output_shape, dtype=dtype) + + test_case = IndexCopyInplaceTestCase( + output=output, + output_shape=output_shape, + output_strides=output_strides, + input=input, + input_shape=input_shape, + input_strides=input_strides, + index=index, + index_shape=[input_shape[dim]], + index_strides=index_strides, + dim=dim + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/linear.py b/test/infiniop-test/test_generate/testcases/linear.py new file mode 100644 index 000000000..8bf842d9d --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/linear.py @@ -0,0 +1,138 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def linear( + x: np.ndarray, + w: np.ndarray, + b: np.ndarray, + bias_exist: bool +): + torch_x = torch.from_numpy(x) + torch_w = torch.from_numpy(w) + if bias_exist: + torch_b = torch.from_numpy(b) + return torch.nn.functional.linear(torch_x, torch_w, bias=(torch_b if bias_exist else None)).detach().numpy() + + +class LinearTestCase(InfiniopTestCase): + def __init__( + self, + y: np.ndarray, + y_shape: List[int], + y_strides: List[int], + x: np.ndarray, + x_shape: List[int], + x_strides: List[int], + w: np.ndarray, + w_shape: List[int], + w_strides: List[int], + b: np.ndarray, + b_shape: List[int], + b_strides: List[int], + bias_exist: bool + ): + super().__init__("linear") + self.y = y + self.y_shape = y_shape + self.y_strides = y_strides + self.x = x + self.x_shape = x_shape + self.x_strides = x_strides + self.w = w + self.w_shape = w_shape + self.w_strides = w_strides + self.b = b + self.b_shape = b_shape + self.b_strides = b_strides + self.bias_exist = bias_exist + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) + test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + test_writer.add_array(test_writer.gguf_key("w.shape"), self.w_shape) + test_writer.add_array(test_writer.gguf_key("b.shape"), self.b_shape) + test_writer.add_bool(test_writer.gguf_key("bias_exist"), self.bias_exist) + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + if self.w_strides is not None: + test_writer.add_array(test_writer.gguf_key("w.strides"), gguf_strides(*self.w_strides)) + if self.b_strides is not None: + test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.b_strides)) + test_writer.add_array( + test_writer.gguf_key("y.strides"), + gguf_strides(*(self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.y_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("w"), self.w, raw_dtype=np_dtype_to_ggml(self.w.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.y.dtype) + ) + + ans = linear( + self.x.astype(np.float64), + self.w.astype(np.float64), + self.b.astype(np.float64), + self.bias_exist + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("linear.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # in_features, out_features, bias_exist, x_strides, y_strides, w_strides + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: 
+ for in_features, out_features, bias_exist, x_strides, y_strides, w_strides in _TEST_CASES_: + x = np.random.rand(in_features).astype(dtype) + w = np.random.rand(out_features, in_features).astype(dtype) + if bias_exist: + b = np.random.rand(out_features).astype(dtype) + else: + b = np.empty(shape=[], dtype=dtype) + y = np.empty(out_features, dtype=dtype) + + test_case = LinearTestCase( + y=y, + y_shape=[out_features], + y_strides=y_strides, + x=x, + x_shape=[in_features], + x_strides=x_strides, + w=w, + w_shape=[out_features, in_features], + w_strides=w_strides, + b=b, + b_shape=b.shape, + b_strides=None, + bias_exist=bias_exist + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/linear_backward.py b/test/infiniop-test/test_generate/testcases/linear_backward.py new file mode 100644 index 000000000..551f077da --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/linear_backward.py @@ -0,0 +1,196 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def linear_backward( + grad_y: np.ndarray, + x: np.ndarray, + w: np.ndarray, + b: np.ndarray, + bias_exist: bool +): + torch_grad_y = torch.from_numpy(grad_y) + torch_x = torch.from_numpy(x) + torch_w = torch.from_numpy(w) + + torch_x.requires_grad_(True) + torch_w.requires_grad_(True) + if bias_exist: + torch_b = torch.from_numpy(b) + torch_b.requires_grad_(True) + + torch_y = torch.nn.functional.linear(torch_x, torch_w, bias=(torch_b if bias_exist else None)) + torch_y.backward(torch_grad_y) + return torch_x.grad.detach().numpy(), \ + torch_w.grad.detach().numpy(), \ + torch_b.grad.detach().numpy() if bias_exist else np.empty([], dtype=np.float64) + + +class LinearBackwardTestCase(InfiniopTestCase): + def __init__( + self, + grad_x: np.ndarray, + grad_x_shape: List[int], + grad_x_strides: List[int], + grad_w: np.ndarray, + grad_w_shape: List[int], + grad_w_strides: List[int], + grad_b: np.ndarray, + grad_b_shape: List[int], + grad_b_strides: List[int], + grad_y: np.ndarray, + grad_y_shape: List[int], + grad_y_strides: List[int], + x: np.ndarray, + x_shape: List[int], + x_strides: List[int], + w: np.ndarray, + w_shape: List[int], + w_strides: List[int], + b: np.ndarray, + bias_exist: bool + ): + super().__init__("linear_backward") + self.grad_x = grad_x + self.grad_x_shape = grad_x_shape + self.grad_x_strides = grad_x_strides + self.grad_w = grad_w + self.grad_w_shape = grad_w_shape + self.grad_w_strides = grad_w_strides + self.grad_b = grad_b + self.grad_b_shape = grad_b_shape + self.grad_b_strides = grad_b_strides + self.grad_y = grad_y + self.grad_y_shape = grad_y_shape + self.grad_y_strides = grad_y_strides + self.x = x + self.x_shape = x_shape + self.x_strides = x_strides + self.w = w + self.w_shape = w_shape + self.w_strides = w_strides + + self.b = b + self.bias_exist = bias_exist + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("grad_x.shape"), self.grad_x_shape) + test_writer.add_array(test_writer.gguf_key("grad_w.shape"), self.grad_w_shape) + test_writer.add_array(test_writer.gguf_key("grad_b.shape"), self.grad_b_shape) + test_writer.add_array(test_writer.gguf_key("grad_y.shape"), self.grad_y_shape) + 
test_writer.add_array(test_writer.gguf_key("x.shape"), self.x_shape) + test_writer.add_array(test_writer.gguf_key("w.shape"), self.w_shape) + test_writer.add_bool(test_writer.gguf_key("bias_exist"), self.bias_exist) + if self.grad_y_strides is not None: + test_writer.add_array(test_writer.gguf_key("grad_y.strides"), gguf_strides(*self.grad_y_strides)) + if self.x_strides is not None: + test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)) + if self.w_strides is not None: + test_writer.add_array(test_writer.gguf_key("w.strides"), gguf_strides(*self.w_strides)) + test_writer.add_array( + test_writer.gguf_key("grad_x.strides"), + gguf_strides(*(self.grad_x_strides if self.grad_x_strides is not None else contiguous_gguf_strides(self.grad_x_shape))) + ) + test_writer.add_array( + test_writer.gguf_key("grad_w.strides"), + gguf_strides(*(self.grad_w_strides if self.grad_w_strides is not None else contiguous_gguf_strides(self.grad_w_shape))) + ) + test_writer.add_array( + test_writer.gguf_key("grad_b.strides"), + gguf_strides(*(self.grad_b_strides if self.grad_b_strides is not None else contiguous_gguf_strides(self.grad_b_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_x"), self.grad_x, raw_dtype=np_dtype_to_ggml(self.grad_x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_w"), self.grad_w, raw_dtype=np_dtype_to_ggml(self.grad_w.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_b"), self.grad_b, raw_dtype=np_dtype_to_ggml(self.grad_b.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("grad_y"), self.grad_y, raw_dtype=np_dtype_to_ggml(self.grad_y.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("w"), self.w, raw_dtype=np_dtype_to_ggml(self.w.dtype) + ) + + ans_grad_x, ans_grad_w, ans_grad_b = linear_backward( + self.grad_y.astype(np.float64), + self.x.astype(np.float64), + self.w.astype(np.float64), + self.b.astype(np.float64), + self.bias_exist + ) + + test_writer.add_tensor( + test_writer.gguf_key("ans_grad_x"), ans_grad_x, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + test_writer.add_tensor( + test_writer.gguf_key("ans_grad_w"), ans_grad_w, raw_dtype=gguf.GGMLQuantizationType.F64 + ) + test_writer.add_tensor( + test_writer.gguf_key("ans_grad_b"), ans_grad_b, raw_dtype=gguf.GGMLQuantizationType.F64 + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("linear_backward.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # in_features, out_features, bias_exist, grad_x_strides, grad_y_strides, grad_w_strides + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16] + for dtype in _TENSOR_DTYPES_: + for in_features, out_features, bias_exist, grad_x_strides, grad_y_strides, grad_w_strides in _TEST_CASES_: + grad_y = np.random.rand(out_features).astype(dtype) + x = np.random.rand(in_features).astype(dtype) + w = np.random.rand(out_features, in_features).astype(dtype) + b = 
np.random.rand(out_features).astype(dtype) if bias_exist else \ + np.empty([], dtype=dtype) + grad_x = np.empty([in_features], dtype=dtype) + grad_w = np.empty([out_features, in_features], dtype=dtype) + grad_b = np.empty([out_features], dtype=dtype) + + test_case = LinearBackwardTestCase( + grad_x=grad_x, + grad_x_shape=[in_features], + grad_x_strides=grad_x_strides, + grad_w=grad_w, + grad_w_shape=[out_features, in_features], + grad_w_strides=grad_w_strides, + grad_b=grad_b, + grad_b_shape=[out_features], + grad_b_strides=None, + grad_y=grad_y, + grad_y_shape=[out_features], + grad_y_strides=grad_y_strides, + x=x, + x_shape=[in_features], + x_strides=None, + w=w, + w_shape=[out_features, in_features], + w_strides=None, + b=b, + bias_exist=bias_exist + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/scatter.py b/test/infiniop-test/test_generate/testcases/scatter.py new file mode 100644 index 000000000..ea383c7fb --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/scatter.py @@ -0,0 +1,148 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided +import random +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch + +def scatter( + output: np.ndarray, + input: np.ndarray, + index: np.ndarray, + dim: int, +): + torch_output = torch.from_numpy(output) + torch_input = torch.from_numpy(input) + torch_index = torch.from_numpy(index) + + torch_output.scatter_(dim, torch_index, src=torch_input) + return torch_output.detach().numpy() + + +class ScatterTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + index: np.ndarray, + index_shape: List[int], + index_strides: List[int], + dim: int, + ): + super().__init__("scatter") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.index = index + self.index_shape = index_shape + self.index_strides = index_strides + self.dim = dim + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_array(test_writer.gguf_key("index.shape"), self.index_shape) + test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + if self.index_strides is not None: + test_writer.add_array(test_writer.gguf_key("index.strides"), gguf_strides(*self.index_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("index"), self.index, 
raw_dtype=np_dtype_to_ggml(self.index.dtype) + ) + + ans = scatter( + self.output, + self.input, + self.index, + self.dim, + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + # Build an index tensor whose entries are unique along `scatter_dim`, so the scatter result is deterministic. + index = np.empty(index_shape, dtype=np.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input_shape[0]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[i, :] = row[:index_shape[scatter_dim]] + elif ndim == 3 and scatter_dim == 2: + for i in range(input_shape[0]): + for j in range(input_shape[1]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[i, j, :] = row[:index_shape[scatter_dim]] + elif ndim == 3 and scatter_dim == 1: + for i in range(input_shape[0]): + for j in range(input_shape[2]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[i, :, j] = row[:index_shape[scatter_dim]] + elif ndim == 4 and scatter_dim == 0: + for i in range(input_shape[1]): + for j in range(input_shape[2]): + for k in range(input_shape[3]): + row = list(range(output_shape[scatter_dim])) + random.shuffle(row) + index[:, i, j, k] = row[:index_shape[scatter_dim]] + return index + +if __name__ == "__main__": + test_writer = InfiniopTestWriter("scatter.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (55, 2), (55, 2), (55, 2)), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, (30, 5, 1), None, [107, 10, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int32, np.int64, np.bool_] + for dtype in _TENSOR_DTYPES_: + for input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides in _TEST_CASES_: + input = (np.random.rand(*input_shape) * 100).astype(dtype) + index = get_test_index_tensor(input_shape, index_shape, output_shape, dim) + + output = np.zeros(output_shape, dtype=dtype) + + test_case = ScatterTestCase( + output=output, + output_shape=output_shape, + output_strides=output_strides, + input=input, + input_shape=input_shape, + input_strides=input_strides, + index=index, + index_shape=index_shape, + index_strides=index_strides, + dim=dim, + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/tril.py b/test/infiniop-test/test_generate/testcases/tril.py new file mode 100644 index 000000000..bfd606473 --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/tril.py @@ -0,0 +1,118 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from ..
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch +from enum import Enum, auto +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +def tril( + output: np.ndarray, + input: np.ndarray, + diagonal: int, + inplace: Inplace +): + torch_input = torch.from_numpy(input) + if inplace == Inplace.INPLACE: + torch_output = torch_input + else: + torch_output = torch.from_numpy(output) + + torch.tril(torch_input, diagonal, out=torch_output) + + return torch_output.detach().numpy() + + +class TrilTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + diagonal: int, + inplace: Inplace + ): + super().__init__("tril") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.diagonal = diagonal + self.inplace = inplace + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_int64(test_writer.gguf_key("diagonal"), self.diagonal) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + ans = tril( + self.output, + self.input, + self.diagonal, + self.inplace + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("tril.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + ((5, 6), 0), + ((4, 5), -1), + ((61, 71), 2), + ((111, 222), 33), + ((1001, 1001), -101) + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int64, np.int32, np.bool_] + for dtype in _TENSOR_DTYPES_: + for shape, diagonal in _TEST_CASES_: + for inplace in _INPLACE: + input = np.random.rand(*shape).astype(dtype) + if inplace == Inplace.INPLACE: + output = input + else: + output = np.empty(shape, dtype=dtype) + test_case = TrilTestCase( + output=output, + output_shape=shape, + output_strides=None, + input=input, + input_shape=shape, + input_strides=None, + diagonal=diagonal, + inplace=inplace + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/triu.py b/test/infiniop-test/test_generate/testcases/triu.py new file mode 100644 index 000000000..3c0b965a2 --- /dev/null +++ 
b/test/infiniop-test/test_generate/testcases/triu.py @@ -0,0 +1,120 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from numpy.lib.stride_tricks import as_strided + +from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +import torch +from enum import Enum, auto +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + +def triu( + output: np.ndarray, + input: np.ndarray, + diagonal: int, + inplace: Inplace +): + torch_input = torch.from_numpy(input) + if inplace == Inplace.INPLACE: + torch_output = torch_input + else: + torch_output = torch.from_numpy(output) + + torch.triu(torch_input, diagonal, out=torch_output) + + return torch_output.detach().numpy() + + +class TriuTestCase(InfiniopTestCase): + def __init__( + self, + output: np.ndarray, + output_shape: List[int], + output_strides: List[int], + input: np.ndarray, + input_shape: List[int], + input_strides: List[int], + diagonal: int, + inplace: Inplace + ): + super().__init__("triu") + self.output = output + self.output_shape = output_shape + self.output_strides = output_strides + self.input = input + self.input_shape = input_shape + self.input_strides = input_strides + self.diagonal = diagonal + self.inplace = inplace + + def write_test(self, test_writer: "InfiniopTestWriter"): + super().write_test(test_writer) + test_writer.add_array(test_writer.gguf_key("output.shape"), self.output_shape) + test_writer.add_array(test_writer.gguf_key("input.shape"), self.input_shape) + test_writer.add_int64(test_writer.gguf_key("diagonal"), self.diagonal) + if self.input_strides is not None: + test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.input_strides)) + test_writer.add_array( + test_writer.gguf_key("output.strides"), + gguf_strides(*(self.output_strides if self.output_strides is not None else contiguous_gguf_strides(self.output_shape))) + ) + test_writer.add_tensor( + test_writer.gguf_key("output"), self.output, raw_dtype=np_dtype_to_ggml(self.output.dtype) + ) + test_writer.add_tensor( + test_writer.gguf_key("input"), self.input, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) + + ans = triu( + self.output, + self.input, + self.diagonal, + self.inplace + ) + test_writer.add_tensor( + test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(self.input.dtype) + ) +if __name__ == "__main__": + test_writer = InfiniopTestWriter("triu.gguf") + test_cases = [] + # ============================================================================== + # Configuration (Internal Use Only) + # ============================================================================== + # These are not meant to be imported from other modules + _TEST_CASES_ = [ + # shape, diagonal + ((5, 6), 0), + ((4, 5), -1), + ((61, 71), 2), + ((111, 222), 33), + ((1001, 1001), -101) + ] + _TENSOR_DTYPES_ = [np.float32, np.float16, np.int64, np.int32, np.bool_] + for dtype in _TENSOR_DTYPES_: + for shape, diagonal in _TEST_CASES_: + for inplace in _INPLACE: + input = np.random.rand(*shape).astype(dtype) + if inplace == Inplace.INPLACE: + output = input + else: + output = np.empty(shape, dtype=dtype) + test_case = TriuTestCase( + output=output, + output_shape=shape, + output_strides=None, + input=input, + input_shape=shape, + input_strides=None, + diagonal=diagonal, + inplace=inplace + ) + test_cases.append(test_case) + test_writer.add_tests(test_cases) + 
test_writer.save() diff --git a/test/infiniop/gather.py new file mode 100644 index 000000000..d22a17811 --- /dev/null +++ b/test/infiniop/gather.py @@ -0,0 +1,161 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + # input_shape, output_shape, dim, input_strides, output_strides, index_strides + ((2, 3, 7), (2, 3, 5), 2, (177, 17, 1), None, None), + ((10, 5, 4), (10, 4, 4), 1, (30, 5, 1), None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), 0, None, (1007, 107, 10, 1), None), + ((11, 20, 20, 13, 37), (11, 20, 20, 13, 37), 1, None, None, None) +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_gather(output, input, dim, index): + torch.gather(input, dim, index, out=output) + +def test( + handle, + device, + input_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing Gather on {InfiniDeviceNames[device]} with input shape:{input_shape}, dim:{dim}, output_shape:{output_shape}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device + ) + torch_index = torch.randint(low=0, high=input_shape[dim], size=output_shape, dtype=torch.int64) + if index_strides: + torch_index = torch_index.as_strided(output_shape, index_strides) + index = TestTensor( + output_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + ) + + torch_gather(output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor()) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateGatherDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input, output, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetGatherWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_gather(): + check_error( + LIBINFINIOP.infiniopGather( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_gather() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt:
off + profile_operation("PyTorch", lambda: torch_gather( + output.torch_tensor(), input.torch_tensor(), dim, index.torch_tensor() + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_gather(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyGatherDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my Gather passed!\033[0m") diff --git a/test/infiniop/index_copy_inplace.py b/test/infiniop/index_copy_inplace.py new file mode 100644 index 000000000..97dbd8266 --- /dev/null +++ b/test/infiniop/index_copy_inplace.py @@ -0,0 +1,180 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + +_TEST_CASES = [ + # input_shape, output_shape, dim, output_strides, input_strides, + ([13, 1], [13, 4], 1, [37, 1], [37, 1], Inplace.OUT_OF_PLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.INPLACE), + ([1333, 4], [1333, 4], 0, [1, 1333], [1, 2333], Inplace.OUT_OF_PLACE), + ([133, 23, 53], [133, 23, 53], 1, None, None, Inplace.OUT_OF_PLACE), + ([133, 23, 13, 53], [133, 23, 13, 53], 2, None, None, Inplace.OUT_OF_PLACE), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_index_copy_inplace(output, input, index, dim): + output.index_copy_(dim, index, input.clone()) + + +def test( + handle, + device, + input_shape, output_shape, dim, output_strides, input_strides, + inplace, + dtype, + sync=None, +): + print( + f"Testing index_copy_inplace on {InfiniDeviceNames[device]} with shape:{input_shape}," + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + if inplace == Inplace.INPLACE: + assert output_shape == input_shape + output = input + else: + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + index_list = list(range(output_shape[dim])) + + random.shuffle(index_list) + torch_index = torch.tensor(index_list[:input_shape[dim]], dtype=torch.int64) + index = TestTensor( + [input_shape[dim]], + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_index_copy_inplace(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateIndexCopyInplaceDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the 
descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetIndexCopyInplaceWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_index_copy_inplace(): + check_error( + LIBINFINIOP.infiniopIndexCopyInplace( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_index_copy_inplace() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_index_copy_inplace( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_index_copy_inplace(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyIndexCopyInplaceDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my index_copy_inplace passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..a75300a76 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,209 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def index_copy_inplace_(lib): + lib.infiniopCreateIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopCreateIndexCopyInplaceDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetIndexCopyInplaceWorkspaceSize.restype = c_int32 + lib.infiniopGetIndexCopyInplaceWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopIndexCopyInplace.restype = c_int32 + lib.infiniopIndexCopyInplace.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyIndexCopyInplaceDescriptor.restype = c_int32 + lib.infiniopDestroyIndexCopyInplaceDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def gather_(lib): + lib.infiniopCreateGatherDescriptor.restype = c_int32 + lib.infiniopCreateGatherDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetGatherWorkspaceSize.restype = c_int32 + lib.infiniopGetGatherWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopGather.restype = c_int32 + lib.infiniopGather.argtypes = 
[ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyGatherDescriptor.restype = c_int32 + lib.infiniopDestroyGatherDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def scatter_(lib): + lib.infiniopCreateScatterDescriptor.restype = c_int32 + lib.infiniopCreateScatterDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + lib.infiniopGetScatterWorkspaceSize.restype = c_int32 + lib.infiniopGetScatterWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopScatter.restype = c_int32 + lib.infiniopScatter.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyScatterDescriptor.restype = c_int32 + lib.infiniopDestroyScatterDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def tril_(lib): + lib.infiniopCreateTrilDescriptor.restype = c_int32 + lib.infiniopCreateTrilDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGetTrilWorkspaceSize.restype = c_int32 + lib.infiniopGetTrilWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTril.restype = c_int32 + lib.infiniopTril.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTrilDescriptor.restype = c_int32 + lib.infiniopDestroyTrilDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def triu_(lib): + lib.infiniopCreateTriuDescriptor.restype = c_int32 + lib.infiniopCreateTriuDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_int32, + ] + lib.infiniopGetTriuWorkspaceSize.restype = c_int32 + lib.infiniopGetTriuWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopTriu.restype = c_int32 + lib.infiniopTriu.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyTriuDescriptor.restype = c_int32 + lib.infiniopDestroyTriuDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def linear_(lib): + lib.infiniopCreateLinearDescriptor.restype = c_int32 + lib.infiniopCreateLinearDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLinearWorkspaceSize.restype = c_int32 + lib.infiniopGetLinearWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLinear.restype = c_int32 + lib.infiniopLinear.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLinearDescriptor.restype = c_int32 + lib.infiniopDestroyLinearDescriptor.argtypes = [infiniopOperatorDescriptor_t] + +@OpRegister.operator +def linear_backward_(lib): + lib.infiniopCreateLinearBackwardDescriptor.restype = c_int32 + lib.infiniopCreateLinearBackwardDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + ] + lib.infiniopGetLinearBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetLinearBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + lib.infiniopLinearBackward.restype = c_int32 + lib.infiniopLinearBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + c_void_p, + ] + lib.infiniopDestroyLinearBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyLinearBackwardDescriptor.argtypes = [infiniopOperatorDescriptor_t] + diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index de397a69e..093e93ec2 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -122,6 +122,8 @@ def from_torch(torch_tensor, dt: InfiniDtype, device: InfiniDeviceEnum): def to_torch_dtype(dt: InfiniDtype, compatability_mode=False): + if dt == InfiniDtype.BOOL: # support torch.bool input dtype + return torch.bool if dt == InfiniDtype.I8: return torch.int8 elif dt == InfiniDtype.I16: @@ -185,6 +187,8 @@ def rearrange_tensor(tensor, new_strides): Given a PyTorch tensor and a list of new strides, return a new PyTorch tensor with the given strides. """ import torch + if list(tensor.stride()) == list(new_strides):# support self-define strides as the input of TestTensor + return tensor shape = tensor.shape diff --git a/test/infiniop/linear.py b/test/infiniop/linear.py new file mode 100644 index 000000000..120853572 --- /dev/null +++ b/test/infiniop/linear.py @@ -0,0 +1,169 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), +] + + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-1, "rtol": 1e-1}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_linear(x, w, b, bias): + return torch.nn.functional.linear(x, w, bias=(b if bias else None)) + + +def test( + handle, + device, + in_features, out_features, bias_exist, x_strides, y_strides, w_strides, + dtype, + sync=None, +): + y = TestTensor( + [out_features], + y_strides, + dtype, + device, + ) + + x = TestTensor( + [in_features], + x_strides, + dtype, + device, + ) + + w = TestTensor( + [out_features, in_features], + w_strides, + dtype, + device, + ) + + b = TestTensor( + [out_features], + None, + dtype, + device, + ) if bias_exist else None + + print( + f"Testing linear on {InfiniDeviceNames[device]} with in_features:{in_features}, out_features:{out_features}, bias:{bias_exist}," + f"dtype:{InfiniDtypeNames[dtype]}" + 
) + + y._torch_tensor = torch_linear(x.torch_tensor(), w.torch_tensor(), (b.torch_tensor() if bias_exist else None), bias_exist) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLinearDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + w.descriptor, + (b.descriptor if bias_exist else None), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [y, x, w] + ([b] if bias_exist else []): + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLinearWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, x.device) + + def lib_linear(): + check_error( + LIBINFINIOP.infiniopLinear( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + w.data(), + b.data() if bias_exist else None, + None, + ) + ) + + lib_linear() + + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_linear( + x.torch_tensor(), w.torch_tensor(), (b.torch_tensor() if bias_exist else None), bias_exist + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_linear(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLinearDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my linear passed!\033[0m") diff --git a/test/infiniop/linear_backward.py b/test/infiniop/linear_backward.py new file mode 100644 index 000000000..b9ece88ff --- /dev/null +++ b/test/infiniop/linear_backward.py @@ -0,0 +1,233 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES = [ + (50, 40, True, None, None, [1, 377]), + (50, 40, False, [10], [1], None), + (50, 40, True, [10], [1], None), + (333, 999, True, [1], [10], None), + (333, 999, False, [1], [10], None), + (1001, 505, True, None, None, [3001, 3]), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-3, "rtol": 1e-2}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_linear_backward( + grad_x: torch.Tensor, + grad_w: torch.Tensor, + grad_b: torch.Tensor, + grad_y: torch.Tensor, + x: torch.Tensor, + w: torch.Tensor, + b: torch.Tensor, + bias_exist:bool + ): + x.requires_grad_(True) + w.requires_grad_(True) + if bias_exist: + b.requires_grad_(True) + y = 
torch.nn.functional.linear(x, w, bias=(b if bias_exist else None)) + y.backward(grad_y) + grad_x.copy_(x.grad) + grad_w.copy_(w.grad) + if bias_exist: + grad_b.copy_(b.grad) + +def test( + handle, + device, + in_features, out_features, bias_exist, grad_x_strides, grad_y_strides, grad_w_strides, + dtype, + sync=None, +): + print( + f"Testing linear_backward on {InfiniDeviceNames[device]} with in_features:{in_features}, out_features:{out_features}, " + f"bias:{bias_exist}, " + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + grad_x = TestTensor( + [in_features], + grad_x_strides, + dtype, + device, + ) + + grad_w = TestTensor( + [out_features, in_features], + grad_w_strides, + dtype, + device, + ) + + grad_b = TestTensor( + [out_features], + None, + dtype, + device, + ) if bias_exist else None + + grad_y = TestTensor( + [out_features], + grad_y_strides, + dtype, + device, + ) + + x = TestTensor( + [in_features], + None, + dtype, + device, + ) + + w = TestTensor( + [out_features, in_features], + None, + dtype, + device, + ) + + b = TestTensor( + [out_features], + None, + dtype, + device, + ) if bias_exist else None + + torch_linear_backward( + grad_x.torch_tensor(), grad_w.torch_tensor(), + grad_b.torch_tensor() if bias_exist else None, + grad_y.torch_tensor(), x.torch_tensor(), w.torch_tensor(), + b.torch_tensor() if bias_exist else None, + bias_exist + ) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateLinearBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_x.descriptor, + grad_w.descriptor, + (grad_b.descriptor if bias_exist else None), + grad_y.descriptor, + x.descriptor, + w.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [grad_x, grad_w, grad_y, x, w] + ([grad_b, b] if bias_exist else []): + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetLinearBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_x.device) + + def lib_linear_backward(): + check_error( + LIBINFINIOP.infiniopLinearBackward( + descriptor, + workspace.data(), + workspace.size(), + grad_x.data(), + grad_w.data(), + grad_b.data() if bias_exist else None, + grad_y.data(), + x.data(), + w.data(), + None, + ) + ) + + lib_linear_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(grad_x.actual_tensor(), grad_x.torch_tensor(), atol=atol, rtol=rtol) + debug(grad_w.actual_tensor(), grad_w.torch_tensor(), atol=atol, rtol=rtol) + if bias_exist: + debug(grad_b.actual_tensor(), grad_b.torch_tensor(), atol=atol, rtol=rtol) + + assert torch.allclose(grad_x.actual_tensor(), grad_x.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(grad_w.actual_tensor(), grad_w.torch_tensor(), atol=atol, rtol=rtol) + if bias_exist: + assert torch.allclose(grad_b.actual_tensor(), grad_b.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_linear_backward( + grad_x.torch_tensor(), + grad_w.torch_tensor(), + grad_b.torch_tensor() if bias_exist else None, + grad_y.torch_tensor(), + x.torch_tensor(), + w.torch_tensor(), + b.torch_tensor() if bias_exist else None, + bias_exist + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_linear_backward(), device, NUM_PRERUN,
NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyLinearBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my linear_backward passed!\033[0m") diff --git a/test/infiniop/scatter.py b/test/infiniop/scatter.py new file mode 100644 index 000000000..86ccdcdeb --- /dev/null +++ b/test/infiniop/scatter.py @@ -0,0 +1,196 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +import random + +_TEST_CASES = [ + # input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides + ((6, 7), (6, 7), (6, 7), 1, (7, 1), (1, 7), None), + ((2, 3, 7), (2, 3, 5), (2, 3, 5), 2, (1, 2, 6), None, None), + ((10, 5, 4), (10, 4, 4), (10, 4, 4), 1, None, None, [16, 4, 1]), + ((11, 2, 2, 4), (11, 2, 2, 4), (11, 2, 2, 4), 0, None, [16, 8, 4, 1], None), +] + + +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_scatter(output: torch.Tensor, input, index, dim): + output.scatter_(dim, index, src=input) + + +def test( + handle, + device, + input_shape, index_shape, output_shape, dim, input_strides, output_strides, index_strides, + dtype, + sync=None, +): + print( + f"Testing scatter on {InfiniDeviceNames[device]} with input_shape:{input_shape}, index_shape:{index_shape}, output_shape:{output_shape}, dim:{dim}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + output = TestTensor( + output_shape, + output_strides, + dtype, + device, + "zeros", + ) + + input = TestTensor( + input_shape, + input_strides, + dtype, + device, + ) + + def get_test_index_tensor(input_shape, index_shape, output_shape, scatter_dim): + index = torch.empty(index_shape, dtype=torch.int64) + ndim = len(input_shape) + if ndim == 2 and scatter_dim == 1: + for i in range(input.shape[0]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 2: + for i in range(input.shape[0]): + for j in range(input.shape[1]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, j, :] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 3 and scatter_dim == 1: + for i in range(input.shape[0]): + for j in range(input.shape[2]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[i, :, j] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + elif ndim == 4 and scatter_dim == 0: + for i in range(input.shape[1]): + for j in range(input.shape[2]): + for k in range(input.shape[3]): + row = list(range(output_shape[dim])) + random.shuffle(row) + index[:, i, j, k] = torch.tensor(row[:index_shape[dim]]).type(torch.float64) + return index + + torch_index = 
get_test_index_tensor(input_shape, index_shape, output_shape, dim).type(torch.int64) + if index_strides: + torch_index = torch_index.as_strided(index_shape, index_strides) + index = TestTensor( + index_shape, + torch_index.stride(), + InfiniDtype.I64, + device, + "manual", + set_tensor=torch_index + ) + + torch_scatter(output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateScatterDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + index.descriptor, + dim, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input, index]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetScatterWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_scatter(): + check_error( + LIBINFINIOP.infiniopScatter( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + index.data(), + None, + ) + ) + + lib_scatter() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + # print('input:\n', input.torch_tensor()) + # print('index:\n', index.torch_tensor()) + # print('output:\n', output.torch_tensor(), '\n', output.actual_tensor(), ) + + + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_scatter( + output.torch_tensor(), input.torch_tensor(), index.torch_tensor(), dim + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_scatter(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyScatterDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my scatter passed!\033[0m") diff --git a/test/infiniop/tril.py b/test/infiniop/tril.py new file mode 100644 index 000000000..d4c40eff7 --- /dev/null +++ b/test/infiniop/tril.py @@ -0,0 +1,188 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + ((5, 6), 0), + ((4, 5), -1), + ((5, 2), 3), + ((89, 80), -20), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I64, InfiniDtype.I32] 
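+# tril only copies or zeroes input elements, so integer dtypes can be tested as well and the library output is compared bit-exactly against the PyTorch reference (zero tolerances below).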
+ +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, + +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_tril(output, input, diagonal): + torch.tril(input, diagonal, out=output) + + + +def test( + handle, + device, + input_shape, diagonal, + inplace, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing tril on {InfiniDeviceNames[device]} with shape:{input_shape}, diagonal:{diagonal}, " + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_input = (torch.rand(size=input_shape) * 100 - 50).type(torch_dtype) + input = TestTensor( + input_shape, + torch_input.stride(), + dtype, + device, + "manual", + set_tensor=torch_input + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + input_shape, + None, + dtype, + device, + "zeros" + ) + + + torch_tril(output.torch_tensor(), input.torch_tensor(), diagonal) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTrilDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + diagonal, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTrilWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_tril(): + check_error( + LIBINFINIOP.infiniopTril( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_tril() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_tril( + output.torch_tensor(), input.torch_tensor(), diagonal + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_tril(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTrilDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my tril passed!\033[0m") diff --git a/test/infiniop/triu.py b/test/infiniop/triu.py new file mode 100644 index 000000000..5c3ec63d1 --- /dev/null +++ b/test/infiniop/triu.py @@ -0,0 +1,186 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + 
infiniopOperatorDescriptor_t, +) +from enum import Enum, auto + +_TEST_CASES_ = [ + ((5, 6), 0), + ((4, 5), -1), + ((5, 2), 3), + ((89, 80), -20), +] + + +class Inplace(Enum): + OUT_OF_PLACE = auto() + INPLACE = auto() + + +# Inplace options applied for each test case in _TEST_CASES_ +_INPLACE = [ + Inplace.OUT_OF_PLACE, + Inplace.INPLACE, +] + + +# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_ +_TEST_CASES = [ + test_case + (inplace_item,) + for test_case in _TEST_CASES_ + for inplace_item in _INPLACE +] + + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.I64, InfiniDtype.I32] + +# Tolerance map for different data types + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 0, "rtol": 0}, + InfiniDtype.F32: {"atol": 0, "rtol": 0}, + InfiniDtype.BF16: {"atol": 0, "rtol": 0}, + InfiniDtype.I32: {"atol": 0, "rtol": 0}, + InfiniDtype.I64: {"atol": 0, "rtol": 0}, +} + + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def torch_triu(output, input, diagonal): + torch.triu(input, diagonal, out=output) + + +def test( + handle, + device, + input_shape, diagonal, + inplace, + dtype, + sync=None, +): + torch_dtype = { + InfiniDtype.F16: torch.half, + InfiniDtype.F32: torch.float, + InfiniDtype.BF16: torch.bfloat16, + InfiniDtype.I32: torch.int32, + InfiniDtype.I64: torch.int64 + }[dtype] + + print( + f"Testing triu on {InfiniDeviceNames[device]} with shape:{input_shape}, diagonal:{diagonal}, " + f"inplace:{inplace}," + f"dtype:{InfiniDtypeNames[dtype]}" + ) + + torch_input = (torch.rand(size=input_shape) * 100 - 50).type(torch_dtype) + input = TestTensor( + input_shape, + torch_input.stride(), + dtype, + device, + "manual", + set_tensor=torch_input + ) + if inplace == Inplace.INPLACE: + output = input + else: + output = TestTensor( + input_shape, + None, + dtype, + device, + "zeros" + ) + + torch_triu(output.torch_tensor(), input.torch_tensor(), diagonal) + + if sync is not None: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateTriuDescriptor( + handle, + ctypes.byref(descriptor), + output.descriptor, + input.descriptor, + diagonal, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [output, input]: + tensor.destroy_desc() + + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetTriuWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output.device) + + def lib_triu(): + check_error( + LIBINFINIOP.infiniopTriu( + descriptor, + workspace.data(), + workspace.size(), + output.data(), + input.data(), + None, + ) + ) + + lib_triu() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(output.actual_tensor(), output.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling workflow + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_triu( + output.torch_tensor(), input.torch_tensor(), diagonal + ), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_triu(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + check_error(LIBINFINIOP.infiniopDestroyTriuDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = 
args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest my triu passed!\033[0m")
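
Every test added in this patch drives its operator through the same descriptor lifecycle against the ctypes bindings registered in op_register.py: create the descriptor from tensor descriptors, query the workspace size, launch the kernel, and destroy the descriptor. The sketch below condenses that flow for the tril binding as an illustration only; it reuses the helpers imported by the tests above (LIBINFINIOP, TestTensor, TestWorkspace, check_error, infiniopOperatorDescriptor_t), and assumes `handle` and `device` are supplied by the same harness (get_test_devices / test_operator) that the tests use.

```python
import ctypes
from ctypes import c_uint64

from libinfiniop import (
    LIBINFINIOP,
    TestTensor,
    TestWorkspace,
    check_error,
    infiniopOperatorDescriptor_t,
    InfiniDtype,
)


def run_tril(handle, device, shape=(5, 6), diagonal=0, dtype=InfiniDtype.F32):
    """Minimal descriptor lifecycle for the new tril binding (illustrative sketch)."""
    # Input tensor plus an all-zero output, mirroring the tests above.
    inp = TestTensor(list(shape), None, dtype, device)
    out = TestTensor(list(shape), None, dtype, device, "zeros")

    # 1. Create the operator descriptor from the tensor descriptors.
    desc = infiniopOperatorDescriptor_t()
    check_error(
        LIBINFINIOP.infiniopCreateTrilDescriptor(
            handle, ctypes.byref(desc), out.descriptor, inp.descriptor, diagonal
        )
    )

    # 2. Query and allocate whatever workspace the kernel reports it needs.
    size = c_uint64(0)
    check_error(LIBINFINIOP.infiniopGetTrilWorkspaceSize(desc, ctypes.byref(size)))
    workspace = TestWorkspace(size.value, inp.device)

    # 3. Launch: workspace, workspace size, output, input, stream (None = default stream).
    check_error(
        LIBINFINIOP.infiniopTril(
            desc, workspace.data(), workspace.size(), out.data(), inp.data(), None
        )
    )

    # 4. Release the descriptor once the call has completed.
    check_error(LIBINFINIOP.infiniopDestroyTrilDescriptor(desc))
    return out.actual_tensor()
```

The other operators added here (gather, scatter, index_copy_inplace, triu, linear, linear_backward) follow the same four steps; only the descriptor-creation arguments and the launch argument list differ, exactly as declared in their op_register.py entries.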