From cf4b44f967fa465690c68397496d945ba41a4c8b Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Thu, 11 Apr 2024 09:44:56 +0000
Subject: [PATCH 1/7] support op dispatch

---
 csrc/extensions.cpp | 54 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index bab6b7bc..1425b6c2 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -27,6 +27,7 @@
 #include "diopi_helper.h"
 #include "pybind_type_cast.h"
+#include "torch/library.h"
 
 namespace dipu::dipu_ext {
 
@@ -363,4 +364,57 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   }
 }
 
+at::Tensor& apply_penalty(at::Tensor& logits,
+                          const at::Tensor& presence_penalty,
+                          const at::Tensor& frequency_penalty,
+                          const at::Tensor& p_token_ids,
+                          const at::Tensor& p_token_counts,
+                          const at::Tensor& p_cumsum_seq_len,
+                          int64_t p_max_len_in_batch) {
+  callDiopi(diopiApplyPenalty, logits, presence_penalty, frequency_penalty,
+            p_token_ids, p_token_counts, p_cumsum_seq_len, p_max_len_in_batch);
+  return logits;
+}
+
+at::Tensor& dest_index_copy_kv(const at::Tensor& k, const at::Tensor& dest_loc,
+                               at::Tensor& out) {
+  callDiopi(diopiDestIndexCopyKV, out, k, dest_loc);
+  return out;
+}
+
+TORCH_LIBRARY(ops, m) {
+  //m.def("adamw(Tensor(a!) input, Tensor(b!) grad, Tensor(c!) exp_avg, Tensor(d!) exp_avg_sq, Tensor(e!) max_exp_avg_sq, float lr, float beta1, float beta2, float eps, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))");
+  m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)");
+  m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)");
+}
+
+// impl for dipu
+TORCH_LIBRARY_IMPL(ops, XPU, m) {
+  if (reinterpret_cast<void*>(diopiApplyPenalty) != nullptr) {
+    m.impl("apply_penalty", apply_penalty);
+  }
+  if (reinterpret_cast<void*>(diopiDestIndexCopyKV) != nullptr) {
+    m.impl("dest_index_copy_kv", dest_index_copy_kv);
+  }
+}
+
+// impl for torch
+TORCH_LIBRARY_IMPL(ops, CUDA, m) {
+  if (reinterpret_cast<void*>(diopiApplyPenalty) != nullptr) {
+    m.impl("apply_penalty", apply_penalty);
+  }
+  if (reinterpret_cast<void*>(diopiDestIndexCopyKV) != nullptr) {
+    m.impl("dest_index_copy_kv", dest_index_copy_kv);
+  }
+}
+
+// impl for torch_npu
+TORCH_LIBRARY_IMPL(ops, PrivateUse1, m) {
+  if (reinterpret_cast<void*>(diopiApplyPenalty) != nullptr) {
+    m.impl("apply_penalty", apply_penalty);
+  }
+  if (reinterpret_cast<void*>(diopiDestIndexCopyKV) != nullptr) {
+    m.impl("dest_index_copy_kv", dest_index_copy_kv);
+  }
+}
+
 } // namespace dipu::dipu_ext
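[Note] The reinterpret_cast<void*>(diopiXxx) != nullptr guards in this patch only make sense if the DIOPI entry points are declared as weak symbols. A minimal sketch, assuming diopi_helper.h (or the DIOPI headers) declares them roughly like this; the signature shown is illustrative, not the real one:

    extern "C" {
    // Weak declaration: the symbol's address is null at runtime when the
    // vendor DIOPI library does not export an implementation.
    __attribute__((weak)) diopiError_t diopiApplyPenalty(
        diopiContextHandle_t ctx, diopiTensorHandle_t logits /* , ... */);
    }

    // If &diopiApplyPenalty is nullptr, the TORCH_LIBRARY_IMPL blocks skip
    // m.impl() and the Python layer can fall back to a reference implementation.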
From d0c203a9627062a2e6c29af4933f0b95c13c3901 Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Mon, 15 Apr 2024 17:09:28 +0800
Subject: [PATCH 2/7] support op dispatch

---
 csrc/extensions.cpp | 48 +++++++++++++++++----------------------------
 test_dispatch.py    | 24 +++++++++++++++++++++++
 2 files changed, 42 insertions(+), 30 deletions(-)
 create mode 100644 test_dispatch.py

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index 1425b6c2..91007e6d 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -292,7 +292,7 @@ void extApplyPenalty(at::Tensor& logits, const at::Tensor& presence_penalty,
   callDiopi(diopiApplyPenalty, logits, presence_penalty, frequency_penalty,
             p_token_ids, p_token_counts, p_cumsum_seq_len, p_max_len_in_batch);
 }
-
+#if 0
 // Check whether a corresponding diopi implementation exists:
 // if so, bind it directly via pybind;
 // otherwise do not register it and leave it to the Python layer.
@@ -363,6 +363,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "deeplink ext_scaled_masked_softmax_bwd");
   }
 }
+#endif
 
 at::Tensor& apply_penalty(at::Tensor& logits, const at::Tensor& presence_penalty,
                           const at::Tensor& frequency_penalty,
                           const at::Tensor& p_token_ids,
                           const at::Tensor& p_token_counts,
                           const at::Tensor& p_cumsum_seq_len,
                           int64_t p_max_len_in_batch) {
   callDiopi(diopiApplyPenalty, logits, presence_penalty, frequency_penalty,
             p_token_ids, p_token_counts, p_cumsum_seq_len, p_max_len_in_batch);
   return logits;
 }
 
 at::Tensor& dest_index_copy_kv(const at::Tensor& k, const at::Tensor& dest_loc,
                                at::Tensor& out) {
   callDiopi(diopiDestIndexCopyKV, out, k, dest_loc);
   return out;
 }
 
-TORCH_LIBRARY(ops, m) {
-  //m.def("adamw(Tensor(a!) input, Tensor(b!) grad, Tensor(c!) exp_avg, Tensor(d!) exp_avg_sq, Tensor(e!) max_exp_avg_sq, float lr, float beta1, float beta2, float eps, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))");
-  m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)");
-  m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)");
+at::Tensor& example_for_all_backend(at::Tensor& inout) {
+  std::cout << __FUNCTION__ << ": "<< inout.options() << std::endl;
+  return inout;
 }
 
-// impl for dipu
-TORCH_LIBRARY_IMPL(ops, XPU, m) {
-  if (reinterpret_cast<void*>(diopiApplyPenalty) != nullptr) {
-    m.impl("apply_penalty", apply_penalty);
-  }
-  if (reinterpret_cast<void*>(diopiDestIndexCopyKV) != nullptr) {
-    m.impl("dest_index_copy_kv", dest_index_copy_kv);
-  }
+at::Tensor& example_only_for_xpu(at::Tensor& inout) {
+  std::cout << __FUNCTION__ << ": " << inout.options() << std::endl;
+  return inout;
 }
 
-// impl for torch
-TORCH_LIBRARY_IMPL(ops, CUDA, m) {
-  if (reinterpret_cast<void*>(diopiApplyPenalty) != nullptr) {
-    m.impl("apply_penalty", apply_penalty);
-  }
-  if (reinterpret_cast<void*>(diopiDestIndexCopyKV) != nullptr) {
-    m.impl("dest_index_copy_kv", dest_index_copy_kv);
-  }
-}
-
-// impl for torch_npu
-TORCH_LIBRARY_IMPL(ops, PrivateUse1, m) {
-  if (reinterpret_cast<void*>(diopiApplyPenalty) != nullptr) {
-    m.impl("apply_penalty", apply_penalty);
-  }
-  if (reinterpret_cast<void*>(diopiDestIndexCopyKV) != nullptr) {
-    m.impl("dest_index_copy_kv", dest_index_copy_kv);
-  }
+// By default, an operator defined here is registered for all backends (XPU, AutocastXPU, AutogradXPU, CUDA, PrivateUse1, AutogradPrivateUse1, etc.). Registering an implementation for a single backend only is also supported.
+TORCH_LIBRARY(deeplink_ext_, m) {
+  m.def("adamw(Tensor(a!) input, Tensor(b!) grad, Tensor(c!) exp_avg, Tensor(d!) exp_avg_sq, Tensor(e!) max_exp_avg_sq, float lr, float beta1, float beta2, float eps, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))");
+  m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)");
+  m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)");
+  m.def("example(Tensor(a!) inout)->Tensor(a!)", example_for_all_backend);
 }
 
+// only impl for dipu
+TORCH_LIBRARY_IMPL(deeplink_ext_, XPU, m) {
+  // m.impl("example", example_only_for_xpu);
+}
+
-} // namespace dipu::dipu_ext
+} // namespace dipu::dipu_ext
\ No newline at end of file
diff --git a/test_dispatch.py b/test_dispatch.py
new file mode 100644
index 00000000..44a1e48c
--- /dev/null
+++ b/test_dispatch.py
@@ -0,0 +1,24 @@
+import torch
+import torch_dipu
+import deeplink_ext
+torch.ops.load_library(deeplink_ext.__path__[0] + "/cpp_extensions.cpython-39-x86_64-linux-gnu.so")
+print(f"torch.ops.loaded_libraries:{torch.ops.loaded_libraries}")
+
+#print(torch.ops.deeplink_ext_.dest_index_copy_kv)
+
+def code_to_profile():
+    x = torch.randn(3,4)
+    y = torch.ops.deeplink_ext_.example(x)
+    y = torch.ops.deeplink_ext_.example(x.cuda())
+
+
+with torch.profiler.profile(
+    activities=[
+        torch.profiler.ProfilerActivity.CPU,
+        torch.profiler.ProfilerActivity.CUDA,
+    ]
+) as p:
+    code_to_profile()
+print(p.key_averages().table(
+    sort_by="self_cuda_time_total", row_limit=-1))
+

From 94b26005e41f4555fc3e1e1b6d570e324fe792f7 Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Mon, 15 Apr 2024 18:26:14 +0800
Subject: [PATCH 3/7] add some op

---
 csrc/extensions.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index 91007e6d..4d4dd0d6 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -292,7 +292,7 @@ void extApplyPenalty(at::Tensor& logits, const at::Tensor& presence_penalty,
   callDiopi(diopiApplyPenalty, logits, presence_penalty, frequency_penalty,
             p_token_ids, p_token_counts, p_cumsum_seq_len, p_max_len_in_batch);
 }
-#if 0
+
 // Check whether a corresponding diopi implementation exists:
 // if so, bind it directly via pybind;
 // otherwise do not register it and leave it to the Python layer.
@@ -363,7 +363,18 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "deeplink ext_scaled_masked_softmax_bwd");
   }
 }
-#endif
+
+std::tuple<at::Tensor&, at::Tensor&, at::Tensor&> adamw(at::Tensor& param, at::Tensor& exp_avg, at::Tensor& exp_avg_sq,
+                const c10::optional<at::Tensor>& max_exp_avg_sq_opt, const at::Tensor& grad,
+                double lr, double beta1, double beta2, double epsilon,
+                double weight_decay, int64_t step, bool amsgrad) {
+  // the diopiAdamW func has no "maximize" param
+  at::Tensor& grad_ref = const_cast<at::Tensor&>(grad);  // TODO: grad is a const value
+  at::Tensor max_exp_avg_sq_opt_value = max_exp_avg_sq_opt.value_or(at::Tensor());
+  callDiopi(diopiAdamW, param, grad_ref, exp_avg, exp_avg_sq, max_exp_avg_sq_opt_value,
+            lr, beta1, beta2, epsilon, weight_decay, step, amsgrad);
+  return std::tie(param, exp_avg, exp_avg_sq);
+}
 
 at::Tensor& apply_penalty(at::Tensor& logits, const at::Tensor& presence_penalty,
                           const at::Tensor& frequency_penalty,
@@ -394,9 +405,9 @@ at::Tensor& example_only_for_xpu(at::Tensor& inout) {
   return inout;
 }
 
 // By default, an operator defined here is registered for all backends (XPU, AutocastXPU, AutogradXPU, CUDA, PrivateUse1, AutogradPrivateUse1, etc.). Registering an implementation for a single backend only is also supported.
 TORCH_LIBRARY(deeplink_ext_, m) {
-  m.def("adamw(Tensor(a!) input, Tensor(b!) grad, Tensor(c!) exp_avg, Tensor(d!) exp_avg_sq, Tensor(e!) max_exp_avg_sq, float lr, float beta1, float beta2, float eps, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!))");
-  m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)");
-  m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)");
+  m.def("adamw(Tensor(a!) param, Tensor(b!) exp_avg, Tensor(c!) exp_avg_sq, Tensor? max_exp_avg_sq_opt, Tensor grad, float lr, float beta1, float beta2, float epsilon, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!))", adamw);
+  m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)", apply_penalty);
+  m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)", dest_index_copy_kv);
   m.def("example(Tensor(a!) inout)->Tensor(a!)", example_for_all_backend);
 }
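[Note] For reference, the mapping between the schema strings registered above and the C++ signatures they bind to, following the standard TorchScript schema conventions:

    // Tensor(a!) -> at::Tensor&                       alias set "a", mutated in
    //                                                 place, returned as an
    //                                                 aliased output
    // Tensor?    -> const c10::optional<at::Tensor>&  may be None from Python
    // Tensor     -> const at::Tensor&                 read-only input
    // float      -> double
    // int        -> int64_t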
From a1b1db3f79f90df790259d3ac5a49790f19affb1 Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Tue, 16 Apr 2024 10:38:51 +0800
Subject: [PATCH 4/7] add rms_norm op

---
 csrc/extensions.cpp | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index 4d4dd0d6..cc3bd18a 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -2,6 +2,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -393,13 +394,23 @@ at::Tensor& dest_index_copy_kv(const at::Tensor& k, const at::Tensor& dest_loc,
   return out;
 }
 
+std::tuple<at::Tensor&, at::Tensor&> rms_norm(at::Tensor& output, at::Tensor& inv_rms,
+                                              const at::Tensor& input,
+                                              const OptionalIntArray& normalized_shape,
+                                              const at::Tensor& weight,
+                                              const c10::optional<at::Tensor>& bias_opt, double eps) {
+  callDiopi(diopiRMSNorm, output, inv_rms, input, normalized_shape, weight,
+            bias_opt, eps);
+  return std::tie(output, inv_rms);
+}
+
 at::Tensor& example_for_all_backend(at::Tensor& inout) {
-  std::cout << __FUNCTION__ << ": "<< inout.options() << std::endl;
+  std::cout << __FUNCTION__ << ": "<< inout.options() << "\n";
   return inout;
 }
 
 at::Tensor& example_only_for_xpu(at::Tensor& inout) {
-  std::cout << __FUNCTION__ << ": " << inout.options() << std::endl;
+  std::cout << __FUNCTION__ << ": " << inout.options() << "\n";
   return inout;
 }
 
@@ -408,6 +419,8 @@ TORCH_LIBRARY(deeplink_ext_, m) {
   m.def("adamw(Tensor(a!) param, Tensor(b!) exp_avg, Tensor(c!) exp_avg_sq, Tensor? max_exp_avg_sq_opt, Tensor grad, float lr, float beta1, float beta2, float epsilon, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!))", adamw);
   m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)", apply_penalty);
   m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)", dest_index_copy_kv);
+  m.def("rms_norm(Tensor(a!) output, Tensor(b!) inv_rms, Tensor input, int[]? normalized_shape, Tensor weight, Tensor? bias_opt, float eps) -> (Tensor(a!), Tensor(b!))", rms_norm);
+
   m.def("example(Tensor(a!) inout)->Tensor(a!)", example_for_all_backend);
 }
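[Note] The quantities in the rms_norm schema follow the usual RMSNorm convention. A sketch of the assumed semantics (the authoritative definition is diopiRMSNorm's):

    // Over the trailing normalized_shape dimensions of input x:
    //   inv_rms = 1 / sqrt(mean(x^2) + eps)
    //   output  = x * inv_rms * weight        (+ bias, when bias_opt is set)
    // inv_rms is returned alongside output so the backward pass can reuse it.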
From 30dc807f80eac2808b19f03e232cb98bdb95e1b1 Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Tue, 16 Apr 2024 10:40:44 +0800
Subject: [PATCH 5/7] auto format

---
 csrc/extensions.cpp | 83 +++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 30 deletions(-)

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index cc3bd18a..fa5a96ed 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -1,11 +1,12 @@
 // Copyright (c) 2023, DeepLink.
 #include
-#include
 #include
+#include
 #include
 #include
+#include "torch/library.h"
 #include
 #include
 #include
@@ -28,7 +29,6 @@
 #include "diopi_helper.h"
 #include "pybind_type_cast.h"
-#include "torch/library.h"
 
 namespace dipu::dipu_ext {
@@ -365,47 +365,51 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   }
 }
 
-std::tuple<at::Tensor&, at::Tensor&, at::Tensor&> adamw(at::Tensor& param, at::Tensor& exp_avg, at::Tensor& exp_avg_sq,
-                const c10::optional<at::Tensor>& max_exp_avg_sq_opt, const at::Tensor& grad,
-                double lr, double beta1, double beta2, double epsilon,
-                double weight_decay, int64_t step, bool amsgrad) {
+std::tuple<at::Tensor&, at::Tensor&, at::Tensor&> adamw(
+    at::Tensor& param, at::Tensor& exp_avg, at::Tensor& exp_avg_sq,
+    const c10::optional<at::Tensor>& max_exp_avg_sq_opt, const at::Tensor& grad,
+    double lr, double beta1, double beta2, double epsilon, double weight_decay,
+    int64_t step, bool amsgrad) {
   // the diopiAdamW func has no "maximize" param
-  at::Tensor& grad_ref = const_cast<at::Tensor&>(grad);  // TODO: grad is a const value
-  at::Tensor max_exp_avg_sq_opt_value = max_exp_avg_sq_opt.value_or(at::Tensor());
-  callDiopi(diopiAdamW, param, grad_ref, exp_avg, exp_avg_sq, max_exp_avg_sq_opt_value,
-            lr, beta1, beta2, epsilon, weight_decay, step, amsgrad);
+  at::Tensor& grad_ref =
+      const_cast<at::Tensor&>(grad);  // TODO: grad is a const value
+  at::Tensor max_exp_avg_sq_opt_value =
+      max_exp_avg_sq_opt.value_or(at::Tensor());
+  callDiopi(diopiAdamW, param, grad_ref, exp_avg, exp_avg_sq,
+            max_exp_avg_sq_opt_value, lr, beta1, beta2, epsilon, weight_decay,
+            step, amsgrad);
   return std::tie(param, exp_avg, exp_avg_sq);
 }
 
-at::Tensor& apply_penalty(at::Tensor& logits, const at::Tensor& presence_penalty,
-                          const at::Tensor& frequency_penalty,
-                          const at::Tensor& p_token_ids,
-                          const at::Tensor& p_token_counts,
-                          const at::Tensor& p_cumsum_seq_len,
-                          int64_t p_max_len_in_batch) {
+at::Tensor& apply_penalty(at::Tensor& logits,
+                          const at::Tensor& presence_penalty,
+                          const at::Tensor& frequency_penalty,
+                          const at::Tensor& p_token_ids,
+                          const at::Tensor& p_token_counts,
+                          const at::Tensor& p_cumsum_seq_len,
+                          int64_t p_max_len_in_batch) {
   callDiopi(diopiApplyPenalty, logits, presence_penalty, frequency_penalty,
             p_token_ids, p_token_counts, p_cumsum_seq_len, p_max_len_in_batch);
   return logits;
 }
 
 at::Tensor& dest_index_copy_kv(const at::Tensor& k, const at::Tensor& dest_loc,
-                              at::Tensor& out) {
+                               at::Tensor& out) {
   callDiopi(diopiDestIndexCopyKV, out, k, dest_loc);
   return out;
 }
 
-std::tuple<at::Tensor&, at::Tensor&> rms_norm(at::Tensor& output, at::Tensor& inv_rms,
-                                              const at::Tensor& input,
-                                              const OptionalIntArray& normalized_shape,
-                                              const at::Tensor& weight,
-                                              const c10::optional<at::Tensor>& bias_opt, double eps) {
+std::tuple<at::Tensor&, at::Tensor&> rms_norm(
+    at::Tensor& output, at::Tensor& inv_rms, const at::Tensor& input,
+    const OptionalIntArray& normalized_shape, const at::Tensor& weight,
+    const c10::optional<at::Tensor>& bias_opt, double eps) {
   callDiopi(diopiRMSNorm, output, inv_rms, input, normalized_shape, weight,
-            bias_opt, eps);
+            bias_opt, eps);
   return std::tie(output, inv_rms);
 }
 at::Tensor& example_for_all_backend(at::Tensor& inout) {
-  std::cout << __FUNCTION__ << ": "<< inout.options() << "\n";
+  std::cout << __FUNCTION__ << ": " << inout.options() << "\n";
   return inout;
 }
 
@@ -414,12 +418,31 @@ at::Tensor& example_only_for_xpu(at::Tensor& inout) {
   return inout;
 }
 
-// By default, an operator defined here is registered for all backends (XPU, AutocastXPU, AutogradXPU, CUDA, PrivateUse1, AutogradPrivateUse1, etc.). Registering an implementation for a single backend only is also supported.
+// By default, an operator defined here is registered for all backends (XPU,
+// AutocastXPU, AutogradXPU, CUDA, PrivateUse1, AutogradPrivateUse1, etc.).
+// Registering an implementation for a single backend only is also supported.
 TORCH_LIBRARY(deeplink_ext_, m) {
-  m.def("adamw(Tensor(a!) param, Tensor(b!) exp_avg, Tensor(c!) exp_avg_sq, Tensor? max_exp_avg_sq_opt, Tensor grad, float lr, float beta1, float beta2, float epsilon, float weight_decay, int step, bool amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!))", adamw);
-  m.def("apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)", apply_penalty);
-  m.def("dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor dest_loc)->Tensor(a!)", dest_index_copy_kv);
-  m.def("rms_norm(Tensor(a!) output, Tensor(b!) inv_rms, Tensor input, int[]? normalized_shape, Tensor weight, Tensor? bias_opt, float eps) -> (Tensor(a!), Tensor(b!))", rms_norm);
+  m.def(
+      "adamw(Tensor(a!) param, Tensor(b!) exp_avg, Tensor(c!) exp_avg_sq, "
+      "Tensor? max_exp_avg_sq_opt, Tensor grad, float lr, float beta1, float "
+      "beta2, float epsilon, float weight_decay, int step, bool "
+      "amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!))",
+      adamw);
+  m.def(
+      "apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor "
+      "frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor "
+      "p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)",
+      apply_penalty);
+  m.def(
+      "dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor "
+      "dest_loc)->Tensor(a!)",
+      dest_index_copy_kv);
+  m.def(
+      "rms_norm(Tensor(a!) output, Tensor(b!) inv_rms, Tensor input, int[]? "
+      "normalized_shape, Tensor weight, Tensor? bias_opt, float eps) -> "
+      "(Tensor(a!), Tensor(b!))",
+      rms_norm);
   m.def("example(Tensor(a!) inout)->Tensor(a!)", example_for_all_backend);
 }
 
@@ -429,4 +452,4 @@ TORCH_LIBRARY_IMPL(deeplink_ext_, XPU, m) {
   // m.impl("example", example_only_for_xpu);
 }
 
-} // namespace dipu::dipu_ext
\ No newline at end of file
+} // namespace dipu::dipu_ext
From 2e4b897c3164132dbe01284a2a4281e6f9565f75 Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Tue, 16 Apr 2024 13:54:06 +0800
Subject: [PATCH 6/7] add apply_rotary

---
 csrc/extensions.cpp | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index fa5a96ed..ca0f561b 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -408,6 +408,25 @@ std::tuple<at::Tensor&, at::Tensor&> rms_norm(
   return std::tie(output, inv_rms);
 }
 
+std::tuple<at::Tensor&, at::Tensor&, at::Tensor&> rms_norm_backward(
+    at::Tensor& grad_input, at::Tensor& grad_weight, at::Tensor& grad_bias_opt,
+    const at::Tensor& grad_output, const at::Tensor& input,
+    const at::Tensor& weight, const c10::optional<at::Tensor>& bias_opt,
+    const at::Tensor& inv_rms, const OptionalIntArray& normalized_shape,
+    double eps) {
+  callDiopi(diopiRMSNormBackward, grad_input, grad_weight, grad_bias_opt,
+            grad_output, input, weight, bias_opt, inv_rms, normalized_shape,
+            eps);
+  return std::tie(grad_input, grad_weight, grad_bias_opt);
+}
+
+at::Tensor& apply_rotary(at::Tensor& output, const at::Tensor& input,
+                         const at::Tensor& cos, const at::Tensor& sin,
+                         const bool conj, const bool interleaved) {
+  callDiopi(diopiRotaryEmbedding, output, input, cos, sin, conj, interleaved);
+  return output;
+}
+
 at::Tensor& example_for_all_backend(at::Tensor& inout) {
   std::cout << __FUNCTION__ << ": " << inout.options() << "\n";
   return inout;
@@ -444,6 +463,18 @@ TORCH_LIBRARY(deeplink_ext_, m) {
       "(Tensor(a!), Tensor(b!))",
       rms_norm);
 
+  m.def(
+      "rms_norm_backward(Tensor(a!) grad_input, Tensor(b!) grad_weight, "
+      "Tensor(c!) grad_bias_opt, Tensor grad_output, Tensor input, Tensor "
+      "weight, Tensor? bias_opt, Tensor inv_rms, int[]? normalized_shape, "
+      "float eps) -> (Tensor(a!), Tensor(b!), Tensor(c!))",
+      rms_norm_backward);
+
+  m.def(
+      "apply_rotary(Tensor(a!) output, Tensor input, Tensor cos, Tensor sin, "
+      "bool conj, bool interleaved) -> Tensor(a!)",
+      apply_rotary);
+
   m.def("example(Tensor(a!) inout)->Tensor(a!)", example_for_all_backend);
 }
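[Note] A sketch of the rotary-embedding convention apply_rotary presumably implements. This is an assumption based on the standard formulation; the exact semantics are defined by diopiRotaryEmbedding:

    // For each pair (x1, x2) drawn from the last dimension (two halves, or
    // adjacent elements when interleaved is true):
    //   out1 = x1 * cos - x2 * sin
    //   out2 = x1 * sin + x2 * cos
    // conj flips the sign of sin, i.e. applies the inverse rotation, which is
    // what the backward pass needs.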
From bd1756534524aa85c17dd719da531a67f8ba7846 Mon Sep 17 00:00:00 2001
From: zhaoguochun1995
Date: Tue, 16 Apr 2024 17:13:21 +0800
Subject: [PATCH 7/7] backup

---
 csrc/extensions.cpp | 80 +++++++++++++++++++++++++++------------------
 setup.py            | 16 ++++++++-
 test_dispatch.py    |  4 ++-
 3 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/csrc/extensions.cpp b/csrc/extensions.cpp
index ca0f561b..5aa6abf3 100644
--- a/csrc/extensions.cpp
+++ b/csrc/extensions.cpp
@@ -442,38 +442,49 @@ at::Tensor& example_only_for_xpu(at::Tensor& inout) {
 // AutocastXPU, AutogradXPU, CUDA, PrivateUse1, AutogradPrivateUse1, etc.).
 // Registering an implementation for a single backend only is also supported.
 TORCH_LIBRARY(deeplink_ext_, m) {
-  m.def(
-      "adamw(Tensor(a!) param, Tensor(b!) exp_avg, Tensor(c!) exp_avg_sq, "
-      "Tensor? max_exp_avg_sq_opt, Tensor grad, float lr, float beta1, float "
-      "beta2, float epsilon, float weight_decay, int step, bool "
-      "amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!))",
-      adamw);
-  m.def(
-      "apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor "
-      "frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor "
-      "p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)",
-      apply_penalty);
-  m.def(
-      "dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor "
-      "dest_loc)->Tensor(a!)",
-      dest_index_copy_kv);
-  m.def(
-      "rms_norm(Tensor(a!) output, Tensor(b!) inv_rms, Tensor input, int[]? "
-      "normalized_shape, Tensor weight, Tensor? bias_opt, float eps) -> "
-      "(Tensor(a!), Tensor(b!))",
-      rms_norm);
-
-  m.def(
-      "rms_norm_backward(Tensor(a!) grad_input, Tensor(b!) grad_weight, "
-      "Tensor(c!) grad_bias_opt, Tensor grad_output, Tensor input, Tensor "
-      "weight, Tensor? bias_opt, Tensor inv_rms, int[]? normalized_shape, "
-      "float eps) -> (Tensor(a!), Tensor(b!), Tensor(c!))",
-      rms_norm_backward);
-
-  m.def(
-      "apply_rotary(Tensor(a!) output, Tensor input, Tensor cos, Tensor sin, "
-      "bool conj, bool interleaved) -> Tensor(a!)",
-      apply_rotary);
+  if (&diopiAdamW != nullptr) {
+    m.def(
+        "adamw(Tensor(a!) param, Tensor(b!) exp_avg, Tensor(c!) exp_avg_sq, "
+        "Tensor? max_exp_avg_sq_opt, Tensor grad, float lr, float beta1, float "
+        "beta2, float epsilon, float weight_decay, int step, bool "
+        "amsgrad)->(Tensor(a!), Tensor(b!), Tensor(c!))",
+        adamw);
+  }
+  if (&diopiApplyPenalty != nullptr) {
+    m.def(
+        "apply_penalty(Tensor(a!) logits, Tensor presence_penalty, Tensor "
+        "frequency_penalty, Tensor p_token_ids, Tensor p_token_counts, Tensor "
+        "p_cumsum_seq_len, int p_max_len_in_batch)->Tensor(a!)",
+        apply_penalty);
+  }
+  if (&diopiDestIndexCopyKV != nullptr) {
+    m.def(
+        "dest_index_copy_kv(Tensor(a!) out, Tensor k, Tensor "
+        "dest_loc)->Tensor(a!)",
+        dest_index_copy_kv);
+  }
+  if (&diopiRMSNorm != nullptr) {
+    m.def(
+        "rms_norm(Tensor(a!) output, Tensor(b!) inv_rms, Tensor input, int[]? "
+        "normalized_shape, Tensor weight, Tensor? bias_opt, float eps) -> "
+        "(Tensor(a!), Tensor(b!))",
+        rms_norm);
+  }
+
+  if (&diopiRMSNormBackward != nullptr) {
+    m.def(
+        "rms_norm_backward(Tensor(a!) grad_input, Tensor(b!) grad_weight, "
+        "Tensor(c!) grad_bias_opt, Tensor grad_output, Tensor input, Tensor "
+        "weight, Tensor? bias_opt, Tensor inv_rms, int[]? normalized_shape, "
+        "float eps) -> (Tensor(a!), Tensor(b!), Tensor(c!))",
+        rms_norm_backward);
+  }
+  if (&diopiRotaryEmbedding != nullptr) {
+    m.def(
+        "apply_rotary(Tensor(a!) output, Tensor input, Tensor cos, Tensor sin, "
+        "bool conj, bool interleaved) -> Tensor(a!)",
+        apply_rotary);
+  }
   m.def("example(Tensor(a!) inout)->Tensor(a!)", example_for_all_backend);
 }
 
 // only impl for dipu
 TORCH_LIBRARY_IMPL(deeplink_ext_, XPU, m) {
   // m.impl("example", example_only_for_xpu);
 }
 
+int n = [](){
+  std::cout << "deeplink_ext_ loaded" << std::endl;
+  return 0;
+}();
+
 } // namespace dipu::dipu_ext
diff --git a/setup.py b/setup.py
index 0f7fb640..352ca023 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, DeepLink.
 from setuptools import find_packages, setup, Extension
-from torch.utils.cpp_extension import BuildExtension, include_paths, library_paths
+from torch.utils.cpp_extension import BuildExtension, CppExtension, include_paths, library_paths
+
 import glob
 import os
 import subprocess
@@ -86,3 +87,16 @@ def get_ext():
     cmdclass={"build_ext": BuildExtensionWithCompdb},
     install_requires=["einops"],
 )
+
+
+setup(
+    name='deeplink_ext_ops',
+    ext_modules=[
+        CppExtension(
+            name='deeplink_ext_ops',
+            sources=glob.glob("./csrc/*.cpp"),
+            extra_compile_args=[' -g ']),
+    ],
+    cmdclass={
+        'build_ext': BuildExtension
+    })
\ No newline at end of file
diff --git a/test_dispatch.py b/test_dispatch.py
index 44a1e48c..4e2b62f5 100644
--- a/test_dispatch.py
+++ b/test_dispatch.py
@@ -1,7 +1,9 @@
 import torch
 import torch_dipu
 import deeplink_ext
-torch.ops.load_library(deeplink_ext.__path__[0] + "/cpp_extensions.cpython-39-x86_64-linux-gnu.so")
+
+so_path = deeplink_ext.__path__[0] + "/cpp_extensions.cpython-39-x86_64-linux-gnu.so"
+torch.ops.load_library(so_path)
 print(f"torch.ops.loaded_libraries:{torch.ops.loaded_libraries}")
 
 #print(torch.ops.deeplink_ext_.dest_index_copy_kv)
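[Note] Besides the torch.ops.deeplink_ext_ path exercised in test_dispatch.py, a schema registered this way can also be invoked from C++ through the dispatcher. A hedged sketch, not part of the patches; only the op name "deeplink_ext_::example" comes from the code above:

    #include <ATen/ATen.h>
    #include <ATen/core/dispatch/Dispatcher.h>

    at::Tensor& call_example(at::Tensor& t) {
      // Looks up the schema registered by TORCH_LIBRARY(deeplink_ext_, m) and
      // dispatches on t's device key (XPU, CUDA, PrivateUse1, ...).
      static auto op = c10::Dispatcher::singleton()
                           .findSchemaOrThrow("deeplink_ext_::example", "")
                           .typed<at::Tensor&(at::Tensor&)>();
      return op.call(t);
    }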