Revert "fix cat bug" (DeepLink-org#784)

yangbofun · web-flow · commit ba39537e340b · 2023-12-27T11:39:07.000+08:00
Revert "fix cat bug (DeepLink-org#772)" This reverts commit 285cca8.
diff --git a/impl/ascend/common/acloprunner.hpp b/impl/ascend/common/acloprunner.hpp
diff --git a/impl/ascend/device_configs.py b/impl/ascend/device_configs.py
@@ -207,28 +207,9 @@
     ),
 
     'conv_2d_no_contiguous': dict(
-        name=["conv2d"],
-        tensor_para=dict(
-            args=[
-                {
-                    "ins": ["input"],
-                    "dtype": [Skip(np.float32), Skip(np.float16), Skip(np.float64)],
-                },
-            ]
-        ),
-    ),
-
-    'relu_no_contiguous': dict(
-        name=["relu"],
-        is_inplace=True,
-        tensor_para=dict(
-            args=[
-                {
-                    "ins": ['input'],
-                    "dtype": [Skip(np.float32), Skip(np.float64)],
-                },
-            ],
-        ),
+        name=['conv2d'],
+        atol=1e-1,
+        rtol=1e-2,
     ),
 
     'hardswish': dict(
@@ -1607,6 +1588,78 @@
         ),
     ),
 
+    'copy': dict(
+        name=["copy_"],
+        tensor_para=dict(
+            # FIXME data type DT_COMPLEX128 of input [dst] is not supported
+            args=[
+                {
+                    "ins": ["input"],
+                    "shape": [Skip((12, 0, 9)), Skip((8,))],
+                    "dtype": [Skip(np.complex128), Skip(np.complex64)],
+                },
+                {
+                    "ins": ["other"],
+                    "dtype": [Skip(np.complex128)]
+                },
+            ]
+        )
+    ),
+
+    'copy_input_no_contiguous': dict(
+        name=["copy_"],
+        tensor_para=dict(
+            # FIXME complex dtypes are not supported
+            args=[
+                {
+                    "ins": ["input"],
+                    "shape": [Skip((12, 1, 12)),],
+                    "dtype": [Skip(np.complex128), Skip(np.complex64)],
+                },
+                {
+                    "ins": ["other"],
+                    "dtype": [Skip(np.complex64)]
+                },
+            ]
+        )
+    ),
+
+    'copy_other_no_contiguous': dict(
+        name=["copy_"],
+        tensor_para=dict(
+            # FIXME data type DT_COMPLEX64 of input [dst] is not supported
+            # FIXME data type DT_COMPLEX128 of input [dst] is not supported
+            args=[
+                {
+                    "ins": ["input"],
+                    "shape": [Skip((6, 5, 384))],
+                    "dtype": [Skip(np.complex128), Skip(np.complex64)],
+                },
+                {
+                    "ins": ["other"],
+                    "dtype": [Skip(np.complex128)],
+                },
+            ]
+        )
+    ),
+
+    'copy_all_no_contiguous': dict(
+        name=["copy_"],
+        tensor_para=dict(
+            # FIXME data type DT_COMPLEX64 of input [dst] is not supported
+            args=[
+                {
+                    "ins": ["input"],
+                    "shape": [Skip((192, 147, 2)), Skip((2, 12, 38, 45, 3))],
+                },
+                {
+                    "ins": ["other"],
+                    "dtype": [Skip(np.complex64)],
+                },
+            ]
+        )
+    ),
+
     'fill_not_float': dict(
         name=["fill_"],
         tensor_para=dict(
diff --git a/impl/ascend/functions/cast.cpp b/impl/ascend/functions/cast.cpp
@@ -7,14 +7,8 @@
 #include "../common/acloprunner.hpp"
 
 namespace impl {
-
-// TODO(zhaoguochun): fix me
-namespace ascend_npu {
-extern diopiError_t diopiCastDtype(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input);
-}
-
 namespace ascend {
-#if 0
+
 diopiError_t diopiCastDtype(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) {
     int64_t numel = 0;
     diopiGetTensorNumel(input, &numel);
@@ -63,11 +57,6 @@ diopiError_t diopiCastDtype(diopiContextHandle_t ctx, diopiTensorHandle_t out, d
 
     return diopiSuccess;
 }
-#endif
-
-diopiError_t diopiCastDtype(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) {
-    return ascend_npu::diopiCastDtype(ctx, out, input);
-}
 
 }  // namespace ascend
 }  // namespace impl
diff --git a/impl/ascend_npu/ascend_config.yaml b/impl/ascend_npu/ascend_config.yaml
@@ -17,6 +17,8 @@ ascend:
 - diopiBitwiseAnd
 - diopiBitwiseNot
 - diopiBmm
+- diopiCastDtype
+- diopiCat
 - diopiClamp
 - diopiClampInp
 - diopiClampInpScalar
@@ -30,6 +32,7 @@ ascend:
 - diopiClampMinScalar
 - diopiClampScalar
 - diopiContiguous
+- diopiCopyInp
 - diopiCos
 - diopiCosInp
 - diopiCrossEntropyLoss
@@ -201,9 +204,6 @@ ascend:
 - diopiScatterInpScalar
 - diopiApplyPenalty
 ascend_npu:
-- diopiCastDtype
-- diopiCopyInp
-- diopiCat
 - diopiRemainderTensor
 - diopiRemainderScalar
 - diopiRemainder
diff --git a/impl/ascend_npu/diopi_impl/cast.cpp b/impl/ascend_npu/diopi_impl/cast.cpp
diff --git a/impl/ascend_npu/diopi_impl/cat.cpp b/impl/ascend_npu/diopi_impl/cat.cpp
diff --git a/impl/ascend_npu/diopi_impl/copy.cpp b/impl/ascend_npu/diopi_impl/copy.cpp
@@ -12,7 +12,7 @@ namespace OP_IMPL_NS {
 
 diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src, diopiTensorHandle_t dest) {
     BEGIN_CALL_ACL_OP(src, dest);
-    if (src == nullptr || dest == nullptr || !srcAt.defined() || !destAt.defined() || srcAt.numel() <= 0 || destAt.numel() <= 0) {
+    if (!srcAt.defined() || !destAt.defined()) {
         return diopiSuccess;
     }
     at_npu::native::NPUNativeFunctions::copy_(destAt, srcAt, false);
diff --git a/impl/ascend_npu/diopi_impl/helper.hpp b/impl/ascend_npu/diopi_impl/helper.hpp
@@ -108,11 +108,8 @@ inline int debugLevel() {
     impl::aten::setCurCtx(ctx);                                                        \
     BUILD_ATEN_ARGS(__VA_ARGS__)
 
-#define END_CALL_ACL_OP()                                                                         \
-    impl::aten::unsetCurCtx();                                                                    \
-    if (debugLevel()) {                                                                           \
-        std::cout << __FILE__ << ":" << __LINE__ << " :" << __FUNCTION__ << " over" << std::endl; \
-    }                                                                                             \
+#define END_CALL_ACL_OP()      \
+    impl::aten::unsetCurCtx(); \
     return diopiSuccess;
 
 inline void logError() { std::cerr << std::endl; }
diff --git a/impl/ascend_npu/torch_npu/csrc/CopyKernel.cpp b/impl/ascend_npu/torch_npu/csrc/CopyKernel.cpp
@@ -282,7 +282,7 @@ bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& sr
 }
 
 at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) {
-    if (!self.defined() || self.numel() == 0) {
+    if (self.numel() == 0) {
         return self;
     }
     // save tensor dim name
diff --git a/impl/ascend_npu/torch_npu/csrc/DIOPIAdapter.cpp b/impl/ascend_npu/torch_npu/csrc/DIOPIAdapter.cpp
@@ -1572,26 +1572,16 @@ class AclTensorDescMaker {
             dims = storageDesc.base_sizes_;
         }
         auto format = storageDesc.origin_format_;
-        if (debugLevel()) {
-            std::cout << __FUNCTION__ << ":" << dataType << "," << dims << "," << format << std::endl;
-        }
-
         desc = aclCreateTensorDesc(dataType, dims.size(), dims.data(), format);
         return *this;
     }
 
     inline AclTensorDescMaker& Create(aclDataType dataType, c10::IntArrayRef dims, aclFormat format) {
-        if (debugLevel()) {
-            std::cout << __FUNCTION__ << ":" << dataType << "," << dims << "," << format << std::endl;
-        }
         desc = aclCreateTensorDesc(dataType, dims.size(), dims.data(), format);
         return *this;
     }
 
     inline AclTensorDescMaker& Create(aclDataType dataType, aclFormat format) {
-        if (debugLevel()) {
-            std::cout << __FUNCTION__ << ":" << dataType << "," << format << std::endl;
-        }
         desc = aclCreateTensorDesc(dataType, 0, nullptr, format);
         return *this;
     }
@@ -2176,7 +2166,19 @@ std::tuple<aclTensorDesc*, aclDataBuffer*> CovertToAclOutput(const at::Tensor& t
 // This class maintain the position of the current
 // OpCommandImpl object in vector, the resources in
 // the object is
+// Pool of OpCommandImpl objects addressed through a stack-like offset.
+// Pools are stored per thread id and obtained via GetInstanceByTid().
+class OpCommandImpls {
+public:
+    // Returns the pool registered for `tid`, creating it if it does not exist yet.
+    TORCH_NPU_API static OpCommandImpls* GetInstanceByTid(std::thread::id tid);
+    // Advances the offset by one and sets `ptr` to the OpCommandImpl at that slot.
+    TORCH_NPU_API void Push(OpCommandImpl*& ptr);
+    // Steps the offset back by one; the slot itself is kept in `objs`.
+    TORCH_NPU_API void Pop();
 
+private:
+    // Index of the slot handed out by the latest Push(); -1 when none is in use.
+    int32_t offset = -1;
+    // Slots created so far; Push() reuses an existing slot before growing.
+    c10::SmallVector<OpCommandImpl, N> objs;
+};  // class OpCommandImpls
+
+// OpCommandImpls pools keyed by thread id; filled by OpCommandImpls::GetInstanceByTid().
+static std::unordered_map<std::thread::id, OpCommandImpls> opcommand_impls_map;
+// Locked by OpCommandImpls::GetInstanceByTid() around insertion into opcommand_impls_map.
+static std::mutex map_mutex;
 static bool deterministicaclnn_oldstatus = false;
 
 void OpCommandImpl::SetDeterministic() {
@@ -2190,15 +2192,38 @@ void OpCommandImpl::SetDeterministic() {
     }
 }
 
+// Returns the OpCommandImpls pool for `tid`, default-constructing it on first use.
+// The lock covers the lookup as well as the insertion: an unsynchronized find()
+// racing with another thread's insert is a data race on the map. Element
+// addresses survive rehashing, so the pointer stays valid after unlocking.
+OpCommandImpls* OpCommandImpls::GetInstanceByTid(std::thread::id tid) {
+    std::lock_guard<std::mutex> lock(map_mutex);
+    return &opcommand_impls_map[tid];
+}
+
+// Moves to the next slot, creating it when none exists yet, and stores the
+// slot's address in `ptr`.
+void OpCommandImpls::Push(OpCommandImpl*& ptr) {
+    offset += 1;
+    // Slots left behind by earlier Pop() calls are reused; the vector only
+    // grows when the new offset lies past its current end.
+    const bool needsNewSlot = offset >= static_cast<int32_t>(objs.size());
+    if (needsNewSlot) {
+        objs.emplace_back(OpCommandImpl());
+    }
+    TORCH_CHECK(objs.size() > offset, "OpCommand size (", objs.size(), ") is smaller than offset (", offset, ")");
+    ptr = &objs[offset];
+}
+
+// Steps the offset back to the previous slot; the slot's object stays in the
+// vector so a later Push() can hand it out again.
+void OpCommandImpls::Pop() {
+    // A negative offset means no slot is currently handed out.
+    TORCH_CHECK(offset >= 0, "OpCommand current offset should not be less than ", offset);
+    --offset;
+}
+
 OpCommand::OpCommand() {
-    aclCmd = new OpCommandImpl();
+    aclCmds = OpCommandImpls::GetInstanceByTid(std::this_thread::get_id());
+
+    aclCmds->Push(aclCmd);
     aclCmd->SetCustomHandler(nullptr);
 }
 
-OpCommand::~OpCommand() {
-    OpCommandImpl* impl = static_cast<OpCommandImpl*>(aclCmd);
-    delete impl;
-}
+OpCommand::~OpCommand() {}
 
 OpCommand& OpCommand::Name(const string& name) {
     aclCmd->SetName(name);
@@ -2390,6 +2415,7 @@ void OpCommand::Run() {
         Sync();
     }
     aclCmd->releaseSource();
+    aclCmds->Pop();
 }
 
 OpCommand& OpCommand::Sync(c10::SmallVector<int64_t, N>& index) {
diff --git a/impl/ascend_npu/torch_npu/csrc/NPUNativeFunctions.cpp b/impl/ascend_npu/torch_npu/csrc/NPUNativeFunctions.cpp
@@ -173,14 +173,10 @@ at::Tensor& npu_dtype_cast_(at::Tensor& self, const at::Tensor& src) {
         source = npu_broadcast(source, self.sizes());
     }
     if (source.strides() == self.strides()) {
-        // TODO(zhaoguochun): This must be repaired
-        // acl_op::npu_dtype_cast_(self, source);
-        self.copy_(source.cpu().to(self.scalar_type()));
+        acl_op::npu_dtype_cast_(self, source);
     } else {
         at::Tensor selfTemp = at_npu::native::empty_npu(source.sizes(), self.options());
-        // TODO(zhaoguochun): This must be repaired
-        // acl_op::npu_dtype_cast_(selfTemp, source);
-        selfTemp.copy_(source.cpu().to(self.scalar_type()));
+        acl_op::npu_dtype_cast_(selfTemp, source);
         self.copy_(selfTemp);
     }
     return self;
@@ -535,11 +531,7 @@ ::std::tuple<at::Tensor, at::Tensor> npu_max(const at::Tensor& self, int64_t dim
 ::std::tuple<at::Tensor, at::Tensor> npu_max(const at::Tensor& self, at::Dimname dim, bool keepdim) { CUSTOM_OP_NOT_IMPL; }
 at::Tensor npu_bmmV2(const at::Tensor& self, const at::Tensor& mat2, at::IntArrayRef output_sizes) { CUSTOM_OP_NOT_IMPL; }
 
-at::Tensor npu_dtype_cast(const at::Tensor& self, at::ScalarType dtype) {
-    // TODO(zhaoguochun): This must be repaired
-    // return acl_op::npu_dtype_cast(self, dtype);
-    return self.cpu().to(dtype).to(self.device());
-}
+at::Tensor npu_dtype_cast(const at::Tensor& self, at::ScalarType dtype) { return acl_op::npu_dtype_cast(self, dtype); }
 
 at::Tensor npu_silu(const at::Tensor& self) { CUSTOM_OP_NOT_IMPL; }
 at::Tensor& npu_silu_(at::Tensor& self) { CUSTOM_OP_NOT_IMPL; }
diff --git a/impl/ascend_npu/torch_npu/csrc/framework/DIOPIAdapter.h b/impl/ascend_npu/torch_npu/csrc/framework/DIOPIAdapter.h
@@ -770,6 +770,7 @@ using DynamicInputRegFunc = std::function<ge::OperatorPtr(DyNumAndIndex, std::st
 using baseFormatConverter = std::function<FormatShape(c10::IntArrayRef storage_dims, c10::IntArrayRef base_dims)>;
 
 class OpCommand {
+    class OpCommandImpls* aclCmds = nullptr;
     class OpCommandImpl* aclCmd = nullptr;
     c10::SmallVector<at::Tensor, N> storage;
     c10::optional<at::ScalarType> commonType = c10::nullopt;

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ namespace OP_IMPL_NS {`
`12`	`12`
`13`	`13`	`diopiError_t diopiCopyInp(diopiContextHandle_t ctx, diopiConstTensorHandle_t src, diopiTensorHandle_t dest) {`
`14`	`14`	`BEGIN_CALL_ACL_OP(src, dest);`
`15`		`- if (src == nullptr \|\| dest == nullptr \|\| !srcAt.defined() \|\| !destAt.defined() \|\| srcAt.numel() <= 0 \|\| destAt.numel() <= 0) {`
	`15`	`+ if (!srcAt.defined() \|\| !destAt.defined()) {`
`16`	`16`	`return diopiSuccess;`
`17`	`17`	`}`
`18`	`18`	`at_npu::native::NPUNativeFunctions::copy_(destAt, srcAt, false);`
Original file line number	Diff line number	Diff line change
`@@ -282,7 +282,7 @@ bool try_to_optimize_copy_with_any_format(at::Tensor& self, const at::Tensor& sr`
`282`	`282`	`}`
`283`	`283`
`284`	`284`	`at::Tensor& NPUNativeFunctions::copy_(at::Tensor& self, const at::Tensor& src, bool non_blocking) {`
`285`		`- if (!self.defined() \|\| self.numel() == 0) {`
	`285`	`+ if (self.numel() == 0) {`
`286`	`286`	`return self;`
`287`	`287`	`}`
`288`	`288`	`// save tensor dim name`