DeepLink-org · wanfengcxz · Jul 10, 2024 · Jul 10, 2024 · Jul 10, 2024 · Jul 11, 2024
@@ -24,12 +24,10 @@ Standard:  Cpp11
 AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: true
 BinPackArguments: false
-BreakAfterAttributes: Leave
 ColumnLimit: 160
 DerivePointerAlignment: false
 PointerAlignment: Left
 ReferenceAlignment: Left
-InsertNewlineAtEOF: true
 SpacesBeforeTrailingComments: 2
 IncludeIsMainSourceRegex: (\.cu)$
 IncludeCategories:

@@ -6372,10 +6372,10 @@
         name=['unique'],
         interface=['torch'],
         para=dict(
-            sorted=[True, True, False, True, False, False, True, False],
-            return_inverse=[False, True, True, False, True, True, False, True],
-            return_counts=[False, False, True, True, True, False, True, False],
-            dim=[None, -1, 1, None, 2, 0, 1, -2],
+            sorted=         [True, True, False, True, False, False, True, False],
+            return_inverse= [False, True, True, False, True, True, False, True],
+            return_counts=  [False, False, True, True, True, False, True, False],
+            dim=            [None, -1,    1,     None, 2,    0,     1,    -2],
         ),
         tensor_para=dict(
             gen_fn='Genfunc.randn',

@@ -139,6 +139,7 @@ def compare_others(output, output_reference, **kwargs):
     def allclose(tensor_dev: np.ndarray, tensor_ref: np.ndarray, **kwargs) -> bool:
         var_name = kwargs.get('name', 'out')
         sum_to_compare = kwargs.get('sum_to_compare', False)
+        # sum_to_compare = False
         rtol = kwargs.get('rtol', 1e-5)
         atol = kwargs.get('atol', 1e-8)
         mismatch_ratio_threshold = kwargs.get('mismatch_ratio_threshold', 1e-3)
@@ -150,15 +151,20 @@ def allclose(tensor_dev: np.ndarray, tensor_ref: np.ndarray, **kwargs) -> bool:
         glob_vars.func_status[glob_vars.cur_test_func] = 'passed'
         if not passed:
             glob_vars.func_status[glob_vars.cur_test_func] = 'failed'
+            print(f"tensor_dev {tensor_dev}")
+            print(f"tensor_ref {tensor_ref}")
             sum1 = tensor_dev.sum()
             sum2 = tensor_ref.sum()
             mask = np.isclose(tensor_dev, tensor_ref, rtol, atol, equal_nan=True)
             count = np.count_nonzero(np.equal(mask, False))
             debug_level = glob_vars.debug_level
+            if debug_level < 1:
+                print(f'debug_level {debug_level}')
+                debug_level = 100
             if tensor_dev.dtype == np.bool_:
                 max_diff = 1
                 error_info = f"The count of elements that do not meet the accuracy requirement is {count}.\n" + \
-                    f"Max of diff is {max_diff}.\n"
+                    f"\n"
             elif tensor_dev.ndim == 0 and tensor_ref.ndim == 0:
                 # result is scalar array
                 error_info = f"The actual val is {tensor_dev} and the expected is {tensor_ref}.\n"
@@ -187,7 +193,7 @@ def allclose(tensor_dev: np.ndarray, tensor_ref: np.ndarray, **kwargs) -> bool:
                 if np.isnan(sum1) or np.isnan(sum2):
                     error_info += f"Exists nan, {var_name} is {sum1} and {var_name}_ref is {sum2}.\n"
                 else:
-                    error_info += f"Sum of {var_name} is {sum1}, Sum of {var_name}_ref is {sum2}, Max of diff is {max_diff}.\n"
+                    error_info += f"Sum of {var_name} is {sum1}, Sum of {var_name}_ref is {sum2}.\n"
                 if debug_level > 1:
                     error_info += f"{var_name} is {tensor_dev},\n{var_name}_ref is {tensor_ref},\nMask is {mask}\n"
             raise OutputCheckFailedException(error_info)
@@ -186,6 +186,7 @@ def parse_args():
         if args.pytest_args is not None:
             pytest_args.extend(args.pytest_args.split())
         pytest_args = ['--cache-clear', '--disable-warnings'] + pytest_args
+        # pytest_args = [ '-s'] + pytest_args
         exit_code = pytest.main(pytest_args)
         if exit_code != 0:
             raise SystemExit(exit_code)

@@ -0,0 +1,62 @@
+import torch
+import torch_dipu
+import torch.nn as nn
+import os
+
+def add_forward(x):
+    """Return ``x`` added to itself."""
+    doubled = x + x
+    return doubled
+
+def linear_forward(x):
+    """Run ``x`` through a freshly constructed square ``nn.Linear`` layer.
+
+    The global torch seed is reset to 0 before the layer is built, so every
+    call starts from the same RNG state when the parameters are initialised.
+    The layer's input and output width is the size of the last dimension of
+    ``x``, and the layer is moved to ``x.device`` before it is applied.
+    """
+    torch.manual_seed(0)
+    width = x.shape[-1]
+    layer = nn.Linear(width, width)
+    layer.to(x.device)
+    return layer(x)
+
+def conv_forward(x):
+    """Run ``x`` through a freshly constructed 3x3 ``nn.Conv2d`` layer.
+
+    ``x`` must have exactly four dimensions; the size of the second one is
+    used as both the input and the output channel count.  The convolution uses
+    stride 1 and padding 1.  The global torch seed is reset to 0 before the
+    layer is built, and the layer is moved to ``x.device`` before it is applied.
+    """
+    torch.manual_seed(0)
+    _, channels, _, _ = x.shape
+    layer = nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1)
+    layer.to(x.device)
+    return layer(x)
+
+
+def batch_norm_forward(x):
+    """Run ``x`` through a freshly constructed ``nn.BatchNorm2d`` layer.
+
+    ``x`` must have exactly four dimensions; the size of the second one is
+    used as the number of features.  The layer is left in its default
+    (training) mode, so the output is normalised with the statistics of ``x``
+    itself, and it is moved to ``x.device`` before it is applied.
+    """
+    _, channels, _, _ = x.shape
+
+    # BatchNorm2d initialises its weight to ones and its bias to zeros without
+    # drawing from the RNG, so CPU and device runs start from identical
+    # parameters and no manual seeding is required here.
+    layer = nn.BatchNorm2d(channels)
+    layer.to(x.device)
+    return layer(x)
+
+
+def test_func_acc(func, x_cpu):
+    """Compare ``func`` evaluated on the CPU input and on its ``.cuda()`` copy.
+
+    Returns the sum of absolute differences between the two results divided
+    by the number of elements in ``x_cpu``.
+    """
+    x_device = x_cpu.cuda()
+
+    expected = func(x_cpu)
+    actual = func(x_device)
+
+    abs_error = (actual.cpu() - expected).abs()
+    return abs_error.sum() / x_cpu.numel()
+
+def print_diff(fun_name, diff):
+    """Print ``diff`` on one line, labelled with ``fun_name``."""
+    message = f"{fun_name} mean diff: {diff}"
+    print(message)
+
+if __name__ == "__main__":
+    torch.cuda.set_device(0)
+
+    # NOTE(review): presumably these switches make torch_dipu dump operator
+    # arguments and auto-compare every op against a reference; confirm.
+    os.environ.update({
+        'DIPU_DUMP_OP_ARGS': '1',
+        'DIPU_AUTOCOMPARE_OPS_LIST': '.*',
+    })
+
+    # (label, forward function, shape of the random CPU input)
+    cases = (
+        ("add_forward", add_forward, (100, 100)),
+        ("linear_forward", linear_forward, (1000, 1000)),
+        ("conv_forward", conv_forward, (2, 32, 100, 100)),
+    )
+    for label, forward, shape in cases:
+        # Draw the input right before each case so the RNG is consumed in the
+        # same order as running the cases one after another.
+        print_diff(label, test_func_acc(forward, x_cpu=torch.randn(*shape)))
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Run `python main.py --mode run_test` with the tracking/debug environment
+# variables below and copy its output into a timestamped log file.
+
+# Without pipefail the pipeline would report tee's exit status, hiding a
+# failing test run from whoever invokes this script.
+set -o pipefail
+
+current_time=$(date "+%Y.%m.%d-%H.%M.%S")
+
+DIOPI_TRACK_ACL=1 DIPU_TRACK_ACL=1 DIPU_TRACK_HCCL=1 ASCEND_COREDUMP_SIGNAL=1 ASCEND_GLOBAL_LOG_LEVEL=0 DIPU_DEBUG_ALLOCATOR=15 \
+    python main.py --mode run_test | tee "test_unique_${current_time}.log"
@@ -0,0 +1,30 @@
+import torch
+import torch_dipu
+
+# Rows of a small 2-D integer tensor; the second and fourth rows are equal.
+rows = [[4, 5, 6, 1], [1, 2, 3, 1], [2, 3, 4, 1], [1, 2, 3, 1]]
+
+# CPU run: unsorted unique along dim 0, also returning inverse indices and counts.
+cpu_input = torch.tensor(rows)
+cpu_unique, inverse_indices, counts = torch.unique(cpu_input, dim=0, sorted=False, return_inverse=True, return_counts=True)
+print(cpu_unique)
+
+# Same data moved with .cuda(): unique along dim 0 with default options.
+device_unique = torch.unique(torch.tensor(rows).cuda(), dim=0)
+print(device_unique)
+
+# These two results come from the CPU call above.
+print(inverse_indices)
+print(counts)
@@ -0,0 +1,14 @@
+import torch
+import torch_dipu
+# import torch_npu
+# from torch_npu.contrib import transfer_to_npu
+
+# All-ones int32 tensor of shape (4, 64, 128), moved with .cuda().
+ones = torch.ones([4, 64, 128], dtype=torch.int32).cuda()
+
+# Unsorted unique along dim 1, also returning inverse indices and counts.
+unique_x, inverse_indices, counts = torch.unique(ones, dim=1, sorted=False, return_inverse=True, return_counts=True)
+
+# Print the unique values, the inverse indices, and the counts, in that order.
+for result in (unique_x, inverse_indices, counts):
+    print(result)
@@ -0,0 +1,25 @@
+import torch
+import torch_dipu
+
+# 252 random float32 samples, kept on the CPU.
+samples = torch.randn([252], dtype=torch.float32)
+
+# Sorted unique along the last dimension, with inverse indices and without counts.
+unique_x, inverse_indices = torch.unique(samples, dim=-1, sorted=True, return_inverse=True, return_counts=False)
+
+# Print the unique values first, then the inverse indices.
+for result in (unique_x, inverse_indices):
+    print(result)
@@ -364,7 +364,7 @@ void callAclnnImpl(diopiContextHandle_t ctx, const std::tuple<Args...>& tuple) {
         static constexpr const char kWorkspaceApiName[] = #api "GetWorkspaceSize";                                \
         auto convertedParams = ::impl::ascend::aclnn_adaptor::convertParams(__VA_ARGS__);                         \
         ::impl::ascend::aclnn_adaptor::callAclnnImpl<kApiName, kWorkspaceApiName>(ctx, convertedParams.params()); \
-    } while (false)
+    } while (false);
 
 #define DIOPI_ASECND_CALL_ACLNN_TYPE_SYNC(api, ctx, ...)                                             \
     do {                                                                                             \
@@ -374,12 +374,12 @@ void callAclnnImpl(diopiContextHandle_t ctx, const std::tuple<Args...>& tuple) {
         diopiStreamHandle_t stream;                                                                  \
         diopiGetStream(ctx, &stream);                                                                \
         CALL_ACLRT(aclrtSynchronizeStream(reinterpret_cast<aclrtStream>(stream)));                   \
-    } while (false)
+    } while (false);
 
+// NOTE(review): the body below invokes DIOPI_ASECND_CALL_ACLNN_TYPE_SYNC without a trailing ';', so it only forms a
+// valid statement because that macro's definition now ends in "while (false);". Ending a do/while(false) macro with
+// ';' makes `MACRO(...);` expand to two statements, which breaks its use as the un-braced body of an if/else.
+// Consider terminating the inner call with ';' here and keeping the macro definitions as plain "while (false)".
 #define DIOPI_ASCEND_CALL_ACLNN_SYNC(api, ctx, ...)                                       \
     do {                                                                                  \
         auto convertedParams = ::impl::ascend::aclnn_adaptor::convertParams(__VA_ARGS__); \
         DIOPI_ASECND_CALL_ACLNN_TYPE_SYNC(api, ctx, convertedParams.params())             \
-    } while (false)
+    } while (false);
 
 #endif  // IMPL_ASCEND_ACLNN_ADAPTOR_HPP_
@@ -212,7 +212,7 @@
     ),
 
     'pointwise_op': dict(
-        name=['erfinv', 'asin'],
+        name=['asin'],
         tensor_para=dict(
             args=[
                 {
@@ -982,30 +982,6 @@
         ),
     ),
 
-    'bernoulli': dict(
-        name=['bernoulli'],
-        tensor_para=dict(
-            args=[
-                {
-                    "ins": ['input'],
-                    "dtype": [Skip(np.float32),Skip(np.float64),Skip(np.float16),],
-                },
-            ]
-        ),
-    ),
-
-    'bernoulli_int': dict(
-        name=['bernoulli'],
-        tensor_para=dict(
-            args=[
-                {
-                    "ins": ['input'],
-                    "dtype": [Skip(np.int64),Skip(np.int32),Skip(np.int16),Skip(np.int8),Skip(np.uint8),Skip(np.bool_),],
-                },
-            ]
-        ),
-    ),
-
     'layer_norm': dict(
         name=['layer_norm'],
         atol=1e-2,

@@ -0,0 +1,40 @@
+/**
+ * @file
+ * @author DeepLink
+ * @copyright  (c) 2024, DeepLink.
+ */
+
+#include "../aclnn/acl_scalar.hpp"
+#include "../aclnn/adaptor.hpp"
+
+namespace impl {
+namespace ascend {
+// Third argument passed to getSeedAndOffset() by the Bernoulli functions in this file.
+// NOTE(review): presumably the amount by which the generator's offset is advanced per call; confirm.
+static const uint64_t philoxDefaultNum = 10;
+
+// Obtains a (seed, offset) pair for `generator` and invokes aclnnInplaceBernoulliTensor through
+// DIOPI_ASCEND_CALL_ACLNN with `out`, `input`, seed and offset, then returns diopiSuccess.
+// NOTE(review): presumably `input` holds the per-element probabilities and `out` receives the samples; confirm
+// against the aclnn API.
+diopiError_t diopiBernoulli(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input, diopiGeneratorHandle_t generator) {
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, philoxDefaultNum);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;
+    DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceBernoulliTensor, ctx, out, input, seed, offset);
+    return diopiSuccess;
+}
+
+// In-place variant: obtains a (seed, offset) pair for `generator` and invokes aclnnInplaceBernoulliTensor with
+// `inout` passed as both tensor arguments, then returns diopiSuccess.
+// NOTE(review): the same tensor is used for both tensor parameters; confirm the aclnn API permits this aliasing.
+diopiError_t diopiBernoulliInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, diopiGeneratorHandle_t generator) {
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, philoxDefaultNum);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;
+    DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceBernoulliTensor, ctx, inout, inout, seed, offset);
+    return diopiSuccess;
+}
+
+// Scalar variant: obtains a (seed, offset) pair for `generator`, wraps `p` as a float64 diopi scalar and invokes
+// aclnnInplaceBernoulli with `out`, the scalar's address, seed and offset, then returns diopiSuccess.
+// NOTE(review): presumably `p` is the single probability applied to every element of `out`; confirm.
+diopiError_t diopiBernoulliScalar(diopiContextHandle_t ctx, diopiTensorHandle_t out, double p, diopiGeneratorHandle_t generator) {
+    const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, philoxDefaultNum);
+    const uint64_t seed = gen.first;
+    const uint64_t offset = gen.second;
+    auto pScalar = constructDiopiScalarT(diopi_dtype_float64, p);
+    DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceBernoulli, ctx, out, &pScalar, seed, offset);
+    return diopiSuccess;
+}
+
+}  // namespace ascend
+}  // namespace impl
@@ -0,0 +1,23 @@
+/**
+ * @file
+ * @author DeepLink
+ * @copyright  (c) 2024, DeepLink.
+ */
+
+#include "../aclnn/adaptor.hpp"
+
+namespace impl {
+namespace ascend {
+
+// Invokes aclnnErfinv through DIOPI_ASCEND_CALL_ACLNN, passing `input` first and `out` second, then returns
+// diopiSuccess.
+// NOTE(review): presumably `out` receives the element-wise inverse error function of `input`; confirm.
+diopiError_t diopiErfinv(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiConstTensorHandle_t input) {
+    DIOPI_ASCEND_CALL_ACLNN(aclnnErfinv, ctx, input, out);
+    return diopiSuccess;
+}
+
+// In-place variant: invokes aclnnInplaceErfinv through DIOPI_ASCEND_CALL_ACLNN with `input` as the only tensor
+// argument, then returns diopiSuccess.
+diopiError_t diopiErfinvInp(diopiContextHandle_t ctx, diopiTensorHandle_t input) {
+    DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceErfinv, ctx, input);
+    return diopiSuccess;
+}
+
+}  // namespace ascend
+}  // namespace impl
@@ -90,7 +90,7 @@ diopiError_t diopiMaxPool2dWithIndices(diopiContextHandle_t ctx, diopiTensorHand
     ;
     ASCEND_CHECK_ABORT(indicesAt.dtype() == diopi_dtype_int32, "aclnnMaxPool2dWithIndices only support int32 indices");
 
-    DIOPI_ASCEND_CALL_ACLNN(aclnnMaxPool2dWithIndices,
+    DIOPI_ASCEND_CALL_ACLNN(aclnnMaxPool2dWithIndices, /* aclnnAdaptiveMaxPool2d */
                             ctx,
                             inputAt,
                             diopiSize_t{kernelSizeData, 2},