diff --git a/third_party/compute_library/acl_thread_local_scheduler.patch b/third_party/compute_library/acl_thread_local_scheduler.patch
deleted file mode 100644
index 9ebf6b71f..000000000
--- a/third_party/compute_library/acl_thread_local_scheduler.patch
+++ /dev/null
@@ -1,98 +0,0 @@
-diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
-index 9e8add1f9..cf5e2bf4c 100644
---- a/arm_compute/runtime/Scheduler.h
-+++ b/arm_compute/runtime/Scheduler.h
-@@ -75,7 +75,7 @@ public:
- 
- private:
-     static Type _scheduler_type;
--    static std::shared_ptr<IScheduler> _custom_scheduler;
-+    static thread_local std::shared_ptr<IScheduler> _custom_scheduler;
-     static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
- 
-     Scheduler();
-diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
-index a5b9eca56..d1ab19397 100644
---- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
-+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
-@@ -60,8 +60,8 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src,
-                                                    const ConvolutionInfo &info)
- {
-     ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info);
--    const CPUInfo &ci = NEScheduler::get().cpu_info();
--    const unsigned int num_threads = NEScheduler::get().num_threads();
-+    const CPUInfo &ci = CPUInfo::get();
-+    const unsigned int num_threads = CPUInfo::get().get_cpu_num();
-     _pImpl->is_prepared = false;
-     _pImpl->are_weights_const = weights->are_values_constant();
- 
-diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
-index 722cd36ee..03aef1632 100644
---- a/src/cpu/operators/CpuPool2d.cpp
-+++ b/src/cpu/operators/CpuPool2d.cpp
-@@ -66,8 +66,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer
- 
-     if(run_optimised)
-     {
--        const CPUInfo &ci = NEScheduler::get().cpu_info();
--        const unsigned int num_threads = NEScheduler::get().num_threads();
-+        const CPUInfo &ci = CPUInfo::get();
-+        const unsigned int num_threads = CPUInfo::get().get_cpu_num();
- 
-         auto pooling_wrapper = std::make_unique();
-         ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr);
-diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-index 9c8563140..f7771945a 100644
---- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
-+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
-@@ -623,8 +623,8 @@ void create_arm_gemm(std::unique_ptr &arm_ge
-                      arm_gemm::Activation activation, const AsmGemmInfo &info)
- {
-     Params p = extract_parameters(a, b, d, info);
--    const CPUInfo &ci = NEScheduler::get().cpu_info();
--    unsigned int num_threads = NEScheduler::get().num_threads();
-+    const CPUInfo &ci = CPUInfo::get();
-+    unsigned int num_threads = CPUInfo::get().get_cpu_num();
- 
-     arm_gemm::GemmConfig cfg;
-     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
-@@ -696,8 +696,8 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
-     ARM_COMPUTE_UNUSED(c);
-     arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
-     Params p = extract_parameters(a, b, d, info);
--    const CPUInfo &ci = NEScheduler::get().cpu_info();
--    unsigned int num_threads = NEScheduler::get().num_threads();
-+    const CPUInfo &ci = CPUInfo::get();
-+    unsigned int num_threads = CPUInfo::get().get_cpu_num();
-     arm_gemm::GemmConfig cfg;
-     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
-     arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
-diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
-index 0713b9a2a..f15ac2e22 100644
---- a/src/runtime/Scheduler.cpp
-+++ b/src/runtime/Scheduler.cpp
-@@ -47,7 +47,7 @@ Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::CPP;
- Scheduler::Type Scheduler::_scheduler_type = Scheduler::Type::ST;
- #endif /* ARM_COMPUTE_*_SCHEDULER */
- 
--std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
-+thread_local std::shared_ptr<IScheduler> Scheduler::_custom_scheduler = nullptr;
- 
- namespace
- {
diff --git a/third_party/compute_library/exclude_omp_scheduler.patch b/third_party/compute_library/exclude_omp_scheduler.patch
index 7ccfebbd3..1724c4ab7 100644
--- a/third_party/compute_library/exclude_omp_scheduler.patch
+++ b/third_party/compute_library/exclude_omp_scheduler.patch
@@ -1,8 +1,8 @@
 diff --git a/src/BUILD.bazel b/src/BUILD.bazel
-index bf71e534e2..22377f1a32 100644
+index 547c98576..a31301230 100644
 --- a/src/BUILD.bazel
 +++ b/src/BUILD.bazel
-@@ -971,7 +971,6 @@ filegroup(
+@@ -1029,7 +1029,6 @@ filegroup(
          "runtime/NEON/functions/NETranspose.cpp",
          "runtime/NEON/functions/NEUnstack.cpp",
          "runtime/NEON/functions/NEWinogradConvolutionLayer.cpp",
@@ -10,10 +10,10 @@ index bf71e534e2..22377f1a32 100644
          "runtime/OffsetLifetimeManager.cpp",
          "runtime/OffsetMemoryPool.cpp",
          "runtime/OperatorTensor.cpp",
-@@ -984,6 +983,10 @@ filegroup(
-         "runtime/Tensor.cpp",
-         "runtime/TensorAllocator.cpp",
-         "runtime/Utils.cpp"] +
+@@ -1058,6 +1057,10 @@ filegroup(
+         "runtime/experimental/operators/CpuSub.cpp",
+         "runtime/experimental/operators/CpuTranspose.cpp",
+         "runtime/experimental/operators/CpuWinogradConv2d.cpp"] +
 + select({
 +     "//:openmp_flag": ["runtime/OMP/OMPScheduler.cpp"],
 +     "//conditions:default": [],
diff --git a/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/mkl_dnn/mkldnn_acl.BUILD
index cbde2d13b..ce02d655e 100644
--- a/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -9,13 +9,6 @@ _DNNL_COPTS_THREADPOOL = [
     "-UUSE_CBLAS",
 ]
 
-_DNNL_COPTS_OMP = [
-    "-fopenmp",
-    "-fexceptions",
"-UUSE_MKL", - "-UUSE_CBLAS", -] - _DNNL_RUNTIME_THREADPOOL = { "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", @@ -63,61 +56,24 @@ _DNNL_RUNTIME_THREADPOOL = { "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0", "#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0", "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0", -} - -_DNNL_RUNTIME_OMP = { - "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP", - "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP", - "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", - "#cmakedefine DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE": "#undef DNNL_USE_RT_OBJECTS_IN_PRIMITIVE_CACHE", - "#cmakedefine DNNL_WITH_SYCL": "#undef DNNL_WITH_SYCL", - "#cmakedefine DNNL_WITH_LEVEL_ZERO": "#undef DNNL_WITH_LEVEL_ZERO", - "#cmakedefine DNNL_SYCL_CUDA": "#undef DNNL_SYCL_CUDA", - "#cmakedefine DNNL_SYCL_HIP": "#undef DNNL_SYCL_HIP", - "#cmakedefine DNNL_ENABLE_STACK_CHECKER": "#undef DNNL_ENABLE_STACK_CHECKER", - "#cmakedefine DNNL_EXPERIMENTAL": "#undef DNNL_EXPERIMENTAL", - "#cmakedefine ONEDNN_BUILD_GRAPH": "#undef ONEDNN_BUILD_GRAPH", - "#cmakedefine01 BUILD_TRAINING": "#define BUILD_TRAINING 1", - "#cmakedefine01 BUILD_INFERENCE": "#define BUILD_INFERENCE 0", - "#cmakedefine01 BUILD_PRIMITIVE_ALL": "#define BUILD_PRIMITIVE_ALL 1", - "#cmakedefine01 BUILD_BATCH_NORMALIZATION": "#define BUILD_BATCH_NORMALIZATION 0", - "#cmakedefine01 BUILD_BINARY": "#define BUILD_BINARY 0", - "#cmakedefine01 BUILD_CONCAT": "#define BUILD_CONCAT 0", - "#cmakedefine01 BUILD_CONVOLUTION": "#define BUILD_CONVOLUTION 0", - "#cmakedefine01 BUILD_DECONVOLUTION": "#define BUILD_DECONVOLUTION 0", - "#cmakedefine01 BUILD_ELTWISE": "#define BUILD_ELTWISE 0", - "#cmakedefine01 BUILD_INNER_PRODUCT": "#define BUILD_INNER_PRODUCT 0", - "#cmakedefine01 BUILD_LAYER_NORMALIZATION": "#define BUILD_LAYER_NORMALIZATION 0", - "#cmakedefine01 BUILD_LRN": "#define BUILD_LRN 0", - "#cmakedefine01 BUILD_MATMUL": "#define BUILD_MATMUL 0", - "#cmakedefine01 BUILD_POOLING": "#define BUILD_POOLING 0", - "#cmakedefine01 BUILD_PRELU": "#define BUILD_PRELU 0", - "#cmakedefine01 BUILD_REDUCTION": "#define BUILD_REDUCTION 0", - "#cmakedefine01 BUILD_REORDER": "#define BUILD_REORDER 0", - "#cmakedefine01 BUILD_RESAMPLING": "#define BUILD_RESAMPLING 0", - "#cmakedefine01 BUILD_RNN": "#define BUILD_RNN 0", - "#cmakedefine01 BUILD_SHUFFLE": "#define BUILD_SHUFFLE 0", - "#cmakedefine01 BUILD_SOFTMAX": "#define BUILD_SOFTMAX 0", - "#cmakedefine01 BUILD_SUM": "#define BUILD_SUM 0", - "#cmakedefine01 BUILD_PRIMITIVE_CPU_ISA_ALL": "#define BUILD_PRIMITIVE_CPU_ISA_ALL 0", - "#cmakedefine01 BUILD_SSE41": "#define BUILD_SSE41 0", - "#cmakedefine01 BUILD_AVX2": "#define BUILD_AVX2 0", - "#cmakedefine01 BUILD_AVX512": "#define BUILD_AVX512 0", - "#cmakedefine01 BUILD_AMX": "#define BUILD_AMX 0", - "#cmakedefine01 BUILD_PRIMITIVE_GPU_ISA_ALL": "#define BUILD_PRIMITIVE_GPU_ISA_ALL 0", - "#cmakedefine01 BUILD_GEN9": "#define BUILD_GEN9 0", - "#cmakedefine01 BUILD_GEN11": "#define BUILD_GEN11 0", - "#cmakedefine01 BUILD_XELP": "#define BUILD_XELP 0", - "#cmakedefine01 BUILD_XEHPG": "#define BUILD_XEHPG 0", - 
"#cmakedefine01 BUILD_XEHPC": "#define BUILD_XEHPC 0", - "#cmakedefine01 BUILD_XEHP": "#define BUILD_XEHP 0", + "#cmakedefine01 BUILD_GROUP_NORMALIZATION": "#define BUILD_GROUP_NORMALIZATION 0", + "#cmakedefine01 BUILD_GEMM_KERNELS_ALL": "#define BUILD_GEMM_KERNELS_ALL 1", + "#cmakedefine01 BUILD_GEMM_KERNELS_NONE": "#define BUILD_GEMM_KERNELS_NONE 0", + "#cmakedefine01 BUILD_GEMM_SSE41": "#define BUILD_GEMM_SSE41 0", + "#cmakedefine01 BUILD_GEMM_AVX2": "#define BUILD_GEMM_AVX2 0", + "#cmakedefine01 BUILD_GEMM_AVX512": "#define BUILD_GEMM_AVX512 0", + "#cmakedefine DNNL_GPU_VENDOR": "#define DNNL_GPU_VENDOR INTEL", + "#cmakedefine DNNL_SYCL_GENERIC": "#undef DNNL_SYCL_GENERIC", + "#cmakedefine DNNL_DISABLE_GPU_REF_KERNELS": "#undef DNNL_DISABLE_GPU_REF_KERNELS", + "#cmakedefine01 BUILD_SDPA": "#define BUILD_SDPA 0", + "#cmakedefine01 BUILD_XE2": "#define BUILD_XE2 0", + "#cmakedefine01 BUILD_XE3": "#define BUILD_XE3 0", } expand_template( name = "dnnl_config_h", out = "include/oneapi/dnnl/dnnl_config.h", substitutions = select({ - "@xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_RUNTIME_OMP, "//conditions:default": _DNNL_RUNTIME_THREADPOOL, }), template = "include/oneapi/dnnl/dnnl_config.h.in", @@ -128,13 +84,21 @@ expand_template( out = "include/oneapi/dnnl/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "3", - "@DNNL_VERSION_MINOR@": "2", - "@DNNL_VERSION_PATCH@": "1", - "@DNNL_VERSION_HASH@": "N/A", + "@DNNL_VERSION_MINOR@": "7", + "@DNNL_VERSION_PATCH@": "0", }, template = "include/oneapi/dnnl/dnnl_version.h.in", ) +expand_template( + name = "dnnl_version_hash_h", + out = "include/oneapi/dnnl/dnnl_version_hash.h", + substitutions = { + "@DNNL_VERSION_HASH@": "N/A", + }, + template = "include/oneapi/dnnl/dnnl_version_hash.h.in", +) + cc_library( name = "mkl_dnn_acl", srcs = glob( @@ -146,10 +110,11 @@ cc_library( exclude = [ "src/cpu/x64/**", "src/cpu/rv64/**", + "src/cpu/sycl/**", + "src/xpu/**", ], ), copts = select({ - "@xla//xla/tsl/mkl:build_with_mkl_aarch64_openmp": _DNNL_COPTS_OMP, "//conditions:default": _DNNL_COPTS_THREADPOOL, }), defines = ["DNNL_AARCH64_USE_ACL=1"], @@ -175,6 +140,7 @@ cc_library( ) + [ ":dnnl_config_h", ":dnnl_version_h", + ":dnnl_version_hash_h", ], visibility = ["//visibility:public"], deps = [ diff --git a/third_party/mkl_dnn/onednn_acl_add_bf16_platform_support_check.patch b/third_party/mkl_dnn/onednn_acl_add_bf16_platform_support_check.patch deleted file mode 100644 index 42dd26232..000000000 --- a/third_party/mkl_dnn/onednn_acl_add_bf16_platform_support_check.patch +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright 2024 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-
-diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp
-index 65b887ea21..eabdb827bd 100644
---- a/src/cpu/platform.cpp
-+++ b/src/cpu/platform.cpp
-@@ -117,6 +117,8 @@ bool has_data_type_support(data_type_t data_type) {
- #if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
-             return true;
- #endif
-+#elif DNNL_AARCH64_USE_ACL
-+            return arm_compute::CPUInfo::get().has_bf16();
- #else
-             return false;
- #endif
---
-2.34.1
-
diff --git a/third_party/mkl_dnn/onednn_acl_add_sbgemm_matmul_primitive_definition.patch b/third_party/mkl_dnn/onednn_acl_add_sbgemm_matmul_primitive_definition.patch
deleted file mode 100644
index 779608a68..000000000
--- a/third_party/mkl_dnn/onednn_acl_add_sbgemm_matmul_primitive_definition.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
-index ab13efb9b2..ec261e156d 100644
---- a/src/cpu/aarch64/matmul/acl_matmul.hpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
-@@ -78,11 +78,21 @@ struct acl_matmul_t : public primitive_t {
-                 = utils::everyone_is(data_type::f16, src_md()->data_type,
-                         weights_md()->data_type, dst_md()->data_type)
-                 && platform::has_data_type_support(data_type::f16);
-+        const bool is_fp32_bf16_ok
-+                = (utils::everyone_is(data_type::f32, src_md()->data_type,
-+                           dst_md()->data_type, desc()->accum_data_type)
-+                        && platform::has_data_type_support(data_type::f32)
-+                        && utils::everyone_is(
-+                                data_type::bf16, weights_md()->data_type)
-+                        && platform::has_data_type_support(
-+                                data_type::bf16));
-+
-         const bool is_weights_md_format_ok
-                 = utils::one_of(weights_format_kind_received,
-                         format_kind::any, format_kind::blocked);
-         bool ok = is_dense_data()
--                && utils::one_of(true, is_fp32_ok, is_fp16_ok)
-+                && utils::one_of(
-+                        true, is_fp32_ok, is_fp16_ok, is_fp32_bf16_ok)
-                 && !has_zero_dim_memory() && is_weights_md_format_ok
-                 && set_default_formats()
-                 && attr()->has_default_values(
---
-2.34.1
diff --git a/third_party/mkl_dnn/onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch b/third_party/mkl_dnn/onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch
deleted file mode 100644
index ec2cb97f5..000000000
--- a/third_party/mkl_dnn/onednn_acl_allow_blocked_weight_format_for_matmul_primitive.patch
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/aarch64/matmul/acl_matmul.hpp
-index 451cc78d52..ab13efb9b2 100644
---- a/src/cpu/aarch64/matmul/acl_matmul.hpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul.hpp
-@@ -67,6 +67,8 @@ struct acl_matmul_t : public primitive_t {
- 
-     status_t init(engine_t *engine) {
-         using smask_t = primitive_attr_t::skip_mask_t;
-+        const format_kind_t weights_format_kind_received
-+                = weights_md_.format_kind;
-         const bool is_fp32_ok
-                 = utils::everyone_is(data_type::f32, src_md()->data_type,
-                         weights_md()->data_type, dst_md()->data_type,
-@@ -76,18 +78,20 @@ struct acl_matmul_t : public primitive_t {
-                 = utils::everyone_is(data_type::f16, src_md()->data_type,
-                         weights_md()->data_type, dst_md()->data_type)
-                 && platform::has_data_type_support(data_type::f16);
-+        const bool is_weights_md_format_ok
-+                = utils::one_of(weights_format_kind_received,
-+                        format_kind::any, format_kind::blocked);
-         bool ok = is_dense_data()
-                 && utils::one_of(true, is_fp32_ok, is_fp16_ok)
--                && !has_zero_dim_memory()
--                && weights_md_.format_kind == format_kind::any
-+                && !has_zero_dim_memory() && is_weights_md_format_ok
-                 && set_default_formats()
-                 && attr()->has_default_values(
-                         smask_t::oscale | smask_t::post_ops)
-                 && attr_oscale_ok() && !has_runtime_dims_or_strides();
-         if (!ok) return status::unimplemented;
- 
--        CHECK(acl_matmul_utils::init_conf_matmul(
--                amp_, src_md_, weights_md_, dst_md_, *desc(), *attr()));
-+        CHECK(acl_matmul_utils::init_conf_matmul(amp_, src_md_, weights_md_,
-+                dst_md_, *desc(), *attr(), weights_format_kind_received));
- 
-         arm_compute::ActivationLayerInfo act_info;
-         CHECK(post_ops.init(engine, attr_.post_ops_, dst_md_, act_info));
-diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-index a314d96384..027f915a8a 100644
---- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.cpp
-@@ -27,7 +27,8 @@ namespace acl_matmul_utils {
- 
- status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
-         memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md,
--        const primitive_attr_t &attr) {
-+        const primitive_attr_t &attr,
-+        format_kind_t weights_format_kind_received) {
- 
-     const memory_desc_wrapper src_d(&src_md);
-     const memory_desc_wrapper wei_d(&wei_md);
-@@ -128,9 +129,16 @@ status_t init_conf_matmul(acl_matmul_conf_t &amp, memory_desc_t &src_md,
-     for (dim_t i = K_dim - 1; i >= 0; --i)
-         batch_dims.push_back(i);
- 
-+    const memory_desc_t weights_md_received = wei_md;
-     acl_utils::reorder_to_weight_format(amp.wei_tensor_info, wei_md,
-             expected_weight_format, K_dim, N_dim, {}, batch_dims);
- 
-+    ACL_CHECK_SUPPORT((weights_format_kind_received == format_kind::blocked)
-+                    && !(dnnl_memory_desc_equal(&weights_md_received, &wei_md)),
-+            "specified blocked format not supported by ACL, use "
-+            "format_kind_t::any to find a supported blocked format for "
-+            "your platform");
-+
-     return status::success;
- }
- 
-diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
-index 67bb2e78eb..5ba4241abc 100644
---- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
-+++ b/src/cpu/aarch64/matmul/acl_matmul_utils.hpp
-@@ -52,7 +52,8 @@ namespace acl_matmul_utils {
- 
&src_md, - memory_desc_t &wei_md, memory_desc_t &dst_md, const matmul_desc_t &md, -- const primitive_attr_t &attr); -+ const primitive_attr_t &attr, -+ format_kind_t weights_format_kind_received); - - } // namespace acl_matmul_utils - --- -2.34.1 diff --git a/third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch b/third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch deleted file mode 100644 index 6d6f0c0ea..000000000 --- a/third_party/mkl_dnn/onednn_acl_bf16_capability_detection_for_ubuntu20.04.patch +++ /dev/null @@ -1,50 +0,0 @@ -From 9a9430c7db870b78c6402d786a67921af4a66334 Mon Sep 17 00:00:00 2001 -From: Kentaro Kawakami -Date: Fri, 26 May 2023 10:58:36 +0900 -Subject: [PATCH] cpu: aarch64: xbyak_aarch64: BF16 capability detection for - Ubuntu 20.04 - ---- - .../aarch64/xbyak_aarch64/src/util_impl_linux.h | 15 ++++++++++++--- - 1 file changed, 12 insertions(+), 3 deletions(-) - -diff --git a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h -index 743843bae50..3db37e972d1 100644 ---- a/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h -+++ b/src/cpu/aarch64/xbyak_aarch64/src/util_impl_linux.h -@@ -39,6 +39,13 @@ - #include - #endif - -+/* Linux kernel used in Ubuntu 20.04 does not have HWCAP2_BF16 definition. */ -+#ifdef AT_HWCAP2 -+#ifndef HWCAP2_BF16 -+#define HWCAP2_BF16 (1UL << 14) -+#endif -+#endif -+ - namespace Xbyak_aarch64 { - namespace util { - #define XBYAK_AARCH64_ERROR_ fprintf(stderr, "%s, %d, Error occurrs during read cache infomation.\n", __FILE__, __LINE__); -@@ -383,7 +390,7 @@ class CpuInfoLinux : public CpuInfo { - } - - void setHwCap() { -- unsigned long hwcap = getauxval(AT_HWCAP); -+ const unsigned long hwcap = getauxval(AT_HWCAP); - if (hwcap & HWCAP_ATOMICS) - type_ |= (Type)XBYAK_AARCH64_HWCAP_ATOMIC; - -@@ -391,8 +398,10 @@ class CpuInfoLinux : public CpuInfo { - type_ |= (Type)XBYAK_AARCH64_HWCAP_FP; - if (hwcap & HWCAP_ASIMD) - type_ |= (Type)XBYAK_AARCH64_HWCAP_ADVSIMD; --#ifdef HWCAP2_BF16 -- if (hwcap & HWCAP2_BF16) -+ -+#ifdef AT_HWCAP2 -+ const unsigned long hwcap2 = getauxval(AT_HWCAP2); -+ if (hwcap2 & HWCAP2_BF16) - type_ |= (Type)XBYAK_AARCH64_HWCAP_BF16; - #endif - diff --git a/third_party/mkl_dnn/onednn_acl_fix_segfault_during_postop_execute.patch b/third_party/mkl_dnn/onednn_acl_fix_segfault_during_postop_execute.patch deleted file mode 100644 index 39f7e7434..000000000 --- a/third_party/mkl_dnn/onednn_acl_fix_segfault_during_postop_execute.patch +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright 2024 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/
-
-diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/aarch64/acl_post_ops.cpp
-index ea4bb200ec..3eb53b81bd 100644
---- a/src/cpu/aarch64/acl_post_ops.cpp
-+++ b/src/cpu/aarch64/acl_post_ops.cpp
-@@ -24,7 +24,7 @@ namespace aarch64 {
- 
- status_t acl_post_ops_t::execute(const exec_ctx_t &ctx, void *src_orig) const {
- 
--    int post_op_index = 0;
-+    int post_op_index = post_op_start_index_;
- 
-     // As these are post ops, this src will also be our dst. If we have a sum
-     // post op, the src/dst will start off in a temporary, then change to
-diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/aarch64/acl_post_ops.hpp
-index 7b59ad71d3..ceaa95b73a 100644
---- a/src/cpu/aarch64/acl_post_ops.hpp
-+++ b/src/cpu/aarch64/acl_post_ops.hpp
-@@ -32,7 +32,9 @@ struct acl_post_ops_t {
-     // init the acl_post_ops_t. Note that this function modifies the passed in
-     // post ops by setting the preferred memory formats
-     status_t init(engine_t *engine, post_ops_t &post_ops,
--            const memory_desc_t &dst_md) {
-+            const memory_desc_t &dst_md, int post_op_start_index = 0) {
-+
-+        post_op_start_index_ = post_op_start_index;
- 
-         CHECK(post_ops.set_default_formats(&dst_md));
-         dst_data_type = dst_md.data_type;
-@@ -41,7 +43,7 @@ struct acl_post_ops_t {
-         sum_index = -1;
-         post_op_primitives = {};
- 
--        for (int i = 0; i < post_ops.len(); i++) {
-+        for (int i = post_op_start_index; i < post_ops.len(); i++) {
-             auto &po = post_ops.entry_[i];
- 
-             if (po.is_sum()) {
-@@ -135,7 +137,8 @@ struct acl_post_ops_t {
-     // formats
-     status_t init(engine_t *engine, post_ops_t &base_post_ops,
-             const memory_desc_t &dst_md,
--            arm_compute::ActivationLayerInfo &act_info_to_fuse) {
-+            arm_compute::ActivationLayerInfo &act_info_to_fuse,
-+            int post_op_start_index = 0) {
- 
-         CHECK(base_post_ops.set_default_formats(&dst_md));
-         dst_data_type = dst_md.data_type;
-@@ -149,18 +152,11 @@ struct acl_post_ops_t {
-                     "eltwise post op scale must be 1 (no scale)");
-             CHECK(acl_utils::convert_to_acl_act(first_po, act_info_to_fuse));
- 
--            // Copy all but the first, because it has been fused
--            post_ops_t post_ops;
--            for (int idx = 1; idx < base_post_ops.len(); ++idx) {
--                // Construct empty entry then copy, so that we can check for failure
--                post_ops.entry_.emplace_back();
--                post_ops.entry_.back().copy_from(base_post_ops.entry_[idx]);
--            }
--            return init(engine, post_ops, dst_md);
--
-+            // post_op_start_index + 1 to skip the fused eltwise
-+            return init(engine, base_post_ops, dst_md, post_op_start_index + 1);
-         } else {
-             // Nothing to fuse, just copy all post ops
--            return init(engine, base_post_ops, dst_md);
-+            return init(engine, base_post_ops, dst_md, post_op_start_index);
-         }
-     }
- 
-@@ -179,6 +175,9 @@ struct acl_post_ops_t {
- private:
-     // Index of the sum post op if there is one, < 0 means no sum
-     int sum_index = -1;
-+    // Index of the first post op this primitive executes. This is typically the
-+    // number of post ops which were fused.
-+    int post_op_start_index_ = 0;
-     data_type_t dst_data_type;
-    // Vector of primitives used to execute the post ops. They are constructed
-    // in init to be either acl_binary_t (for sum, add, sub, div, mul, min and
---
-2.34.1
diff --git a/third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch b/third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch
deleted file mode 100644
index 202902a18..000000000
--- a/third_party/mkl_dnn/onednn_acl_fp32_bf16_reorder.patch
+++ /dev/null
@@ -1,111 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
-index 4a43b24c5..1a5cfe590 100644
---- a/src/cpu/aarch64/cpu_isa_traits.hpp
-+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
-@@ -1,6 +1,7 @@
- /*******************************************************************************
- * Copyright 2018-2023 Intel Corporation
- * Copyright 2020-2023 FUJITSU LIMITED
-+* Copyright 2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -211,10 +212,10 @@ static inline bool mayiuse_atomic() {
-     return cpu().isAtomicSupported();
- }
- 
--inline bool isa_has_bf16(cpu_isa_t isa) {
--    return false;
-+static inline bool mayiuse_bf16() {
-+    using namespace Xbyak_aarch64::util;
-+    return cpu().isBf16Supported();
- }
--
- } // namespace
- 
- /* whatever is required to generate string literals... */
-diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp
-index 6bd259ec2..5541bb702 100644
---- a/src/cpu/aarch64/jit_uni_reorder.cpp
-+++ b/src/cpu/aarch64/jit_uni_reorder.cpp
-@@ -1,7 +1,7 @@
- /*******************************************************************************
- * Copyright 2018-2023 Intel Corporation
- * Copyright 2020-2023 FUJITSU LIMITED
--* Copyright 2022 Arm Ltd. and affiliates
-+* Copyright 2022-2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -163,11 +163,11 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
- 
-         bool ok = true && p.ndims > 0
-                 && utils::one_of(p.itype, f32, s32, data_type::s8, u8)
--                && utils::one_of(p.otype, f32, s32, data_type::s8, u8)
-+                && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8)
-                 && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */
-                && utils::one_of(p.beta, 0.f, 1.f) /* anything else? */
--                && simple_impl_desc_init(p, nullptr)
--                && prb_has_small_strides(p);
-+                && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p)
-+                && ((p.otype != bf16) || (p.itype == f32 && mayiuse_bf16()));
- 
-         return ok;
-     }
-@@ -648,6 +648,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-                     cvt_v_s32_u8(startIdx, regNum);
-                     if (idt == data_type::s8) cvt_v_s8_u8(startIdx, regNum);
-                     break;
-+                case bf16:
-+                    if (idt == f32) cvt_v_f32_bf16(startIdx, regNum);
-+                    break;
-                 default: assert(!"unreachable");
-             }
-         };
-@@ -1677,6 +1680,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator {
-         UNROLL_INST(fcvtzs, VReg4S, tmp, tmp);
-     }
- 
-+    void cvt_v_f32_bf16(const size_t startIdx, const size_t regNum) {
-+        UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i));
-+    }
-+
-     void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) {
-         cvt_z_b_s(startIdx, regNum);
-         UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp);
-diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
-index ba5499ba9..d4e21d316 100644
---- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
-@@ -1,5 +1,6 @@
- /*******************************************************************************
- * Copyright 2020-2022 Intel Corporation
-+* Copyright 2023 Arm Ltd. and affiliates
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
-@@ -34,6 +35,8 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
-     DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
-     DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
- 
-+    DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
-+
-     DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
-     DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw8i16o2i, fmt_order::keep))
-     DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8o16i2o, fmt_order::keep))
diff --git a/third_party/mkl_dnn/onednn_acl_indirect_conv.patch b/third_party/mkl_dnn/onednn_acl_indirect_conv.patch
deleted file mode 100644
index 217e66835..000000000
--- a/third_party/mkl_dnn/onednn_acl_indirect_conv.patch
+++ /dev/null
@@ -1,31 +0,0 @@
- *******************************************************************************
- Copyright 2024 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
-index f043fee4bc..0384cce757 100644
---- a/src/cpu/aarch64/acl_convolution_utils.cpp
-+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
-@@ -313,10 +313,6 @@ status_t init_conf_indirect_gemm(acl_conv_conf_t &acp, memory_desc_t &src_md,
- 
-     CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
- 
--    // Indirect is slower than gemm for low thread counts, except for fast math
--    if (dnnl_get_max_threads() < 28 && !acp.fast_math)
--        return status::unimplemented;
--
-     // If we do not need to pad input channels for fast math mode then it would
-     // be faster to run convolution with im2row instead of using indirect kernel
-     int block_by = arm_compute::block_by(acp.weights_info.weight_format());
diff --git a/third_party/mkl_dnn/onednn_acl_reorder.patch b/third_party/mkl_dnn/onednn_acl_reorder.patch
deleted file mode 100644
index 5da6756c7..000000000
--- a/third_party/mkl_dnn/onednn_acl_reorder.patch
+++ /dev/null
@@ -1,371 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp
-new file mode 100644
-index 000000000..061751b55
---- /dev/null
-+++ b/src/cpu/aarch64/acl_reorder.cpp
-@@ -0,0 +1,52 @@
-+/*******************************************************************************
-+* Copyright 2023 Arm Ltd. and affiliates
-+*
-+* Licensed under the Apache License, Version 2.0 (the "License");
-+* you may not use this file except in compliance with the License.
-+* You may obtain a copy of the License at
-+*
-+*     http://www.apache.org/licenses/LICENSE-2.0
-+*
-+* Unless required by applicable law or agreed to in writing, software
-+* distributed under the License is distributed on an "AS IS" BASIS,
-+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+* See the License for the specific language governing permissions and
-+* limitations under the License.
-+*******************************************************************************/
-+
-+#include "cpu/aarch64/acl_reorder.hpp"
-+
-+namespace dnnl {
-+namespace impl {
-+namespace cpu {
-+namespace aarch64 {
-+
-+status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
-+    // Lock here is needed because resource_mapper does not support
-+    // concurrent multithreaded access.
-+    std::lock_guard<std::mutex> _lock {this->mtx};
-+
-+    auto src = CTX_IN_MEM(const void *, DNNL_ARG_FROM);
-+    auto dst = CTX_OUT_MEM(void *, DNNL_ARG_TO);
-+
-+    // Retrieve primitive resource and configured Compute Library objects
-+    auto *acl_resource
-+            = ctx.get_resource_mapper()->get<acl_reorder_resource_t>(this);
-+
-+    acl_reorder_obj_t &acl_obj = acl_resource->get_acl_obj();
-+
-+    acl_obj.src_tensor.allocator()->import_memory(const_cast<void *>(src));
-+    acl_obj.dst_tensor.allocator()->import_memory(dst);
-+
-+    acl_obj.reorder.run();
-+
-+    acl_obj.src_tensor.allocator()->free();
-+    acl_obj.dst_tensor.allocator()->free();
-+
-+    return status::success;
-+}
-+
-+} // namespace aarch64
-+} // namespace cpu
-+} // namespace impl
-+} // namespace dnnl
-diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp
-new file mode 100644
-index 0000000000..edbc38914d
---- /dev/null
-+++ b/src/cpu/aarch64/acl_reorder.hpp
-@@ -0,0 +1,262 @@
-+/*******************************************************************************
-+* Copyright 2023 Arm Ltd. and affiliates
-+*
-+* Licensed under the Apache License, Version 2.0 (the "License");
-+* you may not use this file except in compliance with the License.
-+* You may obtain a copy of the License at
-+*
-+*     http://www.apache.org/licenses/LICENSE-2.0
-+*
-+* Unless required by applicable law or agreed to in writing, software
-+* distributed under the License is distributed on an "AS IS" BASIS,
-+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+* See the License for the specific language governing permissions and
-+* limitations under the License.
-+*******************************************************************************/
-+#ifndef CPU_AARCH64_ACL_REORDER_HPP
-+#define CPU_AARCH64_ACL_REORDER_HPP
-+
-+#include "cpu/aarch64/acl_utils.hpp"
-+#include "cpu/reorder/cpu_reorder_pd.hpp"
-+#include "arm_compute/core/Types.h"
-+#include "common/utils.hpp"
-+
-+namespace dnnl {
-+namespace impl {
-+namespace cpu {
-+namespace aarch64 {
-+
-+struct acl_reorder_obj_t {
-+    arm_compute::NEReorderLayer reorder;
-+    arm_compute::Tensor src_tensor;
-+    arm_compute::Tensor dst_tensor;
-+    arm_compute::WeightFormat src_wf;
-+    arm_compute::WeightFormat dst_wf;
-+};
-+
-+struct acl_reorder_conf_t {
-+    arm_compute::TensorInfo src_info;
-+    arm_compute::TensorInfo dst_info;
-+    arm_compute::WeightFormat src_wf;
-+    arm_compute::WeightFormat dst_wf;
-+};
-+
-+struct acl_reorder_resource_t : public resource_t {
-+    acl_reorder_resource_t() : acl_obj_(utils::make_unique<acl_reorder_obj_t>()) {}
-+
-+    status_t configure(const acl_reorder_conf_t &app) {
-+        if (!acl_obj_) return status::out_of_memory;
-+
-+        // Init Compute Library tensors based on info from descriptor
-+        acl_obj_->src_tensor.allocator()->init(app.src_info);
-+        acl_obj_->dst_tensor.allocator()->init(app.dst_info);
-+
-+        // clang-format off
-+        acl_obj_->reorder.configure(
-+                &acl_obj_->src_tensor,
-+                &acl_obj_->dst_tensor,
-+                app.src_wf,
-+                app.dst_wf
-+        );
-+        // clang-format on
-+
-+        return status::success;
-+    }
-+
-+    acl_reorder_obj_t &get_acl_obj() const { return *acl_obj_; }
-+    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_reorder_resource_t);
-+
-+private:
-+    std::unique_ptr<acl_reorder_obj_t> acl_obj_;
-+}; // acl_reorder_resource_t
-+
-+struct acl_reorder_fwd_t : public primitive_t {
-+    using primitive_t::primitive_t;
-+    struct pd_t : public cpu_reorder_pd_t {
-+
-+        using cpu_reorder_pd_t::cpu_reorder_pd_t;
-+
-+        DECLARE_COMMON_PD_T("acl", acl_reorder_fwd_t);
-+
-+        static status_t create(reorder_pd_t **reorder_pd, engine_t *engine,
-+                const primitive_attr_t *attr, engine_t *src_engine,
-+                const memory_desc_t *src_md, engine_t *dst_engine,
-+                const memory_desc_t *dst_md) {
-+
-+            using namespace acl_utils;
-+            // using skip_mask_t = dnnl_primitive_attr::skip_mask_t;
-+
-+            bool ok = src_md->data_type
-+                            == dst_md->data_type // ACL only supports matching src/dst data types
-+                    && utils::one_of(src_md->data_type,
-+                            data_type::f32) // Only supports f32 for now
-+                    && attr->has_default_values();
-+            if (!ok) return status::unimplemented;
-+
-+            int mask = -1;
-+            bool is_set = false;
-+            // CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set));
-+            const memory_desc_wrapper input_d(src_md);
-+            if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0)
-+                return status::unimplemented;
-+
-+            // Create and check primitive descriptor
-+            auto _pd = new pd_t(attr, src_engine->kind(), src_md,
-+                    dst_engine->kind(), dst_md);
-+            if (_pd == nullptr) return status::out_of_memory;
-+            if (_pd->init(engine, src_engine, dst_engine) != status::success) {
-+                delete _pd;
-+                return status::unimplemented;
-+            }
-+
-+            const memory_desc_wrapper src_d(*src_md);
-+            const memory_desc_wrapper dst_d(*dst_md);
-+
-+            const int ndims = src_d.ndims();
-+
-+            auto src_tag = memory_desc_matches_one_of_tag(
-+                    *src_md, format_tag::ba, format_tag::cdba);
-+            ACL_CHECK_SUPPORT(
-+                    utils::one_of(format_tag::undef, src_tag),
-+                    "");
-+
-+            arm_compute::TensorShape acl_tensor_shape_in;
-+            arm_compute::TensorShape acl_tensor_shape_out;
-+            // Need even amount of dims in dim 0 for ACL kernel (eg mulitple of 8 rows when blocking by 8)
-+            int dim_0_rounded_up;
-+
-+            // Switch for 2 or 4 dim tensors
-+            switch(ndims)
-+            {
-+                // Currently for Ab4a and Ab8a
-+                // No format_tag for these, have to deduce from stride
-+                case 2:
-+                {
-+                    if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
-+                        return status::unimplemented;
-+                    }
-+                    int dst_dim_1 = dst_md->dims[1];
-+                    int dst_dim_0_stride = dst_md->format_desc.blocking.strides[0];
-+                    int dst_dim_1_stride = dst_md->format_desc.blocking.strides[1];
-+                    // Interleave of 4 or 8 that stride for dim 1
-+                    if (dst_dim_1_stride != 4 && dst_dim_1_stride != 8){
-+                        return status::unimplemented;
-+                    }
-+                    // Check to ensure it's a blocking transpose
-+                    if (dst_dim_1 * dst_dim_1_stride != dst_dim_0_stride){
-+                        return status::unimplemented;
-+                    }
-+                    if(dst_dim_1_stride == 4){
-+                        // Set Dest WeightFormat
-+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
-+                        dim_0_rounded_up
-+                                = utils::rnd_up(src_md->dims[0], 4);
-+                    } else {
-+                        // Set Dest WeightFormat
-+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
-+                        dim_0_rounded_up
-+                                = utils::rnd_up(src_md->dims[0], 8);
-+                    }
-+                    acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[1], src_md->dims[0]);
-+                    acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[1], dim_0_rounded_up);
-+
-+                    break;
-+                }
-+                // Currently for Acdb4a and Acdb8a
-+                case 4:
-+                {
-+
-+                    auto dst_tag = memory_desc_matches_one_of_tag(
-+                            *dst_md, format_tag::Acdb4a, format_tag::Acdb8a);
-+                    ACL_CHECK_SUPPORT(
-+                            utils::one_of(format_tag::undef, dst_tag),
-+                            "");
-+                    if(dst_tag == format_tag::Acdb4a){
-+                        // Set Dest WeightFormat
-+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4;
-+                        dim_0_rounded_up
-+                                = utils::rnd_up(src_md->dims[0], 4);
-+                    }
-+                    else{
-+                        // Set Dest WeightFormat
-+                        _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo8;
-+                        dim_0_rounded_up
-+                                = utils::rnd_up(src_md->dims[0], 8);
-+                    }
-+                    // Currently only supporting AxBx1x1 cases
-+                    if(dst_md->dims[2] != 1 || dst_md->dims[3] != 1){
-+                        return status::unimplemented;
-+                    }
-+                    if(dst_md->dims[0] == 1 || dst_md->dims[1] == 1){
-+                        return status::unimplemented;
-+                    }
-+                    acl_tensor_shape_in = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], src_md->dims[0]);
-+                    acl_tensor_shape_out = arm_compute::TensorShape(src_md->dims[3], src_md->dims[2], src_md->dims[1], dim_0_rounded_up);
-+                    break;
-+                }
-+                default:
-+                    return status::unimplemented;
-+            }
-+
-+            // Choose the data layout
-+            // bool is_nspc = utils::one_of(src_tag, format_tag::nhwc);
-+            const auto acl_layout = arm_compute::DataLayout::NCHW;
-+
-+            // Set Source WeightFormat
-+            _pd->app_.src_wf = arm_compute::WeightFormat::OHWI;
-+
-+            // Create ACL tensor infos
-+            const data_type_t data_type = src_d.data_type();
-+            const arm_compute::DataType acl_data_t
-+                    = acl_utils::get_acl_data_t(data_type);
-+            _pd->app_.src_info = arm_compute::TensorInfo(
-+                    acl_tensor_shape_in, 1, acl_data_t, acl_layout);
-+            _pd->app_.dst_info = arm_compute::TensorInfo(
-+                    acl_tensor_shape_out, 1, acl_data_t, acl_layout);
-+
-+            // Init scratch memory, not used so 0 in this implementation
-+            _pd->init_scratchpad_md();
-+
-+            return safe_ptr_assign(*reorder_pd, _pd);
-+        } // create
-+
-+        friend dnnl::impl::impl_list_item_t;
-+        acl_reorder_conf_t app_;
-+
-+    }; // pd_t
-+
-+    acl_reorder_fwd_t(const pd_t *apd) : primitive_t(apd) {}
-+
-+    status_t create_resource(
-+            engine_t *engine, resource_mapper_t &mapper) const override {
-+        if (mapper.has_resource(this)) return status::success;
-+
-+        auto r = utils::make_unique<acl_reorder_resource_t>();
-+        if (!r) return status::out_of_memory;
-+
-+        // Configure the resource based on information from primitive descriptor
-+        CHECK(r->configure(pd()->app_));
-+
-+        mapper.add(this, std::move(r));
-+        return status::success;
-+    }
-+
-+    status_t execute(const exec_ctx_t &ctx) const override {
-+        return execute_forward(ctx);
-+    }
-+
-+private:
-+    // To guard the const execute_forward, the mutex must be 'mutable'
-+    mutable std::mutex mtx;
-+    status_t execute_forward(const exec_ctx_t &ctx) const;
-+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
-+
-+
-+}; // acl_reorder_fwd_t
-+
-+} // namespace aarch64
-+} // namespace cpu
-+} // namespace impl
-+} // namespace dnnl
-+
-+#endif // CPU_AARCH64_ACL_REORDER_HPP
-diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-index a4150b619..f4d6b4de3 100644
---- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
-@@ -16,6 +16,7 @@
-  *******************************************************************************/
- 
- #include "cpu/reorder/cpu_reorder.hpp"
-+#include "cpu/aarch64/acl_reorder.hpp"
- 
- namespace dnnl {
- namespace impl {
-@@ -28,6 +29,7 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-     // f32 -> f32
-     {{f32, f32, 0}, {
-         REG_FAST_DIRECT_COPY_F32_F32
-+        DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
- 
-         DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
-         DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
-@@ -69,6 +71,8 @@ const impl_list_map_t &regular_f32_f32_impl_list_map() {
-         nullptr,
-     }},
-     {{f32, f32, 4}, {
-+
-+        DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
-         CPU_REORDER_INSTANCE(rnn_weights_reorder_t)
- 
-         REG_FAST_DIRECT_COPY_F32_F32
diff --git a/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch b/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch
deleted file mode 100644
index 958330839..000000000
--- a/third_party/mkl_dnn/onednn_acl_thread_local_scheduler.patch
+++ /dev/null
@@ -1,97 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
-index fd2c76d01..bd7bed837 100644
---- a/src/cpu/aarch64/acl_thread.cpp
-+++ b/src/cpu/aarch64/acl_thread.cpp
-@@ -55,14 +55,17 @@ void acl_set_benchmark_scheduler_default() {
- #endif
- 
- #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
--void acl_set_tp_scheduler() {
--    static std::once_flag flag_once;
--    // Create threadpool scheduler
--    std::shared_ptr<IScheduler> threadpool_scheduler
--            = std::make_unique<ThreadpoolScheduler>();
-+void acl_set_tp_scheduler(int intra_threads = 0) {
-+    static thread_local std::once_flag flag_once;
-     // set CUSTOM scheduler in ACL
-     std::call_once(flag_once,
--            [&]() { arm_compute::Scheduler::set(threadpool_scheduler); });
-+            [&]() {
-+                // Create threadpool scheduler
-+                std::shared_ptr<IScheduler> threadpool_scheduler
-+                        = std::make_unique<ThreadpoolScheduler>();
-+                threadpool_scheduler->set_num_threads(intra_threads);
-+
-+                arm_compute::Scheduler::set(threadpool_scheduler); });
- }
- 
- void acl_set_threadpool_num_threads() {
-@@ -102,14 +105,6 @@ void set_acl_threading() {
-         acl_set_benchmark_scheduler_default();
-     }
- #endif
--#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
--    if (verbose_has_profile_externals()) {
--        acl_set_tp_benchmark_scheduler();
--    } else {
--        acl_set_tp_scheduler();
--    }
--
--#endif
- }
- 
- } // namespace acl_thread_utils
-diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/aarch64/acl_thread.hpp
-index f073376e6..654a2aa5d 100644
---- a/src/cpu/aarch64/acl_thread.hpp
-+++ b/src/cpu/aarch64/acl_thread.hpp
-@@ -40,7 +40,7 @@ void acl_set_benchmark_scheduler_default();
- 
- #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
- // Retrieve threadpool size during primitive execution and set ThreadpoolScheduler num_threads
--void acl_set_tp_scheduler();
-+void acl_set_tp_scheduler(int intra_threads);
- void acl_set_threadpool_num_threads();
- // Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler) for DNNL_VERBOSE=profile,profile_externals
- void acl_set_tp_benchmark_scheduler();
-diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-index 439ca862e..6656c37a5 100644
---- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-+++ b/src/cpu/aarch64/acl_threadpool_scheduler.cpp
-@@ -102,8 +102,6 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints,
- void ThreadpoolScheduler::run_workloads(
-         std::vector<arm_compute::IScheduler::Workload> &workloads) {
- 
--    arm_compute::lock_guard<std::mutex> lock(this->_run_workloads_mutex);
--
-     const unsigned int num_threads
-             = std::min(static_cast<unsigned int>(_num_threads),
-                     static_cast<unsigned int>(workloads.size()));
-diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
-index 0bfec3871..7207b2b60 100644
---- a/src/cpu/cpu_engine.cpp
-+++ b/src/cpu/cpu_engine.cpp
-@@ -47,6 +47,7 @@ status_t cpu_engine_t::create_stream(stream_t **stream, unsigned flags) {
- #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
- status_t cpu_engine_t::create_stream(stream_t **stream,
-         dnnl::threadpool_interop::threadpool_iface *threadpool) {
-+    dnnl::impl::cpu::aarch64::acl_thread_utils::acl_set_tp_scheduler(threadpool->get_num_threads());
-     return safe_ptr_assign(
-             *stream, new cpu_stream_t(this, threadpool));
- }
diff --git a/third_party/mkl_dnn/onednn_acl_threadcap.patch b/third_party/mkl_dnn/onednn_acl_threadcap.patch
deleted file mode 100644
index 3a33af153..000000000
--- a/third_party/mkl_dnn/onednn_acl_threadcap.patch
+++ /dev/null
@@ -1,43 +0,0 @@
- *******************************************************************************
- Copyright 2023 Arm Limited and affiliates.
- SPDX-License-Identifier: Apache-2.0
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- *******************************************************************************
-diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp
-index fd2c76d01..2d7c76d48 100644
---- a/src/cpu/aarch64/acl_thread.cpp
-+++ b/src/cpu/aarch64/acl_thread.cpp
-@@ -17,6 +17,8 @@
- #include "cpu/aarch64/acl_thread.hpp"
- #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
- #include "cpu/aarch64/acl_threadpool_scheduler.hpp"
-+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
-+#include <thread>
- #endif
- #include "cpu/aarch64/acl_benchmark_scheduler.hpp"
- 
-@@ -30,9 +32,10 @@ namespace acl_thread_utils {
- #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
- void acl_thread_bind() {
-     static std::once_flag flag_once;
--    // The threads in Compute Library are bound for the cores 0..max_threads-1
--    // dnnl_get_max_threads() returns OMP_NUM_THREADS
--    const int max_threads = dnnl_get_max_threads();
-+    // Cap the number of threads to 90% of the total core count
-+    // to ensure Compute Library doesn't use too much resource
-+    int capped_threads = (int)std::floor(0.9*std::thread::hardware_concurrency());
-+    const int max_threads = std::min(capped_threads, dnnl_get_max_threads());
-     // arm_compute::Scheduler does not support concurrent access thus a
-     // workaround here restricts it to only one call
-     std::call_once(flag_once, [&]() {