Skip to content

Commit 0076e71

Browse files
author
Alexandra Sidorova
authored
[Snippets][ARM] Fixed GemmCopyB execution after kernel recompilation (#31747)
### Details: - *On the master branch, the kernel executor for `GemmCopyB` requires `n_block_size`, which is equal to the `N` dimension from the `GemmCPU` subtensors. [`jit_gemm_copy_b_emitter`](https://github.com/openvinotoolkit/openvino/blob/5e3914a1d0cf29470451ebe676c14a68386a5e94/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp#L50) initializes this `n_block_size` and sets it in the kernel executor config. However, if there are new shapes with a new broadcast pattern (so we have to regenerate the whole kernel), the generator gets the cloned `LinearIR` after previous infer requests. It means that this cloned `LinearIR` might have been changed by previous infer requests. For example, subtensors of `GemmCPU` are changed by [`GemmKernelExecutor::update_config()`](https://github.com/openvinotoolkit/openvino/blob/dce9b25475826c967c8a2a121e864dc7b7cab155/src/plugins/intel_cpu/src/emitters/snippets/brgemm_generic.cpp#L174). This leads to incorrect `n_block_size` initialization in `jit_gemm_copy_b_emitter` during the next kernel regeneration. This PR removes this logic from `jit_gemm_copy_b_emitter` and sets a static const `n_block_size` in `GemmCopyBKernelExecutor`* - *Currently we use repacking from KleidiAI with iteration by `n_blk_size`, where `n_blk_size` actually doesn't depend on `N_blk` from `GemmCPU`. It can be any value* ### Tickets: - *N/A*
1 parent f666521 commit 0076e71

File tree

7 files changed

+31
-145
lines changed

7 files changed

+31
-145
lines changed

src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@
2525
#include "openvino/core/type/element_type.hpp"
2626
#include "snippets/kernel_executor_table.hpp"
2727
#include "snippets/lowered/expression.hpp"
28-
#include "snippets/utils/utils.hpp"
2928
#include "transformations/snippets/aarch64/op/gemm_copy_b.hpp"
30-
#include "transformations/snippets/aarch64/op/gemm_utils.hpp"
3129

3230
namespace ov::intel_cpu::aarch64 {
3331

@@ -43,18 +41,7 @@ jit_gemm_copy_b_emitter::jit_gemm_copy_b_emitter(jit_generator* h,
4341
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
4442
const auto gemm_repack = ov::as_type_ptr<GemmCopyB>(expr->get_node());
4543
OV_CPU_JIT_EMITTER_ASSERT(gemm_repack, "expects GemmCopyB node");
46-
const auto& child_gemms = ov::intel_cpu::aarch64::gemm_utils::repacking::get_gemm_exprs(expr);
47-
size_t n_blk_size = 0;
48-
for (const auto& child_gemm : child_gemms) {
49-
const auto& gemm_in1_subtensor = ov::snippets::utils::get_projected_subtensor(child_gemm->get_input_port(1));
50-
const auto& current_block = *gemm_in1_subtensor.rbegin();
51-
if (current_block != snippets::utils::get_dynamic_value<size_t>() && current_block > n_blk_size) {
52-
n_blk_size = current_block;
53-
}
54-
}
55-
OV_CPU_JIT_EMITTER_ASSERT(n_blk_size > 0, "n_blk_size of gemm_repack is expected to be greater than 0.");
56-
GemmCopyBKernelKaiConfig kernel_config(n_blk_size);
57-
m_kernel_executor = kernel_table->register_kernel<GemmCopyBKaiKernelExecutor>(expr, kernel_config);
44+
m_kernel_executor = kernel_table->register_kernel<GemmCopyBKaiKernelExecutor>(expr, GemmCopyBKernelKaiConfig());
5845

5946
// Initialize memory offsets similar to x64 brgemm_copy_b implementation
6047
m_memory_offsets = {gemm_repack->get_offset_in(), gemm_repack->get_offset_out()};

src/plugins/intel_cpu/src/emitters/snippets/aarch64/kernel_executors/gemm_copy_b.cpp

Lines changed: 12 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
#include "emitters/utils.hpp"
1717
#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h"
18-
#include "openvino/core/except.hpp"
1918
#include "snippets/kernel_executor_table.hpp"
2019
#include "snippets/lowered/expression.hpp"
2120
#include "snippets/lowered/linear_ir.hpp"
@@ -24,15 +23,8 @@
2423

2524
namespace ov::intel_cpu::aarch64 {
2625

27-
GemmCopyBKernelKaiConfig::GemmCopyBKernelKaiConfig(const size_t n_blk_size)
28-
: m_static_params(std::make_shared<StaticParams>(n_blk_size)) {
29-
OPENVINO_ASSERT(n_blk_size != 0, "n_blk_size can not be zero in GemmCopyBKernelKaiConfig.");
30-
m_hash = compute_hash();
31-
}
32-
3326
bool GemmCopyBKernelKaiConfig::operator==(const GemmCopyBKernelKaiConfig& rhs) const {
34-
return m_N == rhs.m_N && m_K == rhs.m_K && m_copy_b_wei_stride == rhs.m_copy_b_wei_stride && m_hash == rhs.m_hash &&
35-
(m_static_params == rhs.m_static_params || *m_static_params == *(rhs.m_static_params));
27+
return m_N == rhs.m_N && m_K == rhs.m_K && m_copy_b_wei_stride == rhs.m_copy_b_wei_stride && m_hash == rhs.m_hash;
3628
}
3729

3830
bool GemmCopyBKernelKaiConfig::is_completed() const {
@@ -47,7 +39,6 @@ bool GemmCopyBKernelKaiConfig::is_empty() const {
4739
# define PRINT(X) ss << #X << " = " << (X) << "\n"
4840
std::string GemmCopyBKernelKaiConfig::to_string() const {
4941
std::stringstream ss;
50-
ss << m_static_params->to_string() << "\n";
5142
PRINT(m_N);
5243
PRINT(m_K);
5344
PRINT(m_copy_b_wei_stride);
@@ -72,64 +63,33 @@ void GemmCopyBKernelKaiConfig::update(size_t N, size_t K, size_t stride) {
7263
}
7364

7465
size_t GemmCopyBKernelKaiConfig::compute_hash() const {
75-
size_t seed = m_static_params->hash;
66+
size_t seed = 0;
7667
seed = dnnl::impl::hash_combine(seed, m_N);
7768
seed = dnnl::impl::hash_combine(seed, m_K);
7869
seed = dnnl::impl::hash_combine(seed, m_copy_b_wei_stride);
7970
return seed;
8071
}
8172

82-
GemmCopyBKernelKaiConfig::StaticParams::StaticParams(size_t wei_n_blk)
83-
: wei_N_blk(wei_n_blk),
84-
hash(init_hash(wei_N_blk)) {}
85-
86-
bool GemmCopyBKernelKaiConfig::StaticParams::operator==(const StaticParams& rhs) const {
87-
return wei_N_blk == rhs.wei_N_blk && hash == rhs.hash;
88-
}
89-
90-
size_t GemmCopyBKernelKaiConfig::StaticParams::init_hash(size_t wei_n_blk) {
91-
size_t seed = 0;
92-
seed = dnnl::impl::hash_combine(seed, wei_n_blk);
93-
return seed;
94-
}
95-
96-
#ifdef SNIPPETS_DEBUG_CAPS
97-
# define PRINT(X) ss << #X << " = " << (X) << "\n"
98-
std::string GemmCopyBKernelKaiConfig::StaticParams::to_string() const {
99-
std::stringstream ss;
100-
PRINT(wei_N_blk);
101-
return ss.str();
102-
}
103-
# undef PRINT
104-
#endif
105-
10673
GemmCopyBKaiKernelExecutor::GemmCopyBKaiKernelExecutor(GemmCopyBKernelKaiConfig config)
10774
: snippets::KernelExecutor<GemmCopyBKernelKaiConfig, GemmCopyBCompiledKernel>(std::move(config)) {}
10875

109-
void GemmCopyBKaiKernelExecutor::update_kernel(const GemmCopyBKernelKaiConfig& config,
76+
void GemmCopyBKaiKernelExecutor::update_kernel([[maybe_unused]] const GemmCopyBKernelKaiConfig& config,
11077
std::shared_ptr<GemmCopyBCompiledKernel>& kernel) const {
11178
if (kernel == nullptr) {
11279
// GemmCopyBCompiledKernel is an universal kernel, which could be used in any config and shape.
113-
// 1. It's executed block by block with binary call and config passed as parameters.
114-
// 2. In each block, at most n_blk_size bias is needed. n_blk_size is a fixed value in gemm blocking pass.
115-
// if N block size changed in gemm blocking in future and based on shape, bias_buffer should be updated
11680
kernel = std::make_shared<GemmCopyBCompiledKernel>();
117-
const auto& n_blk_size = config.get_n_blk_size();
118-
kernel->bias_buffer->resize(n_blk_size * sizeof(float), 0);
81+
kernel->bias_buffer->resize(GemmCopyBKernelKaiConfig::get_N_blk() * sizeof(float), 0);
11982
}
12083
}
12184

12285
void GemmCopyBKaiKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr,
12386
[[maybe_unused]] const ov::snippets::lowered::LinearIRCPtr& linear_ir,
12487
GemmCopyBKernelKaiConfig& config) const {
12588
const auto& in0_shape = snippets::utils::get_planar_vdims(expr->get_input_port(0));
126-
int64_t N = *in0_shape.rbegin();
127-
int64_t K = *++in0_shape.rbegin();
128-
129-
// Calculate stride similar to how Gemm executor does it
130-
const auto stride = snippets::utils::get_dim_stride(expr->get_input_port(0));
131-
132-
config.update(N, K, stride * sizeof(float));
89+
const auto N = *in0_shape.rbegin();
90+
const auto K = *++in0_shape.rbegin();
91+
const auto copy_b_wei_stride = snippets::utils::get_dim_stride(expr->get_input_port(0)) * sizeof(float);
92+
config.update(N, K, copy_b_wei_stride);
13393
}
13494

13595
// regarding K*N(32*516),
@@ -141,10 +101,10 @@ void GemmCopyBKaiKernelExecutor::execute(const GemmCopyBKaiKernelExecutor* execu
141101
const auto& config = static_cast<const GemmCopyBKernelKaiConfig&>(executor->get_config());
142102
const auto& kernel = executor->get_kernel();
143103
const auto& ukernel = kernel->copy_b_ukernel;
144-
const auto K = config.get_K(); // K
145-
const auto N = config.get_N(); // N-rhs_stride
146-
const auto copy_b_wei_stride = config.get_copy_b_wei_stride(); // RHS stride in bytes
147-
const auto& n_blk_size = config.get_n_blk_size(); // n_blk
104+
const auto K = config.get_K(); // K
105+
const auto N = config.get_N(); // N-rhs_stride
106+
const auto copy_b_wei_stride = config.get_copy_b_wei_stride(); // RHS stride in bytes
107+
const auto& n_blk_size = GemmCopyBKernelKaiConfig::get_N_blk(); // n_blk
148108
const size_t nr = ukernel->get_nr();
149109
const size_t kr = ukernel->get_kr();
150110
const size_t sr = ukernel->get_sr();

src/plugins/intel_cpu/src/emitters/snippets/aarch64/kernel_executors/gemm_copy_b.hpp

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ namespace ov::intel_cpu::aarch64 {
1717
struct GemmCopyBKernelKaiConfig : public snippets::KernelExecutorBase::GenericConfig {
1818
public:
1919
GemmCopyBKernelKaiConfig() = default;
20-
GemmCopyBKernelKaiConfig(size_t n_blk_size);
2120

2221
bool operator==(const GemmCopyBKernelKaiConfig& rhs) const;
2322
bool operator!=(const GemmCopyBKernelKaiConfig& rhs) const {
@@ -50,33 +49,19 @@ struct GemmCopyBKernelKaiConfig : public snippets::KernelExecutorBase::GenericCo
5049
[[nodiscard]] size_t get_copy_b_wei_stride() const {
5150
return m_copy_b_wei_stride;
5251
}
53-
[[nodiscard]] size_t get_n_blk_size() const {
54-
return m_static_params->wei_N_blk;
52+
[[nodiscard]] static size_t get_N_blk() {
53+
return m_N_blk;
5554
}
5655

5756
private:
58-
struct StaticParams {
59-
StaticParams(size_t wei_n_blk);
60-
61-
const size_t wei_N_blk{0};
62-
const size_t hash{0};
63-
64-
bool operator==(const StaticParams& rhs) const;
65-
bool operator!=(const StaticParams& rhs) const {
66-
return !(*this == rhs);
67-
}
68-
69-
#ifdef SNIPPETS_DEBUG_CAPS
70-
[[nodiscard]] std::string to_string() const;
71-
#endif
72-
73-
private:
74-
static size_t init_hash(size_t wei_n_blk);
75-
};
76-
7757
[[nodiscard]] size_t compute_hash() const;
7858

79-
std::shared_ptr<StaticParams> m_static_params;
59+
// Just default value N_blk for:
60+
// - iterated repacking
61+
// - allocated nullified memory for Bias only once with small size
62+
// This value doesn't depend on blocking sizes of GemmCPU
63+
static constexpr size_t m_N_blk = 64;
64+
8065
size_t m_N = 0;
8166
size_t m_K = 0;
8267
size_t m_copy_b_wei_stride = 0;

src/plugins/intel_cpu/src/transformations/snippets/aarch64/op/gemm_utils.cpp

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "gemm_utils.hpp"
66

77
#include <cstddef>
8-
#include <vector>
98

109
#include "openvino/core/except.hpp"
1110
#include "openvino/core/type.hpp"
@@ -36,29 +35,6 @@ ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered
3635
return nullptr;
3736
}
3837

39-
std::vector<snippets::lowered::ExpressionPtr> get_gemm_exprs(
40-
const ov::snippets::lowered::ExpressionPtr& gemm_copyb_expr) {
41-
OPENVINO_ASSERT(ov::is_type<GemmCopyB>(gemm_copyb_expr->get_node()),
42-
"get_gemm_exprs must be called only for GemmCopyB node");
43-
OPENVINO_ASSERT(gemm_copyb_expr->get_output_count() == 1, "gemm copyb expr must has one output");
44-
std::vector<snippets::lowered::ExpressionPtr> result;
45-
auto copyb_output_expr = gemm_copyb_expr->get_output_port_connector(0)->get_consumers().begin()->get_expr();
46-
if (ov::is_type<GemmCPU>(copyb_output_expr->get_node())) {
47-
result.push_back(copyb_output_expr);
48-
}
49-
if (ov::is_type<RepackedWeightsBufferExpression>(copyb_output_expr)) {
50-
OPENVINO_ASSERT(copyb_output_expr->get_output_count() == 1, "gemm copyb buffer expr must has one output");
51-
// repacked buffer could connect gemm expr in main loop and tail loop.
52-
const auto& consumers = copyb_output_expr->get_output_port_connector(0)->get_consumers();
53-
for (const auto& consumer : consumers) {
54-
if (ov::is_type<GemmCPU>(consumer.get_expr()->get_node())) {
55-
result.push_back(consumer.get_expr());
56-
}
57-
}
58-
}
59-
return result;
60-
}
61-
6238
size_t get_inner_n_block(const ov::element::Type& precision) {
6339
OPENVINO_ASSERT(precision == element::f32, "Only f32 is supported for snippets Matmul");
6440
return 8;

src/plugins/intel_cpu/src/transformations/snippets/aarch64/op/gemm_utils.hpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,6 @@ namespace ov::intel_cpu::aarch64::gemm_utils::repacking {
2020
*/
2121
snippets::lowered::ExpressionPtr get_copy_b_expr(const snippets::lowered::ExpressionPtr& gemm_expr);
2222

23-
/**
24-
* @brief Retrieves the expression pointers for the gemm expressions corresponding to the given gemm_copy_b
25-
* expression.
26-
* @param gemm_expr The expression pointer for the gemm_copy_b operation.
27-
* @return The expression pointers for the gemm operation.
28-
*/
29-
std::vector<snippets::lowered::ExpressionPtr> get_gemm_exprs(const snippets::lowered::ExpressionPtr& gemm_copyb_expr);
30-
3123
/**
3224
* @brief Get inner n block that is required by KleidiAI
3325
* @return Inner n block size

src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/lowered/expressions/gemm_copy_b_buffer_expressions.cpp

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#include "snippets/shape_inference/shape_inference.hpp"
1919
#include "snippets/utils/utils.hpp"
2020
#include "transformations/snippets/aarch64/op/gemm_copy_b.hpp"
21-
#include "transformations/snippets/aarch64/op/gemm_cpu.hpp"
2221
#include "utils/general_utils.h"
2322

2423
using namespace ov::snippets::lowered;
@@ -52,34 +51,15 @@ void RepackedWeightsBufferExpression::init_allocation_size(
5251
const auto& in_shape = ov::snippets::utils::get_planar_vdims(parent_expr->get_input_port(0));
5352
OPENVINO_ASSERT(in_shape.size() >= 2 && allocation_rank >= 2, "GemmCopyB should has at least 2 rank tensor");
5453
const auto& element_type = get_node()->get_input_element_type(0);
55-
const size_t N = *in_shape.rbegin();
56-
const size_t K = *++in_shape.rbegin();
54+
const auto N = *in_shape.rbegin();
55+
const auto K = *++in_shape.rbegin();
5756

58-
const auto& consumers = get_output_port_connector(0)->get_consumers();
59-
ExpressionPtr child_gemm_expr = nullptr;
60-
// maybe connected to loopEnd besides gemm
61-
for (const auto& consumer : consumers) {
62-
if (ov::is_type<ov::intel_cpu::aarch64::GemmCPU>(consumer.get_expr()->get_node())) {
63-
child_gemm_expr = consumer.get_expr();
64-
break;
65-
}
66-
}
67-
OPENVINO_ASSERT(child_gemm_expr, "RepackedWeightsBufferExpression must connect to gemm");
68-
const auto& gemm_in_subtensor = ov::snippets::utils::get_projected_subtensor(child_gemm_expr->get_input_port(1));
69-
const size_t n_block_size = *gemm_in_subtensor.rbegin();
70-
if (snippets::utils::is_dynamic_value(N) || snippets::utils::is_dynamic_value(K) ||
71-
snippets::utils::is_dynamic_value(n_block_size)) {
57+
if (snippets::utils::is_dynamic_value(N) || snippets::utils::is_dynamic_value(K)) {
7258
m_allocation_size = snippets::utils::get_dynamic_value<size_t>();
7359
return;
7460
}
75-
size_t n_block_num = N / n_block_size;
76-
size_t n_tail_size = N % n_block_size;
77-
m_allocation_size = n_block_num * kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(n_block_size, K);
78-
if (n_tail_size > 0) {
79-
m_allocation_size += kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(n_tail_size, K);
80-
}
8161
// convert byte size to element type size
82-
m_allocation_size = m_allocation_size / element_type.size();
62+
m_allocation_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K) / element_type.size();
8363
}
8464

8565
} // namespace ov::intel_cpu::aarch64

src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ std::vector<std::vector<ov::test::InputShape>> input_shapes_bias{
135135
{PartialShape{-1, -1, -1, -1}, {{2, 2, 16, 32}, {2, 2, 16, 32}, {2, 2, 16, 32}, {2, 2, 16, 32}}},
136136
{PartialShape{-1, -1, -1, -1}, {{2, 2, 32, 18}, {2, 2, 32, 18}, {2, 2, 32, 1}, {2, 2, 32, 1}}},
137137
{PartialShape{-1, -1, -1, -1}, {{1, 1, 16, 18}, {1, 1, 16, 1}, {1, 1, 16, 18}, {1, 1, 16, 1}}}
138+
},
139+
// Recompilation of whole kernel due to new broadcasting masks
140+
{
141+
{PartialShape{-1, 2, -1, 64}, {{1, 2, 20, 64}, {1, 2, 20, 64}, {2, 2, 20, 64}, {1, 2, 20, 64}}},
142+
{PartialShape{-1, 2, 64, -1}, {{1, 2, 64, 1}, {2, 2, 64, 10}, {2, 2, 64, 10}, {6, 2, 64, 10}}},
143+
{PartialShape{-1, 2, -1, -1}, {{1, 2, 20, 10}, {1, 2, 20, 10}, {2, 2, 20, 10}, {6, 2, 20, 1}}}
138144
}
139145
};
140146

0 commit comments

Comments
 (0)