Skip to content

Commit 0076e71

Browse files
author
Alexandra Sidorova
authored
[Snippets][ARM] Fixed GemmCopyB execution after kernel recompilation (#31747)
### Details: - *On the master branch, the kernel executor for `GemmCopyB` requires `n_block_size`, which is equal to the `N` dimension from the `GemmCPU` subtensors. [`jit_gemm_copy_b_emitter`](https://github.com/openvinotoolkit/openvino/blob/5e3914a1d0cf29470451ebe676c14a68386a5e94/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp#L50) initializes this `n_block_size` and sets it in the kernel executor config. However, if there are new shapes with a new broadcast pattern (so we have to regenerate the whole kernel), the generator gets the cloned `LinearIR` after previous infer requests. It means that this cloned `LinearIR` might have been changed by previous infer requests. For example, subtensors of `GemmCPU` are changed by [`GemmKernelExecutor::update_config()`](https://github.com/openvinotoolkit/openvino/blob/dce9b25475826c967c8a2a121e864dc7b7cab155/src/plugins/intel_cpu/src/emitters/snippets/brgemm_generic.cpp#L174). This leads to incorrect `n_block_size` initialization in `jit_gemm_copy_b_emitter` during the next kernel regeneration. This PR removes this logic from `jit_gemm_copy_b_emitter` and sets a static const `n_block_size` in `GemmCopyBKernelExecutor`* - *Currently we use repacking from KleidiAI with iteration by `n_blk_size`, where `n_blk_size` actually doesn't depend on `N_blk` from `GemmCPU`. It can be any value* ### Tickets: - *N/A*
1 parent f666521 commit 0076e71

File tree

7 files changed

+31
-145
lines changed

7 files changed

+31
-145
lines changed

src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_gemm_copy_b_emitter.cpp

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@
2525
#include "openvino/core/type/element_type.hpp"
2626
#include "snippets/kernel_executor_table.hpp"
2727
#include "snippets/lowered/expression.hpp"
28-
#include "snippets/utils/utils.hpp"
2928
#include "transformations/snippets/aarch64/op/gemm_copy_b.hpp"
30-
#include "transformations/snippets/aarch64/op/gemm_utils.hpp"
3129

3230
namespace ov::intel_cpu::aarch64 {
3331

@@ -43,18 +41,7 @@ jit_gemm_copy_b_emitter::jit_gemm_copy_b_emitter(jit_generator* h,
4341
in_out_type_ = emitter_in_out_map::gpr_to_gpr;
4442
const auto gemm_repack = ov::as_type_ptr<GemmCopyB>(expr->get_node());
4543
OV_CPU_JIT_EMITTER_ASSERT(gemm_repack, "expects GemmCopyB node");
46-
const auto& child_gemms = ov::intel_cpu::aarch64::gemm_utils::repacking::get_gemm_exprs(expr);
47-
size_t n_blk_size = 0;
48-
for (const auto& child_gemm : child_gemms) {
49-
const auto& gemm_in1_subtensor = ov::snippets::utils::get_projected_subtensor(child_gemm->get_input_port(1));
50-
const auto& current_block = *gemm_in1_subtensor.rbegin();
51-
if (current_block != snippets::utils::get_dynamic_value<size_t>() && current_block > n_blk_size) {
52-
n_blk_size = current_block;
53-
}
54-
}
55-
OV_CPU_JIT_EMITTER_ASSERT(n_blk_size > 0, "n_blk_size of gemm_repack is expected to be greater than 0.");
56-
GemmCopyBKernelKaiConfig kernel_config(n_blk_size);
57-
m_kernel_executor = kernel_table->register_kernel<GemmCopyBKaiKernelExecutor>(expr, kernel_config);
44+
m_kernel_executor = kernel_table->register_kernel<GemmCopyBKaiKernelExecutor>(expr, GemmCopyBKernelKaiConfig());
5845

5946
// Initialize memory offsets similar to x64 brgemm_copy_b implementation
6047
m_memory_offsets = {gemm_repack->get_offset_in(), gemm_repack->get_offset_out()};

src/plugins/intel_cpu/src/emitters/snippets/aarch64/kernel_executors/gemm_copy_b.cpp

Lines changed: 12 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515

1616
#include "emitters/utils.hpp"
1717
#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon.h"
18-
#include "openvino/core/except.hpp"
1918
#include "snippets/kernel_executor_table.hpp"
2019
#include "snippets/lowered/expression.hpp"
2120
#include "snippets/lowered/linear_ir.hpp"
@@ -24,15 +23,8 @@
2423

2524
namespace ov::intel_cpu::aarch64 {
2625

27-
GemmCopyBKernelKaiConfig::GemmCopyBKernelKaiConfig(const size_t n_blk_size)
28-
: m_static_params(std::make_shared<StaticParams>(n_blk_size)) {
29-
OPENVINO_ASSERT(n_blk_size != 0, "n_blk_size can not be zero in GemmCopyBKernelKaiConfig.");
30-
m_hash = compute_hash();
31-
}
32-
3326
bool GemmCopyBKernelKaiConfig::operator==(const GemmCopyBKernelKaiConfig& rhs) const {
34-
return m_N == rhs.m_N && m_K == rhs.m_K && m_copy_b_wei_stride == rhs.m_copy_b_wei_stride && m_hash == rhs.m_hash &&
35-
(m_static_params == rhs.m_static_params || *m_static_params == *(rhs.m_static_params));
27+
return m_N == rhs.m_N && m_K == rhs.m_K && m_copy_b_wei_stride == rhs.m_copy_b_wei_stride && m_hash == rhs.m_hash;
3628
}
3729

3830
bool GemmCopyBKernelKaiConfig::is_completed() const {
@@ -47,7 +39,6 @@ bool GemmCopyBKernelKaiConfig::is_empty() const {
4739
# define PRINT(X) ss << #X << " = " << (X) << "\n"
4840
std::string GemmCopyBKernelKaiConfig::to_string() const {
4941
std::stringstream ss;
50-
ss << m_static_params->to_string() << "\n";
5142
PRINT(m_N);
5243
PRINT(m_K);
5344
PRINT(m_copy_b_wei_stride);
@@ -72,64 +63,33 @@ void GemmCopyBKernelKaiConfig::update(size_t N, size_t K, size_t stride) {
7263
}
7364

7465
size_t GemmCopyBKernelKaiConfig::compute_hash() const {
75-
size_t seed = m_static_params->hash;
66+
size_t seed = 0;
7667
seed = dnnl::impl::hash_combine(seed, m_N);
7768
seed = dnnl::impl::hash_combine(seed, m_K);
7869
seed = dnnl::impl::hash_combine(seed, m_copy_b_wei_stride);
7970
return seed;
8071
}
8172

82-
GemmCopyBKernelKaiConfig::StaticParams::StaticParams(size_t wei_n_blk)
83-
: wei_N_blk(wei_n_blk),
84-
hash(init_hash(wei_N_blk)) {}
85-
86-
bool GemmCopyBKernelKaiConfig::StaticParams::operator==(const StaticParams& rhs) const {
87-
return wei_N_blk == rhs.wei_N_blk && hash == rhs.hash;
88-
}
89-
90-
size_t GemmCopyBKernelKaiConfig::StaticParams::init_hash(size_t wei_n_blk) {
91-
size_t seed = 0;
92-
seed = dnnl::impl::hash_combine(seed, wei_n_blk);
93-
return seed;
94-
}
95-
96-
#ifdef SNIPPETS_DEBUG_CAPS
97-
# define PRINT(X) ss << #X << " = " << (X) << "\n"
98-
std::string GemmCopyBKernelKaiConfig::StaticParams::to_string() const {
99-
std::stringstream ss;
100-
PRINT(wei_N_blk);
101-
return ss.str();
102-
}
103-
# undef PRINT
104-
#endif
105-
10673
GemmCopyBKaiKernelExecutor::GemmCopyBKaiKernelExecutor(GemmCopyBKernelKaiConfig config)
10774
: snippets::KernelExecutor<GemmCopyBKernelKaiConfig, GemmCopyBCompiledKernel>(std::move(config)) {}
10875

109-
void GemmCopyBKaiKernelExecutor::update_kernel(const GemmCopyBKernelKaiConfig& config,
76+
void GemmCopyBKaiKernelExecutor::update_kernel([[maybe_unused]] const GemmCopyBKernelKaiConfig& config,
11077
std::shared_ptr<GemmCopyBCompiledKernel>& kernel) const {
11178
if (kernel == nullptr) {
11279
// GemmCopyBCompiledKernel is an universal kernel, which could be used in any config and shape.
113-
// 1. It's executed block by block with binary call and config passed as parameters.
114-
// 2. In each block, at most n_blk_size bias is needed. n_blk_size is a fixed value in gemm blocking pass.
115-
// if N block size changed in gemm blocking in future and based on shape, bias_buffer should be updated
11680
kernel = std::make_shared<GemmCopyBCompiledKernel>();
117-
const auto& n_blk_size = config.get_n_blk_size();
118-
kernel->bias_buffer->resize(n_blk_size * sizeof(float), 0);
81+
kernel->bias_buffer->resize(GemmCopyBKernelKaiConfig::get_N_blk() * sizeof(float), 0);
11982
}
12083
}
12184

12285
void GemmCopyBKaiKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr,
12386
[[maybe_unused]] const ov::snippets::lowered::LinearIRCPtr& linear_ir,
12487
GemmCopyBKernelKaiConfig& config) const {
12588
const auto& in0_shape = snippets::utils::get_planar_vdims(expr->get_input_port(0));
126-
int64_t N = *in0_shape.rbegin();
127-
int64_t K = *++in0_shape.rbegin();
128-
129-
// Calculate stride similar to how Gemm executor does it
130-
const auto stride = snippets::utils::get_dim_stride(expr->get_input_port(0));
131-
132-
config.update(N, K, stride * sizeof(float));
89+
const auto N = *in0_shape.rbegin();
90+
const auto K = *++in0_shape.rbegin();
91+
const auto copy_b_wei_stride = snippets::utils::get_dim_stride(expr->get_input_port(0)) * sizeof(float);
92+
config.update(N, K, copy_b_wei_stride);
13393
}
13494

13595
// regarding K*N(32*516),
@@ -141,10 +101,10 @@ void GemmCopyBKaiKernelExecutor::execute(const GemmCopyBKaiKernelExecutor* execu
141101
const auto& config = static_cast<const GemmCopyBKernelKaiConfig&>(executor->get_config());
142102
const auto& kernel = executor->get_kernel();
143103
const auto& ukernel = kernel->copy_b_ukernel;
144-
const auto K = config.get_K(); // K
145-
const auto N = config.get_N(); // N-rhs_stride
146-
const auto copy_b_wei_stride = config.get_copy_b_wei_stride(); // RHS stride in bytes
147-
const auto& n_blk_size = config.get_n_blk_size(); // n_blk
104+
const auto K = config.get_K(); // K
105+
const auto N = config.get_N(); // N-rhs_stride
106+
const auto copy_b_wei_stride = config.get_copy_b_wei_stride(); // RHS stride in bytes
107+
const auto& n_blk_size = GemmCopyBKernelKaiConfig::get_N_blk(); // n_blk
148108
const size_t nr = ukernel->get_nr();
149109
const size_t kr = ukernel->get_kr();
150110
const size_t sr = ukernel->get_sr();

src/plugins/intel_cpu/src/emitters/snippets/aarch64/kernel_executors/gemm_copy_b.hpp

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ namespace ov::intel_cpu::aarch64 {
1717
struct GemmCopyBKernelKaiConfig : public snippets::KernelExecutorBase::GenericConfig {
1818
public:
1919
GemmCopyBKernelKaiConfig() = default;
20-
GemmCopyBKernelKaiConfig(size_t n_blk_size);
2120

2221
bool operator==(const GemmCopyBKernelKaiConfig& rhs) const;
2322
bool operator!=(const GemmCopyBKernelKaiConfig& rhs) const {
@@ -50,33 +49,19 @@ struct GemmCopyBKernelKaiConfig : public snippets::KernelExecutorBase::GenericCo
5049
[[nodiscard]] size_t get_copy_b_wei_stride() const {
5150
return m_copy_b_wei_stride;
5251
}
53-
[[nodiscard]] size_t get_n_blk_size() const {
54-
return m_static_params->wei_N_blk;
52+
[[nodiscard]] static size_t get_N_blk() {
53+
return m_N_blk;
5554
}
5655

5756
private:
58-
struct StaticParams {
59-
StaticParams(size_t wei_n_blk);
60-
61-
const size_t wei_N_blk{0};
62-
const size_t hash{0};
63-
64-
bool operator==(const StaticParams& rhs) const;
65-
bool operator!=(const StaticParams& rhs) const {
66-
return !(*this == rhs);
67-
}
68-
69-
#ifdef SNIPPETS_DEBUG_CAPS
70-
[[nodiscard]] std::string to_string() const;
71-
#endif
72-
73-
private:
74-
static size_t init_hash(size_t wei_n_blk);
75-
};
76-
7757
[[nodiscard]] size_t compute_hash() const;
7858

79-
std::shared_ptr<StaticParams> m_static_params;
59+
// Just default value N_blk for:
60+
// - iterated repacking
61+
// - allocated nullified memory for Bias only once with small size
62+
// This value doesn't depend on blocking sizes of GemmCPU
63+
static constexpr size_t m_N_blk = 64;
64+
8065
size_t m_N = 0;
8166
size_t m_K = 0;
8267
size_t m_copy_b_wei_stride = 0;

src/plugins/intel_cpu/src/transformations/snippets/aarch64/op/gemm_utils.cpp

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include "gemm_utils.hpp"
66

77
#include <cstddef>
8-
#include <vector>
98

109
#include "openvino/core/except.hpp"
1110
#include "openvino/core/type.hpp"
@@ -36,29 +35,6 @@ ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered
3635
return nullptr;
3736
}
3837

39-
std::vector<snippets::lowered::ExpressionPtr> get_gemm_exprs(
40-
const ov::snippets::lowered::ExpressionPtr& gemm_copyb_expr) {
41-
OPENVINO_ASSERT(ov::is_type<GemmCopyB>(gemm_copyb_expr->get_node()),
42-
"get_gemm_exprs must be called only for GemmCopyB node");
43-
OPENVINO_ASSERT(gemm_copyb_expr->get_output_count() == 1, "gemm copyb expr must has one output");
44-
std::vector<snippets::lowered::ExpressionPtr> result;
45-
auto copyb_output_expr = gemm_copyb_expr->get_output_port_connector(0)->get_consumers().begin()->get_expr();
46-
if (ov::is_type<GemmCPU>(copyb_output_expr->get_node())) {
47-
result.push_back(copyb_output_expr);
48-
}
49-
if (ov::is_type<RepackedWeightsBufferExpression>(copyb_output_expr)) {
50-
OPENVINO_ASSERT(copyb_output_expr->get_output_count() == 1, "gemm copyb buffer expr must has one output");
51-
// repacked buffer could connect gemm expr in main loop and tail loop.
52-
const auto& consumers = copyb_output_expr->get_output_port_connector(0)->get_consumers();
53-
for (const auto& consumer : consumers) {
54-
if (ov::is_type<GemmCPU>(consumer.get_expr()->get_node())) {
55-
result.push_back(consumer.get_expr());
56-
}
57-
}
58-
}
59-
return result;
60-
}
61-
6238
size_t get_inner_n_block(const ov::element::Type& precision) {
6339
OPENVINO_ASSERT(precision == element::f32, "Only f32 is supported for snippets Matmul");
6440
return 8;

src/plugins/intel_cpu/src/transformations/snippets/aarch64/op/gemm_utils.hpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,6 @@ namespace ov::intel_cpu::aarch64::gemm_utils::repacking {
2020
*/
2121
snippets::lowered::ExpressionPtr get_copy_b_expr(const snippets::lowered::ExpressionPtr& gemm_expr);
2222

23-
/**
24-
* @brief Retrieves the expression pointers for the gemm expressions corresponding to the given gemm_copy_b
25-
* expression.
26-
* @param gemm_expr The expression pointer for the gemm_copy_b operation.
27-
* @return The expression pointers for the gemm operation.
28-
*/
29-
std::vector<snippets::lowered::ExpressionPtr> get_gemm_exprs(const snippets::lowered::ExpressionPtr& gemm_copyb_expr);
30-
3123
/**
3224
* @brief Get inner n block that is required by KleidiAI
3325
* @return Inner n block size

src/plugins/intel_cpu/src/transformations/snippets/aarch64/pass/lowered/expressions/gemm_copy_b_buffer_expressions.cpp

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
#include "snippets/shape_inference/shape_inference.hpp"
1919
#include "snippets/utils/utils.hpp"
2020
#include "transformations/snippets/aarch64/op/gemm_copy_b.hpp"
21-
#include "transformations/snippets/aarch64/op/gemm_cpu.hpp"
2221
#include "utils/general_utils.h"
2322

2423
using namespace ov::snippets::lowered;
@@ -52,34 +51,15 @@ void RepackedWeightsBufferExpression::init_allocation_size(
5251
const auto& in_shape = ov::snippets::utils::get_planar_vdims(parent_expr->get_input_port(0));
5352
OPENVINO_ASSERT(in_shape.size() >= 2 && allocation_rank >= 2, "GemmCopyB should has at least 2 rank tensor");
5453
const auto& element_type = get_node()->get_input_element_type(0);
55-
const size_t N = *in_shape.rbegin();
56-
const size_t K = *++in_shape.rbegin();
54+
const auto N = *in_shape.rbegin();
55+
const auto K = *++in_shape.rbegin();
5756

58-
const auto& consumers = get_output_port_connector(0)->get_consumers();
59-
ExpressionPtr child_gemm_expr = nullptr;
60-
// maybe connected to loopEnd besides gemm
61-
for (const auto& consumer : consumers) {
62-
if (ov::is_type<ov::intel_cpu::aarch64::GemmCPU>(consumer.get_expr()->get_node())) {
63-
child_gemm_expr = consumer.get_expr();
64-
break;
65-
}
66-
}
67-
OPENVINO_ASSERT(child_gemm_expr, "RepackedWeightsBufferExpression must connect to gemm");
68-
const auto& gemm_in_subtensor = ov::snippets::utils::get_projected_subtensor(child_gemm_expr->get_input_port(1));
69-
const size_t n_block_size = *gemm_in_subtensor.rbegin();
70-
if (snippets::utils::is_dynamic_value(N) || snippets::utils::is_dynamic_value(K) ||
71-
snippets::utils::is_dynamic_value(n_block_size)) {
57+
if (snippets::utils::is_dynamic_value(N) || snippets::utils::is_dynamic_value(K)) {
7258
m_allocation_size = snippets::utils::get_dynamic_value<size_t>();
7359
return;
7460
}
75-
size_t n_block_num = N / n_block_size;
76-
size_t n_tail_size = N % n_block_size;
77-
m_allocation_size = n_block_num * kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(n_block_size, K);
78-
if (n_tail_size > 0) {
79-
m_allocation_size += kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(n_tail_size, K);
80-
}
8161
// convert byte size to element type size
82-
m_allocation_size = m_allocation_size / element_type.size();
62+
m_allocation_size = kai_get_rhs_packed_size_rhs_pack_kxn_f32p8x1biasf32_f32_f32_neon(N, K) / element_type.size();
8363
}
8464

8565
} // namespace ov::intel_cpu::aarch64

src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/matmul.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ std::vector<std::vector<ov::test::InputShape>> input_shapes_bias{
135135
{PartialShape{-1, -1, -1, -1}, {{2, 2, 16, 32}, {2, 2, 16, 32}, {2, 2, 16, 32}, {2, 2, 16, 32}}},
136136
{PartialShape{-1, -1, -1, -1}, {{2, 2, 32, 18}, {2, 2, 32, 18}, {2, 2, 32, 1}, {2, 2, 32, 1}}},
137137
{PartialShape{-1, -1, -1, -1}, {{1, 1, 16, 18}, {1, 1, 16, 1}, {1, 1, 16, 18}, {1, 1, 16, 1}}}
138+
},
139+
// Recompilation of whole kernel due to new broadcasting masks
140+
{
141+
{PartialShape{-1, 2, -1, 64}, {{1, 2, 20, 64}, {1, 2, 20, 64}, {2, 2, 20, 64}, {1, 2, 20, 64}}},
142+
{PartialShape{-1, 2, 64, -1}, {{1, 2, 64, 1}, {2, 2, 64, 10}, {2, 2, 64, 10}, {6, 2, 64, 10}}},
143+
{PartialShape{-1, 2, -1, -1}, {{1, 2, 20, 10}, {1, 2, 20, 10}, {2, 2, 20, 10}, {6, 2, 20, 1}}}
138144
}
139145
};
140146

0 commit comments

Comments
 (0)