diff --git a/.github/workflows/linux_riscv_xuantie_dev_cpu.yml b/.github/workflows/linux_riscv_xuantie_dev_cpu.yml index 379bfbd41aef33..440bb04c1a7641 100644 --- a/.github/workflows/linux_riscv_xuantie_dev_cpu.yml +++ b/.github/workflows/linux_riscv_xuantie_dev_cpu.yml @@ -16,6 +16,7 @@ on: - 'src/plugins/intel_cpu/src/emitters/plugin/aarch64/**' - 'src/plugins/intel_cpu/src/emitters/plugin/riscv64/**' - 'src/plugins/intel_cpu/src/emitters/plugin/x64/**' + - 'src/plugins/intel_cpu/src/emitters/snippets/riscv64/**' - 'src/plugins/intel_cpu/src/nodes/executors/aarch64/**' - 'src/plugins/intel_cpu/src/nodes/executors/shl/**' - 'src/plugins/intel_cpu/src/nodes/kernels/riscv64/**' @@ -234,7 +235,7 @@ jobs: env: INSTALL_DIR: ${{ github.workspace }}/install INSTALL_TEST_DIR: ${{ github.workspace }}/install/tests - GTEST_FILTER: ${{ inputs.testFilterType == 'SMOKE' && '*smoke*' || '*ActivationLayer*:*EltwiseLayer*:*LogicalLayer*:*ComparisonLayer*:*SelectLayer*:*MatMulLayerCPUTest*:*ExtremumLayerCPUTest*' }} + GTEST_FILTER: ${{ inputs.testFilterType == 'SMOKE' && '*smoke*' || '*ActivationLayer*:*EltwiseLayer*:*LogicalLayer*:*ComparisonLayer*:*SelectLayer*:*MatMulLayerCPUTest*:*ExtremumLayerCPUTest*:smoke_Snippets*' }} steps: - name: Download OpenVINO artifacts (package) diff --git a/src/plugins/intel_cpu/CMakeLists.txt b/src/plugins/intel_cpu/CMakeLists.txt index f0bcf4b7287285..cb40913eef2f34 100644 --- a/src/plugins/intel_cpu/CMakeLists.txt +++ b/src/plugins/intel_cpu/CMakeLists.txt @@ -212,10 +212,13 @@ if(NOT (AARCH64 OR ARM)) ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/tpp/aarch64/* ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/aarch64/* ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/tpp/aarch64/* - ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/* ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/aarch64/*) endif() +if(NOT (AARCH64 OR ARM)) + list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/aarch64/*) +endif() + if(NOT AARCH64) list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/transformations/snippets/aarch64/* ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/snippets/aarch64/*) @@ -223,7 +226,9 @@ endif() if (NOT RISCV64) list(APPEND EXCLUDE_PATHS ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/plugin/riscv64/* - ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/riscv64/*) + ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/kernels/riscv64/* + ${CMAKE_CURRENT_SOURCE_DIR}/src/emitters/snippets/riscv64/* + ${CMAKE_CURRENT_SOURCE_DIR}/src/nodes/executors/riscv64/*) endif() if (NOT ENABLE_MLAS_FOR_CPU) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp index 31ab865ddff5e9..baf68b9bd45d90 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -213,23 +212,6 @@ class jit_snippet : public dnnl::impl::cpu::aarch64::jit_generator { namespace intel_cpu::aarch64 { -CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) - : h_compiled(std::move(h)) { - OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); -} - -const uint8_t* CompiledSnippetCPU::get_code() const { - return h_compiled->jit_ker(); -} - -size_t CompiledSnippetCPU::get_code_size() const { - return h_compiled->getSize(); -} - -bool CompiledSnippetCPU::empty() const { - return 
get_code_size() == 0; -} - CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp index 322921f545cf98..a74f36ff9491a1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/cpu_generator.hpp @@ -12,6 +12,7 @@ #include "cache/multi_cache.h" #include "cpu/aarch64/jit_generator.hpp" +#include "emitters/snippets/common/compiled_snippet_cpu.hpp" #include "openvino/core/node.hpp" #include "openvino/core/node_output.hpp" #include "snippets/emitter.hpp" @@ -20,16 +21,7 @@ namespace ov::intel_cpu::aarch64 { -class CompiledSnippetCPU : public snippets::CompiledSnippet { -public: - explicit CompiledSnippetCPU(std::unique_ptr h); - [[nodiscard]] const uint8_t* get_code() const override; - [[nodiscard]] size_t get_code_size() const override; - [[nodiscard]] bool empty() const override; - -private: - const std::unique_ptr h_compiled; -}; +using CompiledSnippetCPU = ov::intel_cpu::CompiledSnippetCPUCommon; class CPUTargetMachine : public snippets::TargetMachine { public: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp index c199a1942f27f1..2a30df3af14e4b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.cpp @@ -18,11 +18,11 @@ #include "emitters/plugin/aarch64/jit_emitter.hpp" #include "emitters/plugin/aarch64/jit_load_store_emitters.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" #include "emitters/utils.hpp" #include "openvino/core/type.hpp" #include "openvino/core/type/element_type.hpp" #include "snippets/lowered/expression.hpp" -#include "snippets/lowered/expressions/buffer_expression.hpp" #include "snippets/lowered/loop_manager.hpp" #include "snippets/op/broadcastload.hpp" #include "snippets/op/load.hpp" @@ -55,13 +55,13 @@ jit_memory_emitter::jit_memory_emitter(jit_generator* h, OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_input_port(0), "must be input port - memory access"); count = memory_access->get_input_count(); compiled_byte_offset = memory_access->get_input_offset(); - buffer_cluster_id = get_parent_buffer_cluster_id(expr); + buffer_cluster_id = ov::intel_cpu::utils::get_parent_buffer_cluster_id(expr); } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), "must be output port - memory access"); count = memory_access->get_output_count(); compiled_byte_offset = memory_access->get_output_offset(); - buffer_cluster_id = get_consumer_buffer_cluster_id(expr); + buffer_cluster_id = ov::intel_cpu::utils::get_consumer_buffer_cluster_id(expr); } else { OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); } @@ -75,26 +75,6 @@ jit_memory_emitter::jit_memory_emitter(jit_generator* h, } } -size_t jit_memory_emitter::get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { - OV_CPU_JIT_EMITTER_ASSERT(expr->get_input_count() == 1, "MemoryAccess must have one parent"); - const auto& parent_expr = expr->get_input_expr_ptr(0); - if (const 
auto buffer = ov::as_type_ptr(parent_expr)) { - return buffer->get_cluster_id(); - } - return SIZE_MAX; -} - -size_t jit_memory_emitter::get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { - OV_CPU_JIT_EMITTER_ASSERT(expr->get_output_count() == 1, "MemoryAccess must have one output"); - const auto& consumers = expr->get_output_port_connector(0)->get_consumers(); - for (const auto& consumer : consumers) { - if (const auto buffer = ov::as_type_ptr(consumer.get_expr())) { - return buffer->get_cluster_id(); - } - } - return SIZE_MAX; -} - jit_load_memory_emitter::jit_load_memory_emitter(jit_generator* h, cpu_isa_t isa, const ExpressionPtr& expr) : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { bool is_supported_precision = diff --git a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp index 14baec3f9df473..f0e98433265d23 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/aarch64/jit_memory_emitters.hpp @@ -34,9 +34,6 @@ class jit_memory_emitter : public jit_emitter { std::vector get_available_aux_gprs() const; protected: - static size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); - static size_t get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); - ov::element::Type src_prc; ov::element::Type dst_prc; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/common/compiled_snippet_cpu.hpp b/src/plugins/intel_cpu/src/emitters/snippets/common/compiled_snippet_cpu.hpp new file mode 100644 index 00000000000000..a708c6f7876395 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/common/compiled_snippet_cpu.hpp @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "openvino/core/except.hpp" +#include "snippets/target_machine.hpp" + +namespace ov::intel_cpu { + +// A small helper that wraps a platform-specific JIT generator and exposes +// a uniform CompiledSnippet interface. This reduces duplication across +// x64, aarch64 and riscv64 backends. 
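
As a rough, self-contained illustration of the pattern introduced below (a toy `ToyGenerator` type stands in for the real dnnl/xbyak backend generators; the actual class derives from `ov::snippets::CompiledSnippet`):

```cpp
#include <cstdint>
#include <cstdio>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>

// Toy stand-in for a backend JIT generator: only jit_ker() and getSize() matter here.
struct ToyGenerator {
    std::vector<uint8_t> code{0x13, 0x00, 0x00, 0x00};  // some "generated" bytes
    const uint8_t* jit_ker() const { return code.data(); }
    size_t getSize() const { return code.size(); }
};

// Same shape as the shared helper: take ownership of a compiled generator
// and expose read-only access to the generated code.
template <typename Generator>
class CompiledSnippetSketch {
public:
    explicit CompiledSnippetSketch(std::unique_ptr<Generator> h) : h_compiled(std::move(h)) {
        if (!h_compiled || !h_compiled->jit_ker())
            throw std::runtime_error("invalid generator or kernel was not compiled");
    }
    const uint8_t* get_code() const { return h_compiled->jit_ker(); }
    size_t get_code_size() const { return h_compiled->getSize(); }
    bool empty() const { return get_code_size() == 0; }

private:
    const std::unique_ptr<Generator> h_compiled;
};

int main() {
    CompiledSnippetSketch<ToyGenerator> snippet(std::make_unique<ToyGenerator>());
    std::printf("code size: %zu, empty: %d\n", snippet.get_code_size(), snippet.empty());
}
```

Each backend then only needs a short `using CompiledSnippetCPU = CompiledSnippetCPUCommon<...>;` alias, as the aarch64 and riscv64 headers in this patch do.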
+template +class CompiledSnippetCPUCommon : public ov::snippets::CompiledSnippet { +public: + explicit CompiledSnippetCPUCommon(std::unique_ptr h) : h_compiled(std::move(h)) { + OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was not compiled"); + } + + [[nodiscard]] const uint8_t* get_code() const override { + return h_compiled->jit_ker(); + } + [[nodiscard]] size_t get_code_size() const override { + return h_compiled->getSize(); + } + [[nodiscard]] bool empty() const override { + return get_code_size() == 0; + } + +private: + const std::unique_ptr h_compiled; +}; + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/common/jit_loop_args_helper.hpp b/src/plugins/intel_cpu/src/emitters/snippets/common/jit_loop_args_helper.hpp new file mode 100644 index 00000000000000..2507a5ff14872d --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/common/jit_loop_args_helper.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/utils/utils.hpp" + +namespace ov::intel_cpu::snippets_common { + +inline jit_snippets_call_args::loop_args_t compose_loop_args( + const std::shared_ptr& loop_end) { + const auto& ptr_increments = loop_end->get_ptr_increments(); + const auto& fin_offsets = loop_end->get_finalization_offsets(); + const auto& is_incremented = loop_end->get_is_incremented(); + const auto wa_increment = loop_end->get_increment(); + + const auto int_work_amount = ov::snippets::utils::is_dynamic_value(loop_end->get_work_amount()) + ? ov::snippets::utils::get_dynamic_value() + : static_cast(loop_end->get_work_amount()); + auto loop_args = jit_snippets_call_args::loop_args_t(int_work_amount, ptr_increments, fin_offsets); + + const auto& data_sizes = loop_end->get_element_type_sizes(); + for (int64_t i = 0; i < loop_args.m_num_data_ptrs; ++i) { + if (!is_incremented[i]) { + loop_args.m_ptr_increments[i] = 0; + loop_args.m_finalization_offsets[i] = 0; + continue; + } + + if (!ov::snippets::utils::is_dynamic_value(loop_args.m_ptr_increments[i])) { + loop_args.m_ptr_increments[i] *= (wa_increment * data_sizes[i]); + } + if (!ov::snippets::utils::is_dynamic_value(loop_args.m_finalization_offsets[i])) { + loop_args.m_finalization_offsets[i] *= data_sizes[i]; + } + } + + return loop_args; +} + +} // namespace ov::intel_cpu::snippets_common diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/cpu_generator.cpp new file mode 100644 index 00000000000000..bff6060c77de84 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/cpu_generator.cpp @@ -0,0 +1,194 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "cpu_generator.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "cache/multi_cache.h" +#include "emitters/plugin/riscv64/jit_eltwise_emitters.hpp" +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "jit_kernel_emitter.hpp" +#include "jit_loop_emitters.hpp" +#include "jit_memory_emitters.hpp" +#include "jit_snippets_emitters.hpp" +#include "openvino/core/except.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "openvino/core/type/element_type.hpp" +#include 
"openvino/op/add.hpp" +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" +#include "snippets/emitter.hpp" +#include "snippets/generator.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/op/broadcastload.hpp" +#include "snippets/op/kernel.hpp" +#include "snippets/op/load.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/scalar.hpp" +#include "snippets/op/store.hpp" +#include "snippets/target_machine.hpp" +#include "utils/general_utils.h" +#include "xbyak_riscv/xbyak_riscv.hpp" + +namespace ov { + +#define CREATE_SNIPPETS_EMITTER(e_type, ...) \ + {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr, ##__VA_ARGS__); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + }} + +#define CREATE_CPU_EMITTER(e_type) \ + {[this](const snippets::lowered::ExpressionPtr& expr) -> std::shared_ptr { \ + return std::make_shared(h.get(), isa, expr->get_node()); \ + }, \ + [](const std::shared_ptr& n) -> std::set> { \ + return e_type::get_supported_precisions(n); \ + }} + +class jit_snippet : public ov::intel_cpu::riscv64::jit_generator_t { +public: + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_snippet) + + ~jit_snippet() override = default; + + jit_snippet() = default; + + void generate() override {} +}; + +namespace intel_cpu::riscv64 { + +CPUTargetMachine::CPUTargetMachine(ov::intel_cpu::riscv64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) + : TargetMachine(std::make_shared(cache)), + h(new jit_snippet()), + isa(host_isa), + compiled_kernel_cache(std::move(cache)) { + // data movement + jitters[op::v0::Parameter::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[op::v0::Result::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_nop_emitter); + jitters[snippets::op::Scalar::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_scalar_emitter); + + // memory access + jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_load_memory_emitter); + jitters[snippets::op::LoadReorder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_load_memory_emitter); + jitters[snippets::op::BroadcastLoad::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_load_broadcast_emitter); + jitters[snippets::op::Store::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_store_memory_emitter); + + // loop control + jitters[snippets::op::LoopBegin::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_loop_begin_emitter); + jitters[snippets::op::LoopEnd::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_loop_end_emitter); + + // service kernel entry points + jitters[snippets::op::KernelStatic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_static_emitter); + jitters[snippets::op::KernelDynamic::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(jit_kernel_dynamic_emitter); + + // binary operations + jitters[op::v1::Add::get_type_info_static()] = CREATE_CPU_EMITTER(ov::intel_cpu::riscv64::jit_add_emitter); +} + +std::shared_ptr CPUTargetMachine::clone() const { + return std::make_shared(isa, compiled_kernel_cache); +} + +bool CPUTargetMachine::is_supported() const { + return ov::intel_cpu::riscv64::mayiuse(ov::intel_cpu::riscv64::gv); +} + +snippets::CompiledSnippetPtr CPUTargetMachine::get_snippet() { + OPENVINO_ASSERT(h->create_kernel(), "Failed to create jit_kernel in get_snippet()"); + + const auto& result = + std::make_shared(std::unique_ptr(h.release())); + // Note that 
we reset all the generated code, since it was copied into CompiledSnippetCPU + h = std::make_unique(); + return result; +} + +size_t CPUTargetMachine::get_lanes() const { + switch (isa) { + case ov::intel_cpu::riscv64::gv: + // RISC-V Vector Extension lanes depend on VLEN, assume 128-bit VLEN with 32-bit elements + return 4; // 128-bit / 32-bit = 4 lanes for float32 + default: + OPENVINO_THROW("unknown isa ", isa); + } +} + +std::vector CPUTargetMachine::get_abi_arg_regs() const { + // RISC-V ABI argument registers: a0-a7 (x10-x17) + std::vector res; + for (size_t i = 0; i < 8; ++i) { + res.emplace_back(snippets::RegType::gpr, 10 + i); // a0-a7 are x10-x17 + } + return res; +} + +std::vector CPUTargetMachine::get_gp_reg_pool() const { + using Xbyak_riscv::Reg; + const auto num_gp_regs = 32; + std::vector reg_pool; + for (size_t i = 1; i < num_gp_regs; i++) { + // Reserve: x0 (zero), x1 (ra), x2 (sp), x3 (gp), x4 (tp), x8 (s0/fp) + if (none_of(static_cast(i), + Xbyak_riscv::ra.getIdx(), + Xbyak_riscv::sp.getIdx(), + Xbyak_riscv::gp.getIdx(), + Xbyak_riscv::tp.getIdx(), + Xbyak_riscv::s0.getIdx())) { + reg_pool.emplace_back(snippets::RegType::gpr, i); + } + } + return reg_pool; +} + +std::vector CPUTargetMachine::get_vec_reg_pool() const { + const auto num_vec_regs = 32; // RISC-V has 32 vector registers v0-v31 + std::vector reg_pool; + reg_pool.reserve(num_vec_regs); + // v0 is typically reserved for masks, so exclude it + for (int i = 1; i < num_vec_regs; i++) { + reg_pool.emplace_back(snippets::RegType::vec, static_cast(i)); + } + return reg_pool; +} + +ov::intel_cpu::riscv64::cpu_isa_t CPUTargetMachine::get_isa() const { + return isa; +} + +CPUGenerator::CPUGenerator(ov::intel_cpu::riscv64::cpu_isa_t isa_, ov::intel_cpu::MultiCacheWeakPtr cache) + : Generator(std::make_shared(isa_, std::move(cache))) {} +CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) {} + +std::shared_ptr CPUGenerator::clone() const { + const auto& cpu_target_machine = std::dynamic_pointer_cast(target); + OPENVINO_ASSERT(cpu_target_machine, + "Failed to clone CPUGenerator: the instance contains incompatible TargetMachine type"); + return std::make_shared(cpu_target_machine); +} + +ov::snippets::RegType CPUGenerator::get_specific_op_out_reg_type( + [[maybe_unused]] const ov::Output& out) const { + return ov::snippets::RegType::undefined; +} + +bool CPUGenerator::uses_precompiled_kernel([[maybe_unused]] const std::shared_ptr& e) const { + // RISC-V platform doesn't currently use precompiled kernels + return false; +} + +} // namespace intel_cpu::riscv64 + +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/cpu_generator.hpp new file mode 100644 index 00000000000000..e880825efb983e --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/cpu_generator.hpp @@ -0,0 +1,57 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#include "cache/multi_cache.h" +#include "emitters/snippets/common/compiled_snippet_cpu.hpp" +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "openvino/core/node.hpp" +#include "openvino/core/node_output.hpp" +#include "snippets/emitter.hpp" +#include "snippets/generator.hpp" +#include "snippets/target_machine.hpp" + +namespace ov::intel_cpu::riscv64 { + +using CompiledSnippetCPU = ov::intel_cpu::CompiledSnippetCPUCommon; + +class 
CPUTargetMachine : public snippets::TargetMachine { +public: + explicit CPUTargetMachine(ov::intel_cpu::riscv64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache); + [[nodiscard]] std::shared_ptr clone() const override; + [[nodiscard]] bool is_supported() const override; + snippets::CompiledSnippetPtr get_snippet() override; + [[nodiscard]] size_t get_lanes() const override; + + [[nodiscard]] std::vector get_abi_arg_regs() const override; + [[nodiscard]] std::vector get_gp_reg_pool() const override; + [[nodiscard]] std::vector get_vec_reg_pool() const override; + + [[nodiscard]] ov::intel_cpu::riscv64::cpu_isa_t get_isa() const; + +private: + std::unique_ptr h; + ov::intel_cpu::riscv64::cpu_isa_t isa; + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; +}; + +class CPUGenerator : public snippets::Generator { +public: + CPUGenerator(ov::intel_cpu::riscv64::cpu_isa_t isa, ov::intel_cpu::MultiCacheWeakPtr cache); + CPUGenerator(const std::shared_ptr& target); + std::shared_ptr clone() const override; + +protected: + bool uses_precompiled_kernel(const std::shared_ptr& emitter) const override; + ov::snippets::RegType get_specific_op_out_reg_type(const ov::Output& out) const override; +}; + +} // namespace ov::intel_cpu::riscv64 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_kernel_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_kernel_emitter.cpp new file mode 100644 index 00000000000000..b8bc4465a59ee3 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_kernel_emitter.cpp @@ -0,0 +1,296 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_kernel_emitter.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/utils.hpp" +#include "jit_snippets_emitters.hpp" +#include "nodes/kernels/riscv64/cpu_isa_traits.hpp" +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "openvino/core/except.hpp" +#include "openvino/core/type.hpp" +#include "snippets/emitter.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/op/kernel.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/reg_spill.hpp" +#include "snippets/utils/reg_utils.hpp" +#include "utils.hpp" +#include "xbyak_riscv/xbyak_riscv.hpp" + +using namespace Xbyak_riscv; + +namespace ov::intel_cpu::riscv64 { + +jit_kernel_emitter::jit_kernel_emitter(jit_generator_t* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_emitter(h, isa) { + const auto kernel = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "jit_kernel_emitter invoked with invalid op argument"); + OV_CPU_JIT_EMITTER_ASSERT(!kernel->region->empty(), "jit_kernel_emitter invoked with empty body"); + body = kernel->region; + jcp = *reinterpret_cast(kernel->compile_params); + const auto& parameters = body->get_parameters(); + const auto& results = body->get_results(); + const auto& buffers = body->get_buffers(); + std::vector data_ptr_regs; + for (const auto& param : parameters) { + const auto& reg = param->get_output_port_descriptor(0)->get_reg(); + if (!reg.is_address()) { + data_ptr_regs.push_back(reg); + } + } + num_inputs = data_ptr_regs.size(); + for (const auto& result : results) { + data_ptr_regs.push_back(result->get_input_port_descriptor(0)->get_reg()); + } + num_outputs = data_ptr_regs.size() - 
num_inputs; + + std::set unique_buffers; + for (const auto& buffer_expr : buffers) { + const auto buffer_reg_group = buffer_expr->get_reg_group(); + if (unique_buffers.count(buffer_reg_group) == 0) { + data_ptr_regs.push_back(buffer_expr->get_output_port_descriptor(0)->get_reg()); + unique_buffers.insert(buffer_reg_group); + } + } + num_unique_buffers = unique_buffers.size(); + data_ptr_regs_idx = snippets::utils::transform_snippets_regs_to_idxs(data_ptr_regs, snippets::RegType::gpr); +} + +void jit_kernel_emitter::emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { + validate_arguments(in, out); + aux_vec_idxs = pool_vec_idxs; + aux_gpr_idxs = pool_gpr_idxs; + emit_impl(in, out); +} + +void jit_kernel_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { + OV_CPU_JIT_EMITTER_ASSERT(in.size() == get_inputs_num() && out.empty(), + "Unexpected number of input/output arguments"); + const auto num_params = num_inputs + num_outputs + num_unique_buffers; + // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount + OV_CPU_JIT_EMITTER_ASSERT(data_ptr_regs_idx.size() == num_params, + "Number of inputs and outputs is inconsistent with the number of allocated registers ", + num_params, + " data_ptr_regs_idx.size() = ", + data_ptr_regs_idx.size()); +} + +void jit_kernel_emitter::emit_impl(const std::vector& in, + [[maybe_unused]] const std::vector& out) const { + h->preamble(); + + std::set available_gpr; + std::set available_vec; + auto reg_type = snippets::RegType::gpr; + auto convert = [®_type](size_t i) -> snippets::Reg { + return {reg_type, i}; + }; + std::transform(aux_gpr_idxs.begin(), + aux_gpr_idxs.end(), + std::inserter(available_gpr, available_gpr.begin()), + convert); + // Note: data_ptr regs are globally live, so it makes no sense to keep them in the pool + for (auto idx : data_ptr_regs_idx) { + available_gpr.erase({snippets::RegType::gpr, idx}); + } + reg_type = snippets::RegType::vec; + std::transform(aux_vec_idxs.begin(), + aux_vec_idxs.end(), + std::inserter(available_vec, available_vec.begin()), + convert); + + auto data_ptr_regs = utils::transform_idxs_to_regs(data_ptr_regs_idx); + + auto get_expected_reg_types = + [](const std::shared_ptr& emitter) -> std::pair { + switch (emitter->get_in_out_type()) { + case emitter_in_out_map::gpr_to_vec: + return {snippets::RegType::gpr, snippets::RegType::vec}; + case emitter_in_out_map::gpr_to_gpr: + return {snippets::RegType::gpr, snippets::RegType::gpr}; + case emitter_in_out_map::vec_to_gpr: + return {snippets::RegType::vec, snippets::RegType::gpr}; + case emitter_in_out_map::vec_to_vec: + return {snippets::RegType::vec, snippets::RegType::vec}; + default: + OPENVINO_THROW("Unsupported emitter_in_out_map instance"); + } + }; + // Provide up to two temporary GPRs for pointer initialization math + std::vector aux_tmp_regs{}; + if (!available_gpr.empty()) { + auto it = available_gpr.begin(); + aux_tmp_regs.emplace_back(static_cast(it->idx)); + ++it; + if (it != available_gpr.end()) { + aux_tmp_regs.emplace_back(static_cast(it->idx)); + } + } + init_data_pointers(utils::transform_idxs_to_regs(in), data_ptr_regs, aux_tmp_regs); + for (const auto& expression : *body) { + const auto reg_info = expression->get_reg_info(); + const auto& emitter = std::dynamic_pointer_cast(expression->get_emitter()); + OV_CPU_JIT_EMITTER_ASSERT(emitter, "Unexpected emitter type"); + auto 
expected_in_type = snippets::RegType::undefined; + auto expected_out_type = snippets::RegType::undefined; + const auto& node = expression->get_node(); + // Note: A few operations are allowed to have mixed register types on their inputs (or outputs) => skip + // validation here + if (!ov::is_type_any_of(node) && + !std::dynamic_pointer_cast(emitter)) { + std::tie(expected_in_type, expected_out_type) = get_expected_reg_types(emitter); + } + // Note: live regs = regs live on input of the expression. We also need to exclude output regs from the pool + auto live_regs = expression->get_live_regs(); + for (auto r : reg_info.second) { + live_regs.insert(r); + } + std::vector pool_gp_reg; + std::vector pool_vec_reg; + std::set_difference(available_gpr.begin(), + available_gpr.end(), + live_regs.begin(), + live_regs.end(), + std::back_inserter(pool_gp_reg)); + std::set_difference(available_vec.begin(), + available_vec.end(), + live_regs.begin(), + live_regs.end(), + std::back_inserter(pool_vec_reg)); + auto in_regs = snippets::utils::transform_snippets_regs_to_idxs(reg_info.first, expected_in_type); + auto out_regs = snippets::utils::transform_snippets_regs_to_idxs(reg_info.second, expected_out_type); + auto gpr_pool = snippets::utils::transform_snippets_regs_to_idxs(pool_gp_reg); + auto vec_pool = snippets::utils::transform_snippets_regs_to_idxs(pool_vec_reg); + emitter->emit_code(in_regs, out_regs, vec_pool, gpr_pool, {}); + } + + h->postamble(); +} + +jit_kernel_static_emitter::jit_kernel_static_emitter(jit_generator_t* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_kernel_emitter(h, isa, expr) { + const auto kernel = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(kernel != nullptr, "jit_kernel_static_emitter expects KernelStatic expression"); + jcp = *reinterpret_cast(kernel->compile_params); + master_shape = jcp.exec_domain; + data_offsets = jcp.data_offsets; + OV_CPU_JIT_EMITTER_ASSERT(data_offsets.size() == num_inputs + num_outputs, "Incompatible count of data offsets!"); + OV_CPU_JIT_EMITTER_ASSERT(!data_offsets.empty() && data_offsets.front().size() == master_shape.size(), + "Incompatible rank of data offsets!"); +} + +void jit_kernel_static_emitter::init_data_pointers(const std::vector& arg_regs, + const std::vector& data_ptr_regs, + const std::vector& aux_gprs) const { + OV_CPU_JIT_EMITTER_ASSERT(arg_regs.size() == 2, "Invalid arg regs size"); + auto reg_runtime_params = arg_regs[0]; + auto reg_indexes = arg_regs[1]; + + const auto num_params = num_inputs + num_outputs; + // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter + const size_t offset_rank = master_shape.size() - 1; + + // helper: pointer += offsets[j] * indexes[j] + // uses two temporaries to avoid clobbering the offset constant while loading the index + auto init_ptr_with_offset = [&](Xbyak_riscv::Reg pointer, + const std::vector& offsets, + Xbyak_riscv::Reg tmp0, + Xbyak_riscv::Reg tmp1) { + for (size_t j = 0; j < offset_rank; j++) { + if (master_shape[j] != 1 && offsets[j] != 0) { + // tmp0 = offsets[j] + h->uni_li(tmp0, offsets[j]); + // tmp1 = address of index[j] + h->uni_li(tmp1, j * sizeof(size_t)); + h->add(tmp1, reg_indexes, tmp1); + // tmp1 = load index[j] + h->ld(tmp1, tmp1, 0); + // tmp0 *= tmp1 + h->mul(tmp0, tmp0, tmp1); + // pointer += tmp0 + h->add(pointer, pointer, tmp0); + } + } + }; + + // choose tmp regs + Xbyak_riscv::Reg tmp0 = !aux_gprs.empty() ? 
aux_gprs[0] : Xbyak_riscv::t0; + Xbyak_riscv::Reg tmp1 = aux_gprs.size() > 1 ? aux_gprs[1] : Xbyak_riscv::t1; + + // Initialize buffer scratchpad pointers + for (size_t i = 0; i < num_unique_buffers; ++i) { + Xbyak_riscv::Reg addr = tmp0; + h->uni_li(addr, GET_OFF(buffer_scratchpad_ptr)); + h->add(addr, reg_runtime_params, addr); + h->ld(data_ptr_regs[num_params + i], addr, 0); + } + + // Load input/output pointers and apply static offsets + for (size_t i = 0; i < num_params; i++) { + Xbyak_riscv::Reg addr = tmp0; + if (i < num_inputs) { + h->uni_li(addr, GET_OFF(src_ptrs) + i * sizeof(void*)); + } else { + h->uni_li(addr, GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)); + } + h->add(addr, reg_runtime_params, addr); + h->ld(data_ptr_regs[i], addr, 0); + init_ptr_with_offset(data_ptr_regs[i], data_offsets[i], tmp0, tmp1); + } +} + +jit_kernel_dynamic_emitter::jit_kernel_dynamic_emitter(jit_generator_t* h, + cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_kernel_emitter(h, isa, expr) { + OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(expr->get_node()), + "jit_kernel_dynamic_emitter expects KernelDynamic expression"); +} + +void jit_kernel_dynamic_emitter::init_data_pointers( + const std::vector& arg_regs, + const std::vector& data_ptr_regs, + [[maybe_unused]] const std::vector& aux_gprs) const { + OV_CPU_JIT_EMITTER_ASSERT(arg_regs.size() == 1, "Invalid arg regs size"); + auto reg_runtime_params = arg_regs[0]; + + const auto num_params = num_inputs + num_outputs; + for (size_t i = 0; i < num_unique_buffers; ++i) { + Xbyak_riscv::Reg addr = Xbyak_riscv::t0; + h->uni_li(addr, GET_OFF(buffer_scratchpad_ptr)); + h->add(addr, reg_runtime_params, addr); + h->ld(data_ptr_regs[num_params + i], addr, 0); + } + for (size_t i = 0; i < num_params; i++) { + Xbyak_riscv::Reg addr = aux_gprs.empty() ? 
Xbyak_riscv::t0 : aux_gprs.front(); + if (i < num_inputs) { + h->uni_li(addr, GET_OFF(src_ptrs) + i * sizeof(void*)); + } else { + h->uni_li(addr, GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)); + } + h->add(addr, reg_runtime_params, addr); + h->ld(data_ptr_regs[i], addr, 0); + } +} + +} // namespace ov::intel_cpu::riscv64 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_kernel_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_kernel_emitter.hpp new file mode 100644 index 00000000000000..fcc626083dce64 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_kernel_emitter.hpp @@ -0,0 +1,81 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/linear_ir.hpp" + +namespace ov::intel_cpu::riscv64 { + +class jit_kernel_emitter : public jit_emitter { +public: + jit_kernel_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + +protected: + void validate_arguments(const std::vector& in, const std::vector& out) const override; + + virtual void init_data_pointers(const std::vector& arg_regs, + const std::vector& data_ptr_regs, + const std::vector& aux_gprs) const = 0; + + void emit_impl(const std::vector& in, const std::vector& out) const override; + + void emit_code_impl(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; + + ov::intel_cpu::jit_snippets_compile_args jcp{}; + // gpr's used to store data pointers, track them to apply offsets in Kernel + std::vector data_ptr_regs_idx; + size_t num_inputs = 0; + size_t num_outputs = 0; + size_t num_unique_buffers = 0; + + std::shared_ptr body; +}; + +class jit_kernel_static_emitter : public jit_kernel_emitter { +public: + jit_kernel_static_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + size_t get_inputs_num() const override { + return 2; + } + +private: + void init_data_pointers(const std::vector& arg_regs, + const std::vector& data_ptr_regs, + const std::vector& aux_gprs) const override; + + std::vector master_shape; + std::vector> data_offsets; +}; + +class jit_kernel_dynamic_emitter : public jit_kernel_emitter { +public: + jit_kernel_dynamic_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + size_t get_inputs_num() const override { + return 1; + } + +private: + void init_data_pointers(const std::vector& arg_regs, + const std::vector& data_ptr_regs, + const std::vector& aux_gprs) const override; +}; + +} // namespace ov::intel_cpu::riscv64 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_loop_emitters.cpp new file mode 100644 index 00000000000000..3cd5241adabe0e --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_loop_emitters.cpp @@ -0,0 +1,293 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_loop_emitters.hpp" + +#include +#include +#include +#include +#include 
+#include +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "emitters/snippets/common/jit_loop_args_helper.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/utils.hpp" +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "openvino/core/type.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/utils/utils.hpp" +#include "utils.hpp" +#include "xbyak_riscv/xbyak_riscv.hpp" + +using namespace Xbyak_riscv; + +namespace ov::intel_cpu::riscv64 { + +using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; + +namespace { +// RAII holder for one temporary GPR: uses pool if available, otherwise preserves a caller-saved reg on stack +class jit_aux_gpr_holder { +public: + jit_aux_gpr_holder(ov::intel_cpu::riscv64::jit_generator_t* host, + std::vector& pool_gpr_idxs, + const std::vector& used_gpr_idxs) + : m_h(host), + m_pool_gpr_idxs(pool_gpr_idxs) { + if (!m_pool_gpr_idxs.empty()) { + m_reg = Xbyak_riscv::Reg(static_cast(m_pool_gpr_idxs.back())); + m_pool_gpr_idxs.pop_back(); + } else { + // choose an available caller-saved reg not in used set + m_reg = ov::intel_cpu::riscv64::utils::get_aux_gpr(used_gpr_idxs); + m_preserved = true; + // Maintain 16-byte alignment; reserve 16 bytes and save at 0 + m_h->addi(Xbyak_riscv::sp, Xbyak_riscv::sp, -16); + m_h->sd(m_reg, Xbyak_riscv::sp, 0); + } + } + ~jit_aux_gpr_holder() { + if (m_preserved) { + m_h->ld(m_reg, Xbyak_riscv::sp, 0); + m_h->addi(Xbyak_riscv::sp, Xbyak_riscv::sp, 16); + } else { + m_pool_gpr_idxs.push_back(static_cast(m_reg.getIdx())); + } + } + [[nodiscard]] const Xbyak_riscv::Reg& get_reg() const { + return m_reg; + } + +private: + ov::intel_cpu::riscv64::jit_generator_t* m_h; + std::vector& m_pool_gpr_idxs; + Xbyak_riscv::Reg m_reg; + bool m_preserved = false; +}; +} // namespace + +/* ================== jit_loop_begin_emitter ====================== */ + +jit_loop_begin_emitter::jit_loop_begin_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : ov::intel_cpu::riscv64::jit_emitter(h, isa), + isa(isa), + h(h) { + const auto loop_begin = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(loop_begin, "Expected LoopBegin expression"); + + const auto loop_end = loop_begin->get_loop_end(); + work_amount = loop_end->get_work_amount(); + wa_increment = loop_end->get_increment(); + evaluate_once = loop_end->get_evaluate_once(); + loop_id = loop_end->get_id(); + is_work_amount_dynamic = ov::snippets::utils::is_dynamic_value(work_amount); + OV_CPU_JIT_EMITTER_ASSERT(wa_increment > 0, "Loop increment must be > 0"); + + loop_begin_label = std::make_shared(); + loop_end_label = nullptr; + + // LoopBegin communicates work_amount via GPR to LoopEnd + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void jit_loop_begin_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { + OV_CPU_JIT_EMITTER_ASSERT(in.empty(), "Invalid inputs size: expected 0 got " + std::to_string(in.size())); + // The only expected output is work amount register (communicated to jit_loop_end_emitter) + OV_CPU_JIT_EMITTER_ASSERT(out.size() == 1, "Invalid outputs size: expected 1 got " + std::to_string(out.size())); +} + +void jit_loop_begin_emitter::emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& 
pool_gpr_idxs) const { + validate_arguments(in, out); + // Use base preamble/postamble to manage aux regs consistently + ov::intel_cpu::riscv64::jit_emitter::emit_code_impl(in, out, pool_vec_idxs, pool_gpr_idxs, {}); +} + +void jit_loop_begin_emitter::emit_impl([[maybe_unused]] const std::vector& in, + const std::vector& out) const { + auto reg_work_amount = Xbyak_riscv::Reg(out[0]); + if (is_work_amount_dynamic) { + const auto id_offset = loop_id * sizeof(ov::intel_cpu::jit_snippets_call_args::loop_args_t); + // Acquire two scratch regs + std::vector used = {out[0]}; + jit_aux_gpr_holder h_ptr(h, aux_gpr_idxs, used); + jit_aux_gpr_holder h_tmp(h, aux_gpr_idxs, used); + auto reg_loop_args_ptr = h_ptr.get_reg(); + auto addr = h_tmp.get_reg(); + // reg_loop_args_ptr = *(a0 + GET_OFF(loop_args)) + h->uni_li(addr, GET_OFF(loop_args)); + h->add(addr, Xbyak_riscv::a0, addr); + h->ld(reg_loop_args_ptr, addr, 0); + // reg_loop_args_ptr += id_offset + OFF(m_work_amount) + h->uni_li(addr, id_offset + GET_OFF_LOOP_ARGS(m_work_amount)); + h->add(reg_loop_args_ptr, reg_loop_args_ptr, addr); + // load m_work_amount + h->ld(reg_work_amount, reg_loop_args_ptr, 0); + } else { + h->uni_li(reg_work_amount, static_cast(work_amount)); + } + h->L(*loop_begin_label); + // If evaluate_once and not dynamic increment, skip branch to end + if (evaluate_once && !ov::snippets::utils::is_dynamic_value(wa_increment)) { + return; + } + // Compare work amount with increment and jump to end if less + size_t eff_inc = + (evaluate_once && ov::snippets::utils::is_dynamic_value(wa_increment)) ? 1 : static_cast(wa_increment); + // Use scratch for increment immediate + std::vector used2 = {out[0]}; + jit_aux_gpr_holder h_inc(h, aux_gpr_idxs, used2); + Xbyak_riscv::Reg reg_inc = h_inc.get_reg(); + h->uni_li(reg_inc, eff_inc); + h->blt(reg_work_amount, reg_inc, *loop_end_label); +} + +/* =================== jit_loop_end_emitter ======================= */ + +jit_loop_end_emitter::jit_loop_end_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : ov::intel_cpu::riscv64::jit_emitter(h, isa), + isa(isa), + h(h) { + const auto loop_end = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(loop_end, "Expected LoopEnd expression"); + + in_out_type_ = emitter_in_out_map::gpr_to_gpr; + num_inputs = loop_end->get_input_num(); + num_outputs = loop_end->get_output_num(); + work_amount = loop_end->get_work_amount(); + wa_increment = loop_end->get_increment(); + evaluate_once = loop_end->get_evaluate_once(); + are_ptr_increments_dynamic = ov::snippets::utils::has_dynamic_values(loop_end->get_ptr_increments()); + are_final_offsets_dynamic = ov::snippets::utils::has_dynamic_values(loop_end->get_finalization_offsets()); + OV_CPU_JIT_EMITTER_ASSERT(wa_increment > 0, "Loop increment must be > 0"); + loop_id = loop_end->get_id(); + loop_args_offset = loop_id * sizeof(ov::intel_cpu::jit_snippets_call_args::loop_args_t); + loop_args = ov::intel_cpu::snippets_common::compose_loop_args(loop_end); + OV_CPU_JIT_EMITTER_ASSERT(loop_args.m_num_data_ptrs == static_cast(num_inputs + num_outputs), + "Invalid loop args size for LoopEnd"); + + // Get corresponding LoopBegin + const auto begin_expr = get_loop_begin_expr(expr); + const auto& loop_begin_emitter = std::dynamic_pointer_cast(begin_expr->get_emitter()); + OV_CPU_JIT_EMITTER_ASSERT(loop_begin_emitter, "LoopBegin expected jit_loop_begin_emitter"); + loop_begin_label = 
loop_begin_emitter->get_begin_label(); + loop_end_label = std::make_shared(); + loop_begin_emitter->set_loop_end_label(loop_end_label); +} + +void jit_loop_end_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { + const auto io_size = num_inputs + num_outputs; + OV_CPU_JIT_EMITTER_ASSERT(out.empty(), + "Invalid number of out arguments: expected 0 got " + std::to_string(out.size())); + OV_CPU_JIT_EMITTER_ASSERT(in.size() == io_size + 1, + "Invalid number of in arguments: expected " + std::to_string(io_size + 1) + " got " + + std::to_string(in.size())); + OV_CPU_JIT_EMITTER_ASSERT(loop_args.m_num_data_ptrs == static_cast(io_size), + "Invalid loop args size: expected " + std::to_string(io_size) + " got " + + std::to_string(loop_args.m_num_data_ptrs)); +} + +void jit_loop_end_emitter::emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { + validate_arguments(in, out); + ov::intel_cpu::riscv64::jit_emitter::emit_code_impl(in, out, pool_vec_idxs, pool_gpr_idxs, {}); +} + +void jit_loop_end_emitter::emit_impl(const std::vector& in, + [[maybe_unused]] const std::vector& out) const { + // Build list of data pointer regs: in[0..io_size-1], work_amount is in.back() + std::vector data_ptr_reg_idxs; + const size_t io_size = num_inputs + num_outputs; + data_ptr_reg_idxs.reserve(io_size); + std::copy(in.begin(), in.end() - 1, std::back_inserter(data_ptr_reg_idxs)); + + auto apply_increments = [&](const int64_t* increments, bool use_runtime_args, size_t field_offset) { + if (increments == nullptr || data_ptr_reg_idxs.empty()) { + return; + } + + std::vector used = in; + std::unique_ptr reg_increments_holder; + std::optional reg_increments; + if (use_runtime_args) { + reg_increments_holder = std::make_unique(h, aux_gpr_idxs, used); + reg_increments = reg_increments_holder->get_reg(); + used.push_back(static_cast(reg_increments->getIdx())); + } + jit_aux_gpr_holder h_tmp(h, aux_gpr_idxs, used); + Xbyak_riscv::Reg tmp = h_tmp.get_reg(); + + if (use_runtime_args) { + h->uni_li(tmp, GET_OFF(loop_args)); + h->add(tmp, Xbyak_riscv::a0, tmp); + h->ld(*reg_increments, tmp, 0); + h->uni_li(tmp, loop_args_offset + field_offset); + h->add(*reg_increments, *reg_increments, tmp); + h->ld(*reg_increments, *reg_increments, 0); + } + + for (size_t idx = 0; idx < data_ptr_reg_idxs.size(); ++idx) { + const auto increment = increments[idx]; + if (increment == 0) { + continue; + } + auto ptr_reg = Xbyak_riscv::Reg(data_ptr_reg_idxs[idx]); + if (ov::snippets::utils::is_dynamic_value(increment)) { + OV_CPU_JIT_EMITTER_ASSERT(use_runtime_args, "Dynamic increments require runtime loop arguments"); + h->uni_li(tmp, idx * sizeof(int64_t)); + h->add(tmp, *reg_increments, tmp); + h->ld(tmp, tmp, 0); + h->add(ptr_reg, ptr_reg, tmp); + } else { + h->uni_li(tmp, static_cast(increment)); + h->add(ptr_reg, ptr_reg, tmp); + } + } + }; + + if (!evaluate_once) { + apply_increments(loop_args.m_ptr_increments, are_ptr_increments_dynamic, GET_OFF_LOOP_ARGS(m_ptr_increments)); + + auto reg_work_amount = Xbyak_riscv::Reg(in.back()); + // reg_work_amount -= wa_increment + // use scratch for increment immediate + jit_aux_gpr_holder h_inc(h, aux_gpr_idxs, in); + auto reg_inc = h_inc.get_reg(); + h->uni_li(reg_inc, static_cast(wa_increment)); + h->sub(reg_work_amount, reg_work_amount, reg_inc); + // if reg_work_amount >= wa_increment -> loop + h->bge(reg_work_amount, reg_inc, *loop_begin_label); + } + + 
apply_increments(loop_args.m_finalization_offsets, + are_final_offsets_dynamic, + GET_OFF_LOOP_ARGS(m_finalization_offsets)); + + h->L(*loop_end_label); +} + +ov::snippets::lowered::ExpressionPtr jit_loop_end_emitter::get_loop_begin_expr( + const ov::snippets::lowered::ExpressionPtr& expr) { + auto begin_expr = expr->get_input_port_connectors().back()->get_source().get_expr(); + OV_CPU_JIT_EMITTER_ASSERT(ov::is_type(begin_expr->get_node()), + "LoopEnd expression must have the last port connector to LoopBegin"); + return begin_expr; +} + +} // namespace ov::intel_cpu::riscv64 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_loop_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_loop_emitters.hpp new file mode 100644 index 00000000000000..5d116d52fb16ec --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_loop_emitters.hpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "snippets/lowered/expression.hpp" + +namespace ov::intel_cpu::riscv64 { + +/* ================== jit_loop_begin_emitter ====================== */ + +class jit_loop_begin_emitter : public ov::intel_cpu::riscv64::jit_emitter { +public: + jit_loop_begin_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 0; + } + + void validate_arguments(const std::vector& in, const std::vector& out) const override; + void emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; + + void emit_impl(const std::vector& in, const std::vector& out) const override; + + std::shared_ptr get_begin_label() const { + return loop_begin_label; + } + void set_loop_end_label(const std::shared_ptr& lbl) { + this->loop_end_label = lbl; + } + +private: + ov::intel_cpu::riscv64::cpu_isa_t isa; + ov::intel_cpu::riscv64::jit_generator_t* h; + bool evaluate_once = false; + size_t work_amount = 0LU; + size_t wa_increment = 0; + size_t loop_id = 0; + bool is_work_amount_dynamic = false; + mutable std::shared_ptr loop_begin_label = nullptr; + mutable std::shared_ptr loop_end_label = nullptr; +}; + +/* =================== jit_loop_end_emitter ======================= */ + +class jit_loop_end_emitter : public ov::intel_cpu::riscv64::jit_emitter { +public: + jit_loop_end_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 0; + } + + void validate_arguments(const std::vector& in, const std::vector& out) const override; + void emit_code_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; + + void emit_impl(const std::vector& in, const std::vector& out) const override; + +private: + static ov::snippets::lowered::ExpressionPtr get_loop_begin_expr(const ov::snippets::lowered::ExpressionPtr& expr); + + ov::intel_cpu::riscv64::cpu_isa_t isa; + ov::intel_cpu::riscv64::jit_generator_t* h; + size_t num_inputs = 0; + size_t num_outputs = 0; + int64_t work_amount = 0; + size_t wa_increment = 0; + size_t loop_id = 0; + bool evaluate_once 
= false; + bool are_ptr_increments_dynamic = false; + bool are_final_offsets_dynamic = false; + size_t loop_args_offset = 0; + jit_snippets_call_args::loop_args_t loop_args; + mutable std::shared_ptr loop_begin_label = nullptr; + mutable std::shared_ptr loop_end_label = nullptr; + mutable bool end_label_bound = false; +}; + +} // namespace ov::intel_cpu::riscv64 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_memory_emitters.cpp new file mode 100644 index 00000000000000..61d92508c954a1 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_memory_emitters.cpp @@ -0,0 +1,288 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_memory_emitters.hpp" + +#include +#include +#include +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" +#include "emitters/utils.hpp" +#include "openvino/core/type.hpp" +#include "openvino/core/type/element_type.hpp" +#include "snippets/lowered/expression.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/op/broadcastload.hpp" +#include "snippets/op/load.hpp" +#include "snippets/op/memory_access.hpp" +#include "snippets/op/store.hpp" +#include "snippets/utils/utils.hpp" +#include "utils/general_utils.h" +#include "xbyak_riscv/xbyak_riscv.hpp" +#include "xbyak_riscv/xbyak_riscv_csr.hpp" + +namespace ov::intel_cpu::riscv64 { + +using jit_generator_t = ov::intel_cpu::riscv64::jit_generator_t; +using cpu_isa_t = ov::intel_cpu::riscv64::cpu_isa_t; +using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; + +jit_memory_emitter::jit_memory_emitter(jit_generator_t* h, + cpu_isa_t isa, + const ExpressionPtr& expr, + emitter_in_out_map in_out_type) + : jit_emitter(h, isa) { + in_out_type_ = in_out_type; + + const auto n = expr->get_node(); + src_prc = n->get_input_element_type(0); + dst_prc = n->get_output_element_type(0); + + const auto& memory_access = std::dynamic_pointer_cast(expr->get_node()); + if (in_out_type_ == emitter_in_out_map::gpr_to_vec) { + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_input_port(0), "must be input port - memory access"); + count = memory_access->get_input_count(); + compiled_byte_offset = memory_access->get_input_offset(); + buffer_cluster_id = ov::intel_cpu::utils::get_parent_buffer_cluster_id(expr); + } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { + OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), + "must be output port - memory access"); + count = memory_access->get_output_count(); + compiled_byte_offset = memory_access->get_output_offset(); + buffer_cluster_id = ov::intel_cpu::utils::get_consumer_buffer_cluster_id(expr); + } else { + OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); + } + + if (ov::snippets::utils::is_dynamic_value(compiled_byte_offset)) { + is_offset_runtime = true; + // Compiled byte offset is zero to manually `add` runtime offset before operation and `sub` after to reset + // pointer in the register + compiled_byte_offset = 0; + OV_CPU_JIT_EMITTER_ASSERT(buffer_cluster_id != SIZE_MAX, "Incorrect buffer offset in call_args"); + } +} + +jit_load_memory_emitter::jit_load_memory_emitter(jit_generator_t* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { + bool 
is_supported_precision = + any_of(src_prc, ov::element::f32, ov::element::i32, ov::element::f16) && src_prc == dst_prc; + OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); + + const auto load = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(load != nullptr, "Expects Load expression"); + count = load->get_count(); + byte_size = src_prc.size(); + OV_CPU_JIT_EMITTER_ASSERT(byte_size == 2 || byte_size == 4, + "Only 2- or 4-byte element loads are supported, got: ", + byte_size); +} + +size_t jit_memory_emitter::aux_gprs_count() const { + // for runtime arguments + return is_offset_runtime ? 1 : 0; +} + +void jit_memory_emitter::emit_code_impl(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const { + std::vector pool_fp_gpr_idxs; // Empty for now + emitter_preamble(in_idxs, out_idxs, pool_vec_idxs, pool_gpr_idxs, pool_fp_gpr_idxs); + + auto reg_runtime_params = Xbyak_riscv::a0; // First ABI parameter register + Xbyak_riscv::Reg aux_gpr = is_offset_runtime ? Xbyak_riscv::Reg(aux_gpr_idxs.back()) : Xbyak_riscv::zero; + + Xbyak_riscv::Reg data_reg = Xbyak_riscv::zero; + if (in_out_type_ == emitter_in_out_map::gpr_to_vec) { + data_reg = Xbyak_riscv::Reg(in_idxs[0]); + } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { + data_reg = Xbyak_riscv::Reg(out_idxs[0]); + } else { + OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); + } + + if (is_offset_runtime) { + // load the runtime offset from args.buffer_offsets[buffer_cluster_id] + const auto offset = GET_OFF(buffer_offsets) + buffer_cluster_id * sizeof(size_t); + // RV64 uses 64-bit size_t + h->ld(aux_gpr, reg_runtime_params, static_cast(offset)); + // bump the pointer + h->add(data_reg, data_reg, aux_gpr); + } + + emit_impl(in_idxs, out_idxs); + + if (is_offset_runtime) { + // subtract back so we leave the pointer unchanged for the caller + h->sub(data_reg, data_reg, aux_gpr); + } + + emitter_postamble(); +} + +void jit_load_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + if (host_isa_ == ov::intel_cpu::riscv64::gv) { + emit_isa(in, out); + } else { + OV_CPU_JIT_EMITTER_THROW("Doesn't support isa ", host_isa_); + } +} + +template +void jit_load_memory_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + auto src = Xbyak_riscv::Reg(in[0]); + auto dst = Xbyak_riscv::VReg(out[0]); + + // Set vector configuration for the load (e16 for 2-byte, e32 for 4-byte) + auto sew = (byte_size == 2) ? Xbyak_riscv::SEW::e16 : Xbyak_riscv::SEW::e32; + h->vsetivli(Xbyak_riscv::zero, count, sew, Xbyak_riscv::LMUL::m1); + + // Load vector data from memory + if (compiled_byte_offset == 0) { + if (byte_size == 2) { + h->vle16_v(dst, src); + } else { + h->vle32_v(dst, src); + } + } else { + // Use temporary register to calculate address with offset + auto tmp_gpr = Xbyak_riscv::Reg(aux_gpr_idxs.empty() ? 
Xbyak_riscv::t0.getIdx() : aux_gpr_idxs[0]); + h->addi(tmp_gpr, src, static_cast(compiled_byte_offset)); + if (byte_size == 2) { + h->vle16_v(dst, tmp_gpr); + } else { + h->vle32_v(dst, tmp_gpr); + } + } +} + +void jit_load_memory_emitter::emit_data() const { + // No additional data emission needed for basic load +} + +jit_store_memory_emitter::jit_store_memory_emitter(jit_generator_t* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::vec_to_gpr) { + bool is_supported_precision = + any_of(dst_prc, ov::element::f32, ov::element::i32, ov::element::f16) && src_prc == dst_prc; + OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); + + const auto store = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(store != nullptr, "Expects Store expression"); + count = store->get_count(); + byte_size = dst_prc.size(); + OV_CPU_JIT_EMITTER_ASSERT(byte_size == 2 || byte_size == 4, + "Only 2- or 4-byte element stores are supported, got: ", + byte_size); +} + +void jit_store_memory_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + if (host_isa_ == ov::intel_cpu::riscv64::gv) { + emit_isa(in, out); + } else { + OV_CPU_JIT_EMITTER_THROW("Doesn't support isa ", host_isa_); + } +} + +template +void jit_store_memory_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + auto src = Xbyak_riscv::VReg(in[0]); + auto dst = Xbyak_riscv::Reg(out[0]); + + // Set vector configuration for the store + auto sew = (byte_size == 2) ? Xbyak_riscv::SEW::e16 : Xbyak_riscv::SEW::e32; + h->vsetivli(Xbyak_riscv::zero, count, sew, Xbyak_riscv::LMUL::m1); + + // Store vector data to memory + if (compiled_byte_offset == 0) { + if (byte_size == 2) { + h->vse16_v(src, dst); + } else { + h->vse32_v(src, dst); + } + } else { + // Use temporary register to calculate address with offset + auto tmp_gpr = Xbyak_riscv::Reg(aux_gpr_idxs.empty() ? 
Xbyak_riscv::t0.getIdx() : aux_gpr_idxs[0]); + h->addi(tmp_gpr, dst, static_cast(compiled_byte_offset)); + if (byte_size == 2) { + h->vse16_v(src, tmp_gpr); + } else { + h->vse32_v(src, tmp_gpr); + } + } +} + +/* ============== jit_load_broadcast_emitter =============== */ + +jit_load_broadcast_emitter::jit_load_broadcast_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr) + : jit_memory_emitter(h, isa, expr, emitter_in_out_map::gpr_to_vec) { + bool is_supported_precision = any_of(dst_prc, ov::element::f32, ov::element::i32) && src_prc == dst_prc; + OV_CPU_JIT_EMITTER_ASSERT(is_supported_precision, "Unsupported precision pair."); + + const auto broadcast_load = ov::as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(broadcast_load != nullptr, "Expects BroadcastLoad expression"); + count = 1; // BroadcastLoad loads a single scalar value + byte_size = src_prc.size(); +} + +void jit_load_broadcast_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + if (host_isa_ == ov::intel_cpu::riscv64::gv) { + emit_isa(in, out); + } else { + OV_CPU_JIT_EMITTER_THROW("Doesn't support isa ", host_isa_); + } +} + +template +void jit_load_broadcast_emitter::emit_isa(const std::vector& in, const std::vector& out) const { + auto src_gpr = Xbyak_riscv::Reg(in[0]); + auto dst_vreg = Xbyak_riscv::VReg(out[0]); + + // Set vector configuration for appropriate element size + if (byte_size == 4) { + h->vsetivli(Xbyak_riscv::zero, 4, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); + } else if (byte_size == 2) { + h->vsetivli(Xbyak_riscv::zero, 4, Xbyak_riscv::SEW::e16, Xbyak_riscv::LMUL::m1); + } else { + OV_CPU_JIT_EMITTER_THROW("Unsupported byte size: ", byte_size); + } + + // Load scalar from memory and broadcast to vector register + // First load the scalar value into a temporary GPR + auto tmp_gpr = Xbyak_riscv::Reg(aux_gpr_idxs.empty() ? Xbyak_riscv::t0.getIdx() : aux_gpr_idxs[0]); + + // Calculate effective address if there's an offset + if (compiled_byte_offset == 0) { + if (byte_size == 2) { + h->lhu(tmp_gpr, src_gpr, 0); + } else { + h->lw(tmp_gpr, src_gpr, 0); + } + } else { + auto addr_gpr = Xbyak_riscv::Reg(aux_gpr_idxs.size() > 1 ? 
aux_gpr_idxs[1] : Xbyak_riscv::t1.getIdx()); + h->addi(addr_gpr, src_gpr, static_cast(compiled_byte_offset)); + if (byte_size == 2) { + h->lhu(tmp_gpr, addr_gpr, 0); + } else { + h->lw(tmp_gpr, addr_gpr, 0); + } + } + + // Move scalar to vector register and broadcast + h->vmv_v_x(dst_vreg, tmp_gpr); // Broadcast scalar to all elements +} + +void jit_load_broadcast_emitter::emit_data() const { + // No additional data emission needed for broadcast load operations +} + +} // namespace ov::intel_cpu::riscv64 diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_memory_emitters.hpp new file mode 100644 index 00000000000000..70903c78134cf7 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_memory_emitters.hpp @@ -0,0 +1,102 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "openvino/core/type/element_type.hpp" +#include "snippets/lowered/expression.hpp" + +namespace ov::intel_cpu::riscv64 { + +class jit_memory_emitter : public jit_emitter { +public: + jit_memory_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr, + emitter_in_out_map in_out_type); + + size_t aux_gprs_count() const override; + + void emit_code_impl(const std::vector& in_idxs, + const std::vector& out_idxs, + const std::vector& pool_vec_idxs, + const std::vector& pool_gpr_idxs) const override; + +protected: + ov::element::Type src_prc; + ov::element::Type dst_prc; + + size_t count = 0; + size_t compiled_byte_offset = 0; + size_t buffer_cluster_id = 0; + bool is_offset_runtime = false; +}; + +class jit_load_memory_emitter : public jit_memory_emitter { +public: + jit_load_memory_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 1; + } + +private: + void emit_impl(const std::vector& in, const std::vector& out) const override; + void emit_data() const override; + + template + void emit_isa(const std::vector& in, const std::vector& out) const; + + size_t byte_size = 0; +}; + +class jit_store_memory_emitter : public jit_memory_emitter { +public: + jit_store_memory_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 1; + } + +private: + void emit_impl(const std::vector& in, const std::vector& out) const override; + + template + void emit_isa(const std::vector& in, const std::vector& out) const; + + size_t byte_size = 0; +}; + +class jit_load_broadcast_emitter : public jit_memory_emitter { +public: + jit_load_broadcast_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 1; + } + +private: + void emit_impl(const std::vector& in, const std::vector& out) const override; + void emit_data() const override; + + template + void emit_isa(const std::vector& in, const std::vector& out) const; + + size_t byte_size = 0; +}; + +} // namespace ov::intel_cpu::riscv64 diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_snippets_emitters.cpp new file mode 100644 index 00000000000000..30787d9121748a --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_snippets_emitters.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_snippets_emitters.hpp" + +#include +#include +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "emitters/utils.hpp" +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "openvino/core/type.hpp" +#include "openvino/core/type/element_type.hpp" +#include "openvino/op/constant.hpp" +#include "snippets/lowered/expression.hpp" +#include "xbyak_riscv/xbyak_riscv.hpp" +#include "xbyak_riscv/xbyak_riscv_csr.hpp" + +namespace ov::intel_cpu::riscv64 { + +using jit_generator_t = ov::intel_cpu::riscv64::jit_generator_t; +using cpu_isa_t = ov::intel_cpu::riscv64::cpu_isa_t; +using ExpressionPtr = ov::snippets::lowered::ExpressionPtr; + +jit_nop_emitter::jit_nop_emitter(jit_generator_t* h, cpu_isa_t isa, [[maybe_unused]] const ExpressionPtr& expr) + : riscv64::jit_emitter(h, isa) { + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +jit_scalar_emitter::jit_scalar_emitter(jit_generator_t* h, cpu_isa_t isa, const ExpressionPtr& expr) + : jit_emitter(h, isa) { + const auto n = expr->get_node(); + const auto& precision = n->get_output_element_type(0); + switch (precision) { + case element::i32: { + value = ov::as_type_ptr(n)->cast_vector()[0]; + break; + } + case element::f32: { + // For RISC-V, we'll store the float value as int32 bitcast + const auto float_val = ov::as_type_ptr(n)->cast_vector()[0]; + std::memcpy(&value, &float_val, sizeof(value)); + break; + } + default: { + OV_CPU_JIT_EMITTER_THROW("Doesn't support precision ", precision); + } + } +} + +void jit_scalar_emitter::emit_impl(const std::vector& in, const std::vector& out) const { + if (host_isa_ == ov::intel_cpu::riscv64::gv) { + emit_isa(in, out); + } else { + OV_CPU_JIT_EMITTER_THROW("Doesn't support isa ", host_isa_); + } +} + +template +void jit_scalar_emitter::emit_isa([[maybe_unused]] const std::vector& in, + const std::vector& out) const { + // Get destination vector register + auto dst_vreg = Xbyak_riscv::VReg(out[0]); + + // For now, use t0 as a temporary register + Xbyak_riscv::Reg tmp_gpr = Xbyak_riscv::t0; + + // Load scalar value directly into register + h->uni_li(tmp_gpr, value); + + // Broadcast scalar to vector register using RISC-V Vector Extension + // Set vector configuration for 32-bit elements + h->vsetivli(Xbyak_riscv::zero, 4, Xbyak_riscv::SEW::e32, Xbyak_riscv::LMUL::m1); + + // Move scalar from GPR to vector register and broadcast + h->vmv_v_x(dst_vreg, tmp_gpr); +} + +} // namespace ov::intel_cpu::riscv64 \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_snippets_emitters.hpp new file mode 100644 index 00000000000000..f1af9f79e75fe7 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/jit_snippets_emitters.hpp @@ -0,0 +1,56 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#include "emitters/plugin/riscv64/jit_emitter.hpp" +#include "snippets/lowered/expression.hpp" + 
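// --- Illustration (not part of the patch): the Scalar emitter shown above keeps an f32
// constant bit-exactly in its int32_t `value` member via std::memcpy and later broadcasts
// those raw bits with uni_li + vmv_v_x. A minimal standalone sketch of that bit-cast
// convention, using a made-up constant:
#include <cstdint>
#include <cstring>
#include <iostream>

int main() {
    const float scalar = 1.5F;  // hypothetical Scalar node value
    int32_t packed = 0;
    std::memcpy(&packed, &scalar, sizeof(packed));  // same trick as jit_scalar_emitter's constructor

    // At code-gen time `packed` would be materialized in a GPR and broadcast to a vector
    // register; reinterpreting the lanes as f32 restores the original constant.
    float restored = 0.0F;
    std::memcpy(&restored, &packed, sizeof(restored));
    std::cout << std::hex << "packed bits: 0x" << static_cast<uint32_t>(packed) << std::dec
              << ", restored value: " << restored << '\n';
    return 0;
}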
+namespace ov::intel_cpu::riscv64 { + +class jit_nop_emitter : public jit_emitter { +public: + jit_nop_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 0; + } + +private: + void emit_impl(const std::vector& in, const std::vector& out) const override {} +}; + +class jit_scalar_emitter : public jit_emitter { +public: + jit_scalar_emitter(ov::intel_cpu::riscv64::jit_generator_t* h, + ov::intel_cpu::riscv64::cpu_isa_t isa, + const ov::snippets::lowered::ExpressionPtr& expr); + + size_t get_inputs_num() const override { + return 0; + } + +protected: + size_t aux_gprs_count() const override { + return 1; + } + +private: + void emit_impl(const std::vector& in, const std::vector& out) const override; + + template + void emit_isa(const std::vector& in, const std::vector& out) const; + + int32_t value; +}; + +} // namespace ov::intel_cpu::riscv64 \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/utils.cpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/utils.cpp new file mode 100644 index 00000000000000..58f22b9a71c9ec --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/utils.cpp @@ -0,0 +1,110 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "utils.hpp" + +#include +#include +#include +#include +#include + +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "openvino/core/except.hpp" +#include "snippets/emitter.hpp" +#include "xbyak_riscv/xbyak_riscv.hpp" + +namespace ov::intel_cpu::riscv64::utils { + +Xbyak_riscv::Reg get_aux_gpr(const std::vector& used_gpr_idxs) { + // RISC-V reserved registers to avoid: x0(zero), x1(ra), x2(sp), x3(gp), x4(tp), x8(s0/fp) + // Also avoid a0, a1 which are used for ABI parameters + const std::set reserved_regs = {0, 1, 2, 3, 4, 8, 10, 11}; + + // Start with temporary registers t0-t6 (x5-x7, x28-x31) + const std::vector temp_regs = {5, 6, 7, 28, 29, 30, 31}; + + for (size_t reg_idx : temp_regs) { + if (std::find(used_gpr_idxs.begin(), used_gpr_idxs.end(), reg_idx) == used_gpr_idxs.end()) { + return Xbyak_riscv::Reg(static_cast(reg_idx)); + } + } + + // If no temporary registers available, try saved registers s1-s11 (x9, x18-x27) + const std::vector saved_regs = {9, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}; + + for (size_t reg_idx : saved_regs) { + if (std::find(used_gpr_idxs.begin(), used_gpr_idxs.end(), reg_idx) == used_gpr_idxs.end()) { + return Xbyak_riscv::Reg(static_cast(reg_idx)); + } + } + + OPENVINO_THROW("No available auxiliary GPR registers"); +} + +std::vector get_aux_gprs(const std::vector& used_gpr_idxs, size_t count) { + std::vector result; + std::vector current_used = used_gpr_idxs; + + for (size_t i = 0; i < count; ++i) { + auto aux_reg = get_aux_gpr(current_used); + result.push_back(aux_reg); + current_used.push_back(aux_reg.getIdx()); + } + + return result; +} + +Xbyak_riscv::Reg init_memory_access_aux_gpr(const std::vector& used_gpr_reg_idxs, + const std::vector& aux_gpr_idxs, + std::set& regs_to_spill) { + if (!aux_gpr_idxs.empty()) { + return Xbyak_riscv::Reg(static_cast(aux_gpr_idxs.front())); + } + + // Find an available register and mark it for spilling + auto aux_reg = get_aux_gpr(used_gpr_reg_idxs); + regs_to_spill.insert({snippets::RegType::gpr, static_cast(aux_reg.getIdx())}); + return aux_reg; +} + +void 
push_ptr_with_runtime_offset_on_stack(ov::intel_cpu::riscv64::jit_generator_t* h, + int32_t stack_offset, + const Xbyak_riscv::Reg& ptr_reg, + const std::vector& aux_regs, + size_t runtime_offset) { + OPENVINO_ASSERT(aux_regs.size() >= 3, "Need at least 3 auxiliary registers"); + + const auto& aux1 = aux_regs[0]; + const auto& aux2 = aux_regs[1]; + + // Load runtime offset from runtime params + h->lw(aux1, Xbyak_riscv::a0, static_cast(runtime_offset)); + + // Add offset to pointer + h->add(aux2, ptr_reg, aux1); + + // Store adjusted pointer to stack + h->sw(aux2, Xbyak_riscv::sp, stack_offset); +} + +void push_ptr_with_static_offset_on_stack(ov::intel_cpu::riscv64::jit_generator_t* h, + int32_t stack_offset, + const Xbyak_riscv::Reg& ptr_reg, + const std::vector& aux_regs, + size_t ptr_offset) { + OPENVINO_ASSERT(aux_regs.size() >= 2, "Need at least 2 auxiliary registers"); + + if (ptr_offset == 0) { + // Direct store without offset + h->sw(ptr_reg, Xbyak_riscv::sp, stack_offset); + } else { + // Add static offset and store + const auto& aux = aux_regs[0]; + h->addi(aux, ptr_reg, static_cast(ptr_offset)); + h->sw(aux, Xbyak_riscv::sp, stack_offset); + } +} + +} // namespace ov::intel_cpu::riscv64::utils \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/riscv64/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/utils.hpp new file mode 100644 index 00000000000000..90f3c40bdd57d0 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/riscv64/utils.hpp @@ -0,0 +1,87 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "nodes/kernels/riscv64/jit_generator.hpp" +#include "snippets/emitter.hpp" +#include "snippets/lowered/expression_port.hpp" +#include "xbyak_riscv/xbyak_riscv.hpp" + +namespace ov::intel_cpu::riscv64::utils { + +inline static std::vector transform_idxs_to_regs(const std::vector& idxs) { + std::vector regs; + regs.reserve(idxs.size()); + std::transform(idxs.begin(), idxs.end(), std::back_inserter(regs), [](size_t idx) { + return Xbyak_riscv::Reg(static_cast(idx)); + }); + return regs; +} + +/** + * @brief Find the available register from the pool excepting: a0, a1, sp, ra and `used_gpr_idxs` + * @param used_gpr_idxs current used gpr register indexes + * @return register + */ +Xbyak_riscv::Reg get_aux_gpr(const std::vector& used_gpr_idxs); + +/** + * @brief Find multiple available registers from the pool excepting: a0, a1, sp, ra and `used_gpr_idxs` + * @param used_gpr_idxs current used gpr register indexes + * @param count number of auxiliary registers needed (default: 3) + * @return vector of registers + */ +std::vector get_aux_gprs(const std::vector& used_gpr_idxs, size_t count = 3); + +/** + * @brief Returns an auxiliary GPR register. Returns a register from `aux_gpr_idxs`. + * If it's empty, then choose a register that is not in `used_gpr_reg_idxs` and add it to `regs_to_spill`. + * @param used_gpr_reg_idxs register indexes reserved to store memory pointers in this emitter + * @param aux_gpr_idxs pool of available gp register indexes + * @param regs_to_spill set of live registers to be spilled before ABI call + */ +Xbyak_riscv::Reg init_memory_access_aux_gpr(const std::vector& used_gpr_reg_idxs, + const std::vector& aux_gpr_idxs, + std::set& regs_to_spill); + +/** + * @brief Push data pointer on stack adding offset. 
The offset is taken from runtime params `a0` + * @param h generator + * @param stack_offset stack offset + * @param ptr_reg register containing data pointer + * @param aux_regs vector of available auxiliary registers (must contain >= 3 registers, ptr_reg must not be in this + * vector) + * @param runtime_offset offset in runtime params `a0` + */ +void push_ptr_with_runtime_offset_on_stack(ov::intel_cpu::riscv64::jit_generator_t* h, + int32_t stack_offset, + const Xbyak_riscv::Reg& ptr_reg, + const std::vector& aux_regs, + size_t runtime_offset); + +/** + * @brief Push data pointer on stack adding static offset `ptr_offset` + * Note: This helper doesn't allocate stack space - the user should guarantee allocated space on stack + * @param h generator + * @param stack_offset stack offset + * @param ptr_reg register containing data pointer + * @param aux_regs vector of available auxiliary registers (must contain >= 2 registers, ptr_reg must not be in this + * vector) + * @param ptr_offset offset which will be added to data pointer + */ +void push_ptr_with_static_offset_on_stack(ov::intel_cpu::riscv64::jit_generator_t* h, + int32_t stack_offset, + const Xbyak_riscv::Reg& ptr_reg, + const std::vector& aux_regs, + size_t ptr_offset); + +} // namespace ov::intel_cpu::riscv64::utils \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp index 67a7ae636c03ee..ff37a30877fc09 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.cpp @@ -11,6 +11,7 @@ #include "emitters/utils.hpp" #include "openvino/core/except.hpp" #include "openvino/core/type.hpp" +#include "snippets/lowered/expression.hpp" #include "snippets/lowered/expression_port.hpp" #include "snippets/lowered/expressions/buffer_expression.hpp" #include "snippets/op/loop.hpp" @@ -50,4 +51,16 @@ size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port) return id; } +size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { + OPENVINO_ASSERT(expr, "Expression must not be null"); + OPENVINO_ASSERT(expr->get_input_count() == 1, "MemoryAccess must have one parent"); + return get_buffer_cluster_id(expr->get_input_port(0)); +} + +size_t get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { + OPENVINO_ASSERT(expr, "Expression must not be null"); + OPENVINO_ASSERT(expr->get_output_count() == 1, "MemoryAccess must have one output"); + return get_buffer_cluster_id(expr->get_output_port(0)); +} + } // namespace ov::intel_cpu::utils diff --git a/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp index f1399b13d89fcc..41011ae3e2e282 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/utils/utils.hpp @@ -6,10 +6,13 @@ #include +#include "snippets/lowered/expression.hpp" #include "snippets/lowered/expression_port.hpp" namespace ov::intel_cpu::utils { size_t get_buffer_cluster_id(const ov::snippets::lowered::ExpressionPort& port); +size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); +size_t get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); } // namespace ov::intel_cpu::utils diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp 
b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 5e87c889b95e5d..97af09917c9e6b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -490,23 +489,6 @@ snippets::CompiledSnippetPtr intel_cpu::CPUTargetMachine::get_snippet() { return result; } -intel_cpu::CompiledSnippetCPU::CompiledSnippetCPU(std::unique_ptr h) - : h_compiled(std::move(h)) { - OPENVINO_ASSERT(h_compiled && h_compiled->jit_ker(), "Got invalid jit generator or kernel was nopt compiled"); -} - -const uint8_t* intel_cpu::CompiledSnippetCPU::get_code() const { - return h_compiled->jit_ker(); -} - -size_t intel_cpu::CompiledSnippetCPU::get_code_size() const { - return h_compiled->getSize(); -} - -bool intel_cpu::CompiledSnippetCPU::empty() const { - return get_code_size() == 0; -} - intel_cpu::CPUGenerator::CPUGenerator(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) : Generator(std::make_shared(host_isa, std::move(cache))) {} intel_cpu::CPUGenerator::CPUGenerator(const std::shared_ptr& target) : Generator(target) {} diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.hpp index 7e29e22e4d68c0..1de839e5ee0d0b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.hpp @@ -12,6 +12,7 @@ #include "cache/multi_cache.h" #include "cpu/x64/jit_generator.hpp" +#include "emitters/snippets/common/compiled_snippet_cpu.hpp" #include "openvino/core/node.hpp" #include "openvino/core/node_output.hpp" #include "snippets/emitter.hpp" @@ -24,15 +25,7 @@ namespace ov::intel_cpu { -class CompiledSnippetCPU : public snippets::CompiledSnippet { - const std::unique_ptr h_compiled; - -public: - [[nodiscard]] const uint8_t* get_code() const override; - [[nodiscard]] size_t get_code_size() const override; - [[nodiscard]] bool empty() const override; - explicit CompiledSnippetCPU(std::unique_ptr h); -}; +using CompiledSnippetCPU = ov::intel_cpu::CompiledSnippetCPUCommon; class CPUTargetMachine : public snippets::TargetMachine { public: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.cpp index b737f3dadb1f4f..0b0850534a5d34 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.cpp @@ -18,6 +18,7 @@ #include #include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/common/jit_loop_args_helper.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" #include "emitters/snippets/x64/utils.hpp" #include "emitters/utils.hpp" @@ -130,7 +131,7 @@ jit_loop_end_base_emitter::jit_loop_end_base_emitter(jit_generator_t* h, m_evaluate_once = loop_end->get_evaluate_once(); m_are_ptr_increments_dynamic = ov::snippets::utils::has_dynamic_values(loop_end->get_ptr_increments()); m_are_final_offsets_dynamic = ov::snippets::utils::has_dynamic_values(loop_end->get_finalization_offsets()); - m_loop_args = compose_loop_args(loop_end); + m_loop_args = ov::intel_cpu::snippets_common::compose_loop_args(loop_end); } ov::snippets::lowered::ExpressionPtr jit_loop_end_base_emitter::get_loop_begin_expr( @@ -141,40 +142,6 @@ 
ov::snippets::lowered::ExpressionPtr jit_loop_end_base_emitter::get_loop_begin_e return begin_expr; } -jit_snippets_call_args::loop_args_t jit_loop_end_base_emitter::compose_loop_args( - const std::shared_ptr& loop_end) { - const auto& ptr_increments = loop_end->get_ptr_increments(); - const auto& fin_offsets = loop_end->get_finalization_offsets(); - const auto& is_incremented = loop_end->get_is_incremented(); - const auto wa_increment = loop_end->get_increment(); - - const auto int_work_amount = ov::snippets::utils::is_dynamic_value(loop_end->get_work_amount()) - ? ov::snippets::utils::get_dynamic_value() - : static_cast(loop_end->get_work_amount()); - auto loop_args = jit_snippets_call_args::loop_args_t(int_work_amount, ptr_increments, fin_offsets); - - const auto& data_sizes = loop_end->get_element_type_sizes(); - for (int64_t i = 0; i < loop_args.m_num_data_ptrs; ++i) { - // Increments for non-incremented indices should be zeroed - if (!is_incremented[i]) { - loop_args.m_ptr_increments[i] = 0; - loop_args.m_finalization_offsets[i] = 0; - continue; - } - - // Note: behavior is aligned with runtime configurator: - // data_sizes and increment are already taken into account in the offsets - if (!ov::snippets::utils::is_dynamic_value(loop_args.m_ptr_increments[i])) { - loop_args.m_ptr_increments[i] *= (wa_increment * data_sizes[i]); - } - if (!ov::snippets::utils::is_dynamic_value(loop_args.m_finalization_offsets[i])) { - loop_args.m_finalization_offsets[i] *= data_sizes[i]; - } - } - - return loop_args; -} - void jit_loop_end_base_emitter::validate_arguments(const std::vector& in, const std::vector& out) const { OV_CPU_JIT_EMITTER_ASSERT(out.empty(), "Invalid number of out arguments: expected ", 0, " got ", out.size()); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.hpp index 44b200ed2c821a..26a1a70dec83e1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_loop_base_emitters.hpp @@ -80,9 +80,6 @@ class jit_loop_end_base_emitter : public jit_emitter { return 0; } - static jit_snippets_call_args::loop_args_t compose_loop_args( - const std::shared_ptr& loop_end); - // `jit_loop_end_base_emitter` handles manually aux_gpr allocation using `jit_aux_gpr_holder` size_t aux_gprs_count() const override { return 0; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp index 87bce870697263..078e9eb0387a92 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.cpp @@ -17,10 +17,10 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/plugin/x64/jit_load_store_emitters.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/utils/utils.hpp" #include "emitters/utils.hpp" #include "openvino/core/type.hpp" #include "snippets/lowered/expression.hpp" -#include "snippets/lowered/expressions/buffer_expression.hpp" #include "snippets/op/broadcastload.hpp" #include "snippets/op/load.hpp" #include "snippets/op/memory_access.hpp" @@ -54,13 +54,13 @@ jit_memory_emitter::jit_memory_emitter(jit_generator_t* h, OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_input_port(0), "must be input port - memory access"); count = 
memory_access->get_input_count(); compiled_byte_offset = memory_access->get_input_offset(); - buffer_cluster_id = get_parent_buffer_cluster_id(expr); + buffer_cluster_id = ov::intel_cpu::utils::get_parent_buffer_cluster_id(expr); } else if (in_out_type_ == emitter_in_out_map::vec_to_gpr) { OV_CPU_JIT_EMITTER_ASSERT(memory_access->is_memory_access_output_port(0), "must be output port - memory access"); count = memory_access->get_output_count(); compiled_byte_offset = memory_access->get_output_offset(); - buffer_cluster_id = get_consumer_buffer_cluster_id(expr); + buffer_cluster_id = ov::intel_cpu::utils::get_consumer_buffer_cluster_id(expr); } else { OV_CPU_JIT_EMITTER_THROW("unsupported in_out_type"); } @@ -79,26 +79,6 @@ size_t jit_memory_emitter::aux_gprs_count() const { return is_offset_runtime ? 1 : 0; } -size_t jit_memory_emitter::get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { - OV_CPU_JIT_EMITTER_ASSERT(expr->get_input_port_connectors().size() == 1, "MemoryAccess must have one parent"); - const auto& parent_expr = expr->get_input_expr_ptr(0); - if (const auto buffer = ov::as_type_ptr(parent_expr)) { - return buffer->get_cluster_id(); - } - return SIZE_MAX; -} - -size_t jit_memory_emitter::get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr) { - OV_CPU_JIT_EMITTER_ASSERT(expr->get_output_port_connectors().size() == 1, "MemoryAccess must have one consumer"); - const auto& consumers = expr->get_output_port_connector(0)->get_consumers(); - for (const auto& consumer : consumers) { - if (const auto buffer = ov::as_type_ptr(consumer.get_expr())) { - return buffer->get_cluster_id(); - } - } - return SIZE_MAX; -} - std::vector jit_memory_emitter::get_available_aux_gprs() const { OV_CPU_JIT_EMITTER_ASSERT(IMPLICATION(is_offset_runtime, !aux_gpr_idxs.empty()), "If offset is dynamic, memory emitter need to have one aux gpr at least!"); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp index 8fb602e1e19f2f..5066a6dbf1e75b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_memory_emitters.hpp @@ -26,9 +26,6 @@ class jit_memory_emitter : public jit_emitter { emitter_in_out_map in_out_type); protected: - static size_t get_parent_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); - static size_t get_consumer_buffer_cluster_id(const ov::snippets::lowered::ExpressionPtr& expr); - size_t aux_gprs_count() const override; std::vector get_available_aux_gprs() const; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_parallel_loop_emitters.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_parallel_loop_emitters.cpp index acfda4ad8ec0f7..2f03df69bb4988 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_parallel_loop_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_parallel_loop_emitters.cpp @@ -19,6 +19,7 @@ #include "emitters/plugin/x64/jit_emitter.hpp" #include "emitters/plugin/x64/utils.hpp" +#include "emitters/snippets/common/jit_loop_args_helper.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" #include "emitters/snippets/x64/jit_binary_call_emitter.hpp" #include "emitters/snippets/x64/kernel_executors/parallel_loop.hpp" @@ -49,7 +50,7 @@ jit_parallel_loop_begin_emitter::jit_parallel_loop_begin_emitter(jit_generator_t 
m_parallel_section_reg_spiller(std::make_shared(h)) { const auto loop_end_expr = get_loop_end_expr(expr); const auto loop_end = ov::as_type_ptr(loop_end_expr->get_node()); - m_loop_args = jit_loop_end_base_emitter::compose_loop_args(loop_end); + m_loop_args = ov::intel_cpu::snippets_common::compose_loop_args(loop_end); m_is_dynamic = loop_end->has_dynamic_params(); const auto& loop_end_input_regs = loop_end_expr->get_reg_info().first; diff --git a/src/plugins/intel_cpu/src/nodes/executors/riscv64/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/executors/riscv64/subgraph.cpp new file mode 100644 index 00000000000000..42bebe9835ae53 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/riscv64/subgraph.cpp @@ -0,0 +1,90 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "nodes/executors/riscv64/subgraph.hpp" + +#include +#include +#include +#include + +#include "cache/multi_cache.h" +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "nodes/executors/subgraph.hpp" +#include "openvino/core/except.hpp" + +namespace ov::intel_cpu { + +SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphBaseExecutor(snippet_config, + snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + allocator, + kernel_cache) { + m_buffer_scratchpad = allocator(m_internal_buffer_size); +} + +void SubgraphStaticExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args, inMemPtrs, outMemPtrs, m_start_offset_in, m_start_offset_out); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + auto caller = + [&](jit_snippets_call_args& call_args, const std::vector& indexes, [[maybe_unused]] size_t ithr) { + callable(&call_args, indexes.data()); + }; + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +void SubgraphDynamicSpecializedExecutor::exec_impl(const std::vector& inMemPtrs, + const std::vector& outMemPtrs) { + const auto& callable = m_schedule->get_callable(); + + OPENVINO_ASSERT(m_data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); + OPENVINO_ASSERT(m_data_offsets.front().size() == m_parallel_exec_domain.size(), + "Data offsets with invalid ranks detected"); + + // Note: we need to reset KernelExecutorTable to the state that was recorded in the + // SubgraphDynamicSpecializedExecutor constructor because the table might've been used for other shapes + m_reset_exec_table_state(); + + std::vector src_ptrs; + std::vector dst_ptrs; + init_original_ptrs(inMemPtrs, outMemPtrs, src_ptrs, dst_ptrs, m_start_offset_in, m_start_offset_out); + + auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { + init_call_args(call_args); + update_scratchpad_ptr(call_args.buffer_scratchpad_ptr, ithr); + }; + auto caller = + [&](jit_snippets_call_args& call_args, const std::vector& indexes, [[maybe_unused]] size_t ithr) { + update_ptrs(call_args, src_ptrs, dst_ptrs, 
indexes); + callable(&call_args); + }; + + if (m_parallel_exec_domain.size() == rank6D) { + parallel_for6d(initializer, caller); + } else { + parallel_forNd(initializer, caller); + } +} + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/executors/riscv64/subgraph.hpp b/src/plugins/intel_cpu/src/nodes/executors/riscv64/subgraph.hpp new file mode 100644 index 00000000000000..8f6bab0e394e91 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/executors/riscv64/subgraph.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include + +#include "cache/multi_cache.h" +#include "cpu_memory.h" +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "nodes/executors/subgraph.hpp" + +namespace ov::intel_cpu { + +class SubgraphExecutor : public SubgraphBaseExecutor { +public: + SubgraphExecutor(const std::shared_ptr& snippet_config, + const std::shared_ptr& snippet_attrs, + const std::shared_ptr& snippet, + const std::vector& start_offset_in, + const std::vector& start_offset_out, + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); +}; + +class SubgraphStaticExecutor : public SubgraphExecutor, public SubgraphStaticBaseExecutor { +public: + template + SubgraphStaticExecutor(const std::shared_ptr& config, + const std::set& external_ptrs_idces, + size_t in_num, + Args&&... rest) + : SubgraphExecutor(config, std::forward(rest)...), + SubgraphStaticBaseExecutor(external_ptrs_idces, in_num) {} + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +// Dynamic specialized executor is not used on RISCV64 yet, but keep the class for symmetry +class SubgraphDynamicSpecializedExecutor : public SubgraphExecutor, public SubgraphDynamicSpecializedBaseExecutor { +public: + template + SubgraphDynamicSpecializedExecutor(const std::shared_ptr& config, + const std::set& external_ptrs_idces, + size_t in_num, + Args&&... 
rest) + : SubgraphExecutor(config, std::forward(rest)...), + SubgraphDynamicSpecializedBaseExecutor(config, external_ptrs_idces, in_num) { + OPENVINO_THROW("SubgraphDynamicSpecializedExecutor is not supported on RISC-V platform"); + } + + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override; +}; + +} // namespace ov::intel_cpu diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 16fe6ebcc56a4d..adf0799f07fa7a 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -60,15 +60,25 @@ # include "transformations/snippets/aarch64/pass/lowered/adjust_gemm_copy_b_loop_ports.hpp" # include "transformations/snippets/aarch64/pass/lowered/gemm_cpu_blocking.hpp" # include "transformations/snippets/aarch64/pass/lowered/insert_gemm_copy_buffers.hpp" -#endif +#elif defined(OPENVINO_ARCH_RISCV64) +# include -#if !defined(OPENVINO_ARCH_RISCV64) +# include "emitters/snippets/riscv64/cpu_generator.hpp" +# include "executors/riscv64/subgraph.hpp" +#else # include "emitters/snippets/cpu_runtime_configurator.hpp" # include "snippets/lowered/pass/insert_perf_count_verbose.hpp" # include "snippets/lowered/pass/mark_loops.hpp" # include "snippets/pass/propagate_precision.hpp" #endif +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) +# include "snippets/lowered/pass/insert_perf_count_verbose.hpp" +# include "snippets/lowered/pass/mark_loops.hpp" +# include "snippets/pass/propagate_precision.hpp" +#endif + #if defined(OPENVINO_ARCH_X86_64) # include "cache/cache_entry.h" # include "snippets/lowered/pass/init_loops.hpp" @@ -109,7 +119,7 @@ namespace ov::intel_cpu::node { namespace { -#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_RISCV64) struct SubgraphKey { SubgraphKey() = default; SubgraphKey(std::shared_ptr attrs_, std::vector in_shapes_) @@ -190,11 +200,15 @@ struct SubgraphShapeInferResult { } // namespace static _ov_dnnl_cpu_isa getHostIsa() { -#if defined(OPENVINO_ARCH_ARM64) - return dnnl::impl::cpu::aarch64::asimd; -#else +#if defined(OPENVINO_ARCH_X86_64) return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ? 
dnnl::impl::cpu::x64::avx512_core : dnnl::impl::cpu::x64::avx2; +#elif defined(OPENVINO_ARCH_ARM64) + return dnnl::impl::cpu::aarch64::asimd; +#elif defined(OPENVINO_ARCH_RISCV64) + return static_cast<_ov_dnnl_cpu_isa>(ov::intel_cpu::riscv64::gv); +#else + OPENVINO_THROW("Subgraphs code-generator is not supported on this platform"); #endif } @@ -212,8 +226,12 @@ Subgraph::Subgraph(const std::shared_ptr& op, const GraphContext::CPtr std::make_shared(host_isa, context->getSnippetsParamsCache())); #elif defined(OPENVINO_ARCH_X86_64) subgraph_attrs->snippet->set_generator(std::make_shared(host_isa, context->getSnippetsParamsCache())); +#elif defined(OPENVINO_ARCH_RISCV64) + subgraph_attrs->snippet->set_generator( + std::make_shared(static_cast(host_isa), + context->getSnippetsParamsCache())); #else - CPU_NODE_THROW("Subgraphs code-generator is not supported on non-x64 platforms"); + OPENVINO_THROW("Subgraphs code-generator is not supported on this platform"); #endif // Note: we have to update shapeInfer, so it uses the per-thread op::Subgraph copy @@ -773,7 +791,7 @@ void Subgraph::optimizeIR() { } void Subgraph::prepareParams() { -#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) +#if defined(OPENVINO_ARCH_X86_64) || defined(OPENVINO_ARCH_ARM64) || defined(OPENVINO_ARCH_RISCV64) const auto& cache = context->getSnippetsParamsCache(); auto builder = [this, &cache](const SubgraphKey& key) -> std::shared_ptr { diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 93373ee5675f7b..7c745a0317794d 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -41,13 +41,11 @@ #include "openvino/op/fake_quantize.hpp" #include "openvino/op/matmul.hpp" #include "openvino/op/max_pool.hpp" -#include "openvino/op/mish.hpp" #include "openvino/op/paged_attention.hpp" #include "openvino/op/reduce_max.hpp" #include "openvino/op/reduce_sum.hpp" #include "openvino/op/reshape.hpp" #include "openvino/op/result.hpp" -#include "openvino/op/swish.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/util/attr_types.hpp" @@ -196,6 +194,7 @@ # include "openvino/op/logical_xor.hpp" # include "openvino/op/maximum.hpp" # include "openvino/op/minimum.hpp" +# include "openvino/op/mish.hpp" # include "openvino/op/mod.hpp" # include "openvino/op/negative.hpp" # include "openvino/op/not_equal.hpp" @@ -207,6 +206,7 @@ # include "openvino/op/sigmoid.hpp" # include "openvino/op/sqrt.hpp" # include "openvino/op/squared_difference.hpp" +# include "openvino/op/swish.hpp" # include "openvino/op/tanh.hpp" # include "openvino/op/xor.hpp" # include "snippets/utils/utils.hpp" @@ -230,6 +230,7 @@ # include "onednn/dnnl.h" # include "openvino/op/group_normalization.hpp" # include "openvino/op/multiply.hpp" +# include "openvino/op/softmax.hpp" # include "openvino/op/subtract.hpp" # include "snippets/pass/common_optimizations.hpp" # include "snippets/pass/split_dimension_m.hpp" @@ -277,7 +278,6 @@ # include "cpu/x64/cpu_isa_traits.hpp" # include "openvino/op/gru_sequence.hpp" # include "openvino/op/lstm_sequence.hpp" -# include "openvino/op/softmax.hpp" #endif #if !defined(OPENVINO_ARCH_X86_64) && !defined(OPENVINO_ARCH_ARM64) @@ -1173,6 +1173,8 @@ void Transformations::MainSnippets() { return dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2); #elif defined(OPENVINO_ARCH_ARM64) return 
dnnl::impl::cpu::aarch64::mayiuse(dnnl::impl::cpu::aarch64::asimd); +#elif defined(OPENVINO_ARCH_RISCV64) + return true; // RISC-V with Vector Extension supports snippets #endif return false; }; @@ -1196,6 +1198,9 @@ void Transformations::MainSnippets() { // - abi_param1: used for runtime parameters // - RSP: stack related register size_t available_gprs_count = 14; +#elif defined(OPENVINO_ARCH_RISCV64) + // RISC-V has 32 gprs. Similar to ARM, conservatively use 23 available registers. + size_t available_gprs_count = 23; #else size_t available_gprs_count = 0; #endif @@ -1228,9 +1233,7 @@ void Transformations::MainSnippets() { mha_token_enable_transpose_on_output, is_dynamic_mha_token_enabled, mha_supported_transpose_ranks); -#if defined(OPENVINO_ARCH_ARM64) - TokenizeMLPSeqSnippets::Config mlp_seq_config(tokenization_config); -#elif defined(OPENVINO_ARCH_X86_64) +#if defined(OPENVINO_ARCH_X86_64) auto supported_as_postop = [this](const std::shared_ptr& matmul, const std::shared_ptr& node) { if (!pass::FuseBrgemmCPUPostops::can_be_fused_as_postop(node)) { @@ -1249,7 +1252,6 @@ void Transformations::MainSnippets() { #else TokenizeMLPSeqSnippets::Config mlp_seq_config(tokenization_config); #endif - ov::pass::Manager snippetsManager("CPU:Snippets"); snippetsManager.set_per_pass_validation(false); // if callback needed for better perf, enable SnippetsMarkSkipped, and disable TokenizeFCSnippets. @@ -1356,7 +1358,7 @@ void Transformations::MainSnippets() { }; #endif // OPENVINO_ARCH_X86_64 - auto is_supported_op = [](const std::shared_ptr& n) -> bool { + auto is_supported_op = []([[maybe_unused]] const std::shared_ptr& n) -> bool { #if defined(OPENVINO_ARCH_ARM64) // Power on ARM64 only supports power and swish with scalar second inputs auto is_supported_with_scalar_inputs = [](const std::shared_ptr& n) { @@ -1408,6 +1410,9 @@ void Transformations::MainSnippets() { ov::op::v0::Xor>(n)); }; return is_supported(n) || is_supported_with_scalar_inputs(n); +#elif defined(OPENVINO_ARCH_RISCV64) + // Snippets on RISC-V arch are enabled only in tests for now + return false; #else // CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant, // and CPU Plugin does not support Mish for x64 diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp index 70d5fce85438e6..2e5782fb008bb5 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/lora_pattern.cpp @@ -283,6 +283,7 @@ class LoraPatternConvolutionCPUTest : public LoraPatternBaseCPUTest { }; TEST_P(LoraPatternMatmulCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); targetStaticShapes = {{{{1, 20, K}}, {{N, K}}}}; run_test(); CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); @@ -290,6 +291,7 @@ TEST_P(LoraPatternMatmulCPUTest, CompareWithRefs) { } TEST_P(LoraPatternConvolutionCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); targetStaticShapes = {{{1, num_channels, 10, 15}}}; run_test(); CPUTestUtils::CheckNumberOfNodesWithType(compiledModel, "LoRA", 1); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 48d3c5dd16b612..04fa93e65e1b63 100644 --- 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
+++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp
@@ -502,8 +502,8 @@ std::vector disabledTestPatterns() {
     // Issue: 170863
     retVector.emplace_back(R"(smoke_Model_Distribution_MatMul_NoTranspose.*)");
 #endif
-#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_X86_64)
-    // smoke_Snippets test cases are on platforms except x64 and aarch64/arm64
+#if !defined(OPENVINO_ARCH_ARM64) && !defined(OPENVINO_ARCH_X86_64) && !defined(OPENVINO_ARCH_RISCV64)
+    // smoke_Snippets test cases are disabled on all platforms except x64, ARM64 and RISCV64
     retVector.emplace_back(R"(smoke_Snippets.*)");
 #endif
 #if defined(OPENVINO_ARCH_ARM64)
@@ -512,6 +512,11 @@
     retVector.emplace_back(R"(smoke_Snippets_GroupNormalization.*)");
     retVector.emplace_back(R"(smoke_Snippets_PrecisionPropagation_Convertion.*)");
 #endif
+#if defined(OPENVINO_ARCH_RISCV64)
+    retVector.emplace_back(R"(smoke_Snippets.*\[.*\?.*\].*)");
+    retVector.emplace_back(R"(smoke_Snippets(?!_Eltwise/Add\.).*)");
+    retVector.emplace_back(R"(.*_enforceSnippets=1.*)");
+#endif
 #if defined(_WIN32)
     retVector.emplace_back(R"(.*smoke_QuantizedConvolutionBatchNormTransposeOnWeights/QuantizedConvolutionBatchNorm.CompareWithRefs/conv_type=convolution_quantize_type=fake_quantize_intervals_type=per_(tensor|channel)_transpose_on_weights=true_device=CPU.*)");
     retVector.emplace_back(R"(.*smoke_LPT/ConvolutionTransformation.CompareWithRefImpl/f32_\[(1|4),3,16,16\]_CPU_f32_rank=4D_fq_on_data=\{level=256_shape=\[1,1,1,1\]_input_low=\{ 0 \}_input_high=\{ 255 \}_output_low=\{ -12.7 \}_output_high\{ 12.8 \}_precision=\}_fq_on_weights=\{_255_\[1,1,1,1\]_\{ -12.7 \}_\{ 12.7 \}\}.*)");
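As a side note on the new RISC-V skip patterns above: a quick way to sanity-check which test names they catch is to run them through std::regex. The sketch below assumes the skip config applies each pattern with regex-search semantics against the full test name; the test names themselves are made up for illustration:

#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Patterns added for RISC-V in skip_tests_config.cpp (copied from the patch).
    const std::vector<std::string> patterns = {
        R"(smoke_Snippets.*\[.*\?.*\].*)",        // dynamic shapes ('?' inside the shape brackets)
        R"(smoke_Snippets(?!_Eltwise/Add\.).*)",  // every Snippets case except Eltwise Add
        R"(.*_enforceSnippets=1.*)",
    };
    // Hypothetical test names, for illustration only.
    const std::vector<std::string> names = {
        "smoke_Snippets_Eltwise/Add.CompareWithRefImpl/IS[0]=[1,64,10,10]",
        "smoke_Snippets_Eltwise/Subtract.CompareWithRefImpl/IS[0]=[1,64,10,10]",
        "smoke_Snippets_Eltwise/Add.CompareWithRefImpl/IS[0]=[?,64,?,10]",
    };
    for (const auto& name : names) {
        bool disabled = false;
        for (const auto& p : patterns) {
            if (std::regex_search(name, std::regex(p))) {  // assumed matching semantics
                disabled = true;
                break;
            }
        }
        std::cout << (disabled ? "disabled: " : "enabled:  ") << name << '\n';
    }
    return 0;
}

With these inputs only the static-shape Eltwise Add case stays enabled, which matches the stated intent of enabling Snippets on RISC-V for that single smoke case for now.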