7 changes: 7 additions & 0 deletions src/inference/dev_api/openvino/runtime/system_conf.hpp
@@ -90,6 +90,13 @@ OPENVINO_RUNTIME_API bool with_cpu_neon_fp16();
*/
OPENVINO_RUNTIME_API bool with_cpu_arm_dotprod();

+/**
+ * @brief Checks whether CPU supports ARM Int8 MM capability
+ * @ingroup ov_dev_api_system_conf
+ * @return `true` if ARM Int8 MM instructions are available, `false` otherwise
+ */
+OPENVINO_RUNTIME_API bool with_cpu_arm_i8mm();
+
/**
* @brief Checks whether CPU supports ARM SVE capability
* @ingroup ov_dev_api_system_conf
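Note: callers are expected to branch on this predicate at runtime rather than on compile-time macros. A minimal sketch of such a dispatch, assuming the usual `ov::` namespace for the dev API; the `Int8Kernel` enum and `pick_int8_kernel` helper are illustrative, not part of this PR:

```cpp
#include "openvino/runtime/system_conf.hpp"

// Illustrative dispatcher: prefer the i8mm kernel when the CPU reports it,
// fall back to dotprod, and finally to a generic reference path.
enum class Int8Kernel { I8mm, DotProd, Reference };

inline Int8Kernel pick_int8_kernel() {
    if (ov::with_cpu_arm_i8mm()) {
        return Int8Kernel::I8mm;
    }
    if (ov::with_cpu_arm_dotprod()) {
        return Int8Kernel::DotProd;
    }
    return Int8Kernel::Reference;
}
```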
23 changes: 23 additions & 0 deletions src/inference/src/system_conf.cpp
@@ -123,6 +123,10 @@ bool with_cpu_arm_dotprod() {
return false;
}

+bool with_cpu_arm_i8mm() {
+    return false;
+}
+
#else // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64

bool with_cpu_x86_sse42() {
@@ -216,6 +220,25 @@ bool with_cpu_arm_dotprod() {
# endif
}

+bool with_cpu_arm_i8mm() {
+# if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+     !defined(__arm__) && defined(__aarch64__)
+    const uint32_t hwcaps = getauxval(AT_HWCAP2);
+    return hwcaps & HWCAP2_I8MM;
+# elif !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+     !defined(__aarch64__) && defined(__arm__)
+    return false;
+# elif defined(__aarch64__) && defined(__APPLE__)
+    int64_t result(0);
+    size_t size = sizeof(result);
+    const std::string& cap = "hw.optional.arm.FEAT_I8MM";
+    sysctlbyname(cap.c_str(), &result, &size, NULL, 0);
+    return result > 0;
+# else
+    return false;
+# endif
+}
+
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64

bool check_open_mp_env_vars(bool include_omp_num_threads) {
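For reference, the detection above can be reproduced as a standalone probe. One subtlety: on Linux/aarch64 the I8MM flag is reported in the second hwcap word, so it must be read via getauxval(AT_HWCAP2), not AT_HWCAP; on Apple silicon the equivalent signal is the hw.optional.arm.FEAT_I8MM sysctl. A minimal sketch (assumes an aarch64 Linux or macOS toolchain and degrades to false elsewhere):

```cpp
#include <cstdint>
#include <cstdio>

#if defined(__aarch64__) && defined(__linux__)
#    include <asm/hwcap.h>  // HWCAP2_I8MM
#    include <sys/auxv.h>   // getauxval, AT_HWCAP2
#elif defined(__aarch64__) && defined(__APPLE__)
#    include <sys/sysctl.h>  // sysctlbyname
#endif

// Standalone I8MM probe mirroring the logic in this PR.
static bool cpu_has_i8mm() {
#if defined(__aarch64__) && defined(__linux__) && defined(HWCAP2_I8MM)
    return (getauxval(AT_HWCAP2) & HWCAP2_I8MM) != 0;
#elif defined(__aarch64__) && defined(__APPLE__)
    int64_t result = 0;
    size_t size = sizeof(result);
    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &result, &size, nullptr, 0) != 0) {
        return false;  // key not present on older macOS releases
    }
    return result > 0;
#else
    return false;  // unsupported platform, or toolchain without HWCAP2_I8MM
#endif
}

int main() {
    std::printf("i8mm supported: %s\n", cpu_has_i8mm() ? "yes" : "no");
    return 0;
}
```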
@@ -50,7 +50,7 @@ static bool useDynamicQuantizationImpl(const FCAttrs& attrs, const MemoryDescPtr
return false;
}

-    if (!hasIntDotProductSupport()) {
+    if (!hasIntDotProductSupport() || !hasInt8MMSupport()) {
return false;
}

@@ -124,17 +124,18 @@ MatMulKleidiAIExecutor::MatMulKleidiAIExecutor(const FCAttrs& attrs,
0,
nullptr);
} else {
+        ukernel_i8 = hasInt8MMSupport() ? &ukernel_i8_imm : &ukernel_i8_dotprod;
MemoryPtr weightsMemory = memory.at(ARG_WEI);
if (!attrs.weightsNonTransposed) {
auto dnnlSrcDesc = MemoryDescUtils::convertToDnnlMemoryDesc(originalWeightsDesc);
auto dnnlDstDesc = acl_fc_executor::makeTransposedWeightDescriptor(dnnlSrcDesc, dnnlSrcDesc);
weightsMemory = acl_fc_executor::reorderData(dnnlSrcDesc, dnnlDstDesc, memory.at(ARG_WEI), context);
}

-        mr = ukernel_i8.get_mr();
-        nr = ukernel_i8.get_nr();
-        kr = ukernel_i8.get_kr();
-        sr = ukernel_i8.get_sr();
+        mr = ukernel_i8->get_mr();
+        nr = ukernel_i8->get_nr();
+        kr = ukernel_i8->get_kr();
+        sr = ukernel_i8->get_sr();

auto* bias = biasMem->getDataAs<float>();
auto* rhs_native_qs8cx = weightsMemory->getDataAs<int8_t>();
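The ternary selection above works because both tables instantiate the same KleidiAI interface type (kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel), a plain struct of function pointers, so every call site stays identical regardless of which variant was picked. A reduced sketch of the pattern, with illustrative names and values:

```cpp
#include <cstddef>

// Two kernel variants behind one table of function pointers.
struct UkernelIface {
    size_t (*get_mr)();
    size_t (*get_nr)();
};

static size_t mr_dotprod() { return 1; }
static size_t nr_dotprod() { return 4; }
static size_t mr_i8mm() { return 4; }
static size_t nr_i8mm() { return 4; }

static constexpr UkernelIface ukernel_dotprod{mr_dotprod, nr_dotprod};
static constexpr UkernelIface ukernel_i8mm{mr_i8mm, nr_i8mm};

size_t blocking_for(bool has_i8mm) {
    // Bind the pointer once; later uses all go through the same indirection.
    const UkernelIface* uk = has_i8mm ? &ukernel_i8mm : &ukernel_dotprod;
    return uk->get_mr();
}
```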
@@ -184,7 +185,10 @@ bool MatMulKleidiAIExecutor::update(const MemoryArgs& memory) {
}
// Assign LHS memory
if (useDynamicQuant) {
-        const size_t lhsPackedSize = kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(M, K, mr, kr, sr);
+        const size_t _m_blocks = (M + BLOCK_SIZE_M_INT8 - 1) / BLOCK_SIZE_M_INT8;
+        packed_lhs_block_in_bytes_int8 =
+            kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32(BLOCK_SIZE_M_INT8, K, mr, kr, sr);
+        const size_t lhsPackedSize = packed_lhs_block_in_bytes_int8 * _m_blocks;
auto lhsPackedDesc = std::make_shared<CpuBlockedMemoryDesc>(i8, Shape({lhsPackedSize}));
lhsPackedMem = scratchPad->createScratchPadMem(lhsPackedDesc);
}
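The scratch buffer is now sized per M block rather than for the whole LHS in one call: ceil(M / BLOCK_SIZE_M_INT8) blocks, each allocated at full block size, so every block can be packed independently in the parallel loop in execute(). A condensed sketch of the arithmetic (helper names are illustrative):

```cpp
#include <cstddef>

constexpr size_t ceil_div(size_t a, size_t b) {
    return (a + b - 1) / b;
}

// Total packed-LHS scratch: one fixed-size block per tile of block_m rows.
// The tail block is allocated at full size for simplicity.
size_t packed_lhs_scratch_bytes(size_t M, size_t block_m, size_t block_bytes) {
    return ceil_div(M, block_m) * block_bytes;
}

// Example: M = 100, block_m = 16 -> 7 blocks; scratch = 7 * block_bytes.
```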
@@ -237,38 +241,45 @@ void MatMulKleidiAIExecutor::execute(const MemoryArgs& memory) {
auto* lhs_packed_qa8dx = lhsPackedMem->getDataAs<int8_t>();
auto* rhs_packed_qs8cx = rhsPackedMem->getDataAs<int8_t>();

-    kai_run_lhs_quant_pack_qai8dxp_f32(M,
-                                       K,  // Dimensions
-                                       mr,
-                                       kr,
-                                       sr,
-                                       0,                // Packing dimensions
-                                       lhs,              // LHS (F32)
-                                       lhs_stride,       // LHS stride
-                                       lhs_packed_qa8dx  // LHS packed
-    );
-
-    const size_t lhs_packed_offset = ukernel_i8.get_lhs_packed_offset(0, K);
-    const auto* lhs_ptr = static_cast<const void*>(lhs_packed_qa8dx + lhs_packed_offset);
-
-    parallel_for(n_blocks, [&](size_t n_block) {
-        size_t n_start = (n_block * BLOCK_SIZE);
-        size_t n_end = std::min(n_start + BLOCK_SIZE, N);
-        size_t n_block_size = n_end - n_start;
-        const size_t rhs_packed_offset = ukernel_i8.get_rhs_packed_offset(n_start, K);
-        const size_t dst_offset = ukernel_i8.get_dst_offset(0, n_start, dst_stride_row);
-        const auto* rhs_ptr = static_cast<const void*>(rhs_packed_qs8cx + rhs_packed_offset);
-        float* dst_ptr = (dst + dst_offset / sizeof(float));
-        ukernel_i8.run_matmul(M,
-                              n_block_size,
-                              K,
-                              lhs_ptr,
-                              rhs_ptr,
-                              dst_ptr,
-                              dst_stride_row,
-                              dst_stride_col,
-                              FLOAT_MIN,
-                              FLOAT_MAX);
+    constexpr size_t m_step = BLOCK_SIZE_M_INT8;
+    constexpr size_t n_step = 4;
+    const size_t M_BLOCKS = (M + m_step - 1) / m_step;
+    const size_t N_BLOCKS = (N + n_step - 1) / n_step;
+    const size_t lhs_packed_offset = ukernel_i8->get_lhs_packed_offset(0, K);
+
+    parallel_for(M_BLOCKS, [&](size_t m_blk) {
+        const size_t M_iter = std::min(M - m_blk * m_step, m_step);
+        auto* lhs_packed_qa8dx_B = lhs_packed_qa8dx + m_blk * packed_lhs_block_in_bytes_int8;
+
+        kai_run_lhs_quant_pack_qai8dxp_f32(M_iter,
+                                           K,
+                                           mr,
+                                           kr,
+                                           sr,
+                                           0,
+                                           lhs + m_blk * m_step * K,  // LHS (F32)
+                                           lhs_stride,
+                                           lhs_packed_qa8dx_B  // LHS packed output
+        );
+        parallel_for(N_BLOCKS, [&](size_t n_blk) {
+            // run the micro-kernel on the current (m_blk, n_blk) tile
+            const size_t rhs_packed_offset = ukernel_i8->get_rhs_packed_offset(n_blk * n_step, K);
+            const size_t dst_offset = ukernel_i8->get_dst_offset(m_blk * m_step, n_blk * n_step, dst_stride_row);
+            const void* rhs_ptr = static_cast<const void*>(rhs_packed_qs8cx + rhs_packed_offset);
+            const auto* lhs_ptr = static_cast<const void*>(lhs_packed_qa8dx_B + lhs_packed_offset);
+            float* dst_ptr = (dst + dst_offset / sizeof(float));
+            const size_t N_iter = std::min(N - n_blk * n_step, n_step);
+            ukernel_i8->run_matmul(M_iter,
+                                   N_iter,
+                                   K,
+                                   lhs_ptr,
+                                   rhs_ptr,
+                                   dst_ptr,
+                                   dst_stride_row,
+                                   dst_stride_col,
+                                   FLOAT_MIN,
+                                   FLOAT_MAX);
+        });
    });
}
}
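The rewritten loop nest packs each LHS tile once in the outer M loop and reuses it for every N tile, instead of quantizing the whole LHS up front and parallelizing only over N. A runnable sketch of the control flow, with a serial stand-in for OpenVINO's parallel_for and the pack/matmul calls stubbed out as comments:

```cpp
#include <algorithm>
#include <cstddef>

// Serial stand-in for ov::parallel_for, just to make the sketch runnable.
template <typename F>
void parallel_for(size_t n, F&& body) {
    for (size_t i = 0; i < n; ++i) {
        body(i);
    }
}

// Two-level tiling: outer over M (quantize + pack once per tile),
// inner over N (one micro-kernel invocation per (m, n) tile pair).
void blocked_int8_matmul(size_t M, size_t N, size_t m_step, size_t n_step) {
    const size_t m_blocks = (M + m_step - 1) / m_step;
    const size_t n_blocks = (N + n_step - 1) / n_step;
    parallel_for(m_blocks, [&](size_t mb) {
        const size_t m_iter = std::min(M - mb * m_step, m_step);
        // pack_lhs_tile(mb, m_iter);  // stub: quantize + pack this M tile once
        parallel_for(n_blocks, [&](size_t nb) {
            const size_t n_iter = std::min(N - nb * n_step, n_step);
            // run_matmul_tile(mb, nb, m_iter, n_iter);  // stub: kernel call
            (void)m_iter;
            (void)n_iter;
        });
    });
}
```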
@@ -18,6 +18,7 @@

// INT8
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp_qsi8cxp_interface.h"
#include "kai/ukernels/matmul/pack/kai_lhs_quant_pack_qai8dxp_f32.h"
#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h"
@@ -53,7 +54,7 @@ class MatMulKleidiAIExecutor : public Executor {
kai_get_dst_offset_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla,
kai_get_dst_size_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla,
kai_run_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla};
-    static constexpr kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel ukernel_i8{
+    static constexpr kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel ukernel_i8_dotprod{
kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
kai_get_n_step_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
kai_get_mr_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
@@ -65,6 +66,18 @@
kai_get_dst_offset_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
kai_get_dst_size_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod,
kai_run_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4x4_1x4_neon_dotprod};
+    static constexpr kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel ukernel_i8_imm{
+        kai_get_m_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_n_step_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_mr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_nr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_kr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_sr_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_lhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_rhs_packed_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_dst_offset_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_get_dst_size_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm,
+        kai_run_matmul_clamp_f32_qai8dxp4x8_qsi8cxp4x8_16x4_neon_i8mm};

const FCAttrs& m_attrs;
const MemoryArgs& m_memoryArgs;
Expand All @@ -76,7 +89,12 @@ class MatMulKleidiAIExecutor : public Executor {
MemoryCPtr packedWeights;
size_t M = 0UL, N = 0UL, K = 0UL;
size_t mr, nr, kr, sr;
+    // F32 Kernel block size
    static constexpr size_t BLOCK_SIZE = 8;
+    const kai_matmul_clamp_f32_qai8dxp_qsi8cxp_ukernel* ukernel_i8 = nullptr;
+    // INT8 blocking in M dimension for both packing and matmul calls
+    static constexpr size_t BLOCK_SIZE_M_INT8 = 16;
+    size_t packed_lhs_block_in_bytes_int8 = 0UL;
int curNumaNode = -1;
bool useDynamicQuant = false;
};
3 changes: 3 additions & 0 deletions src/plugins/intel_cpu/src/utils/precision_support.cpp
@@ -58,4 +58,7 @@ bool hasIntDotProductSupport() {
return with_cpu_arm_dotprod();
}

+bool hasInt8MMSupport() {
+    return with_cpu_arm_i8mm();
+}
} // namespace ov::intel_cpu
1 change: 1 addition & 0 deletions src/plugins/intel_cpu/src/utils/precision_support.h
@@ -11,5 +11,6 @@ namespace ov::intel_cpu {
bool hasHardwareSupport(const ov::element::Type& precision);
ov::element::Type defaultFloatPrecision();
bool hasIntDotProductSupport();
+bool hasInt8MMSupport();

} // namespace ov::intel_cpu