Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,21 +33,21 @@ jobs:
name: Arm Cortex-A72 (Raspberry Pi 4) benchmarks
bench_pmu: PMU
archflags: -mcpu=cortex-a72 -DMLD_SYS_AARCH64_SLOW_BARREL_SHIFTER
cflags: "-flto -DMLD_FORCE_AARCH64"
cflags: "-DMLD_FORCE_AARCH64"
bench_extra_args: ""
only_no_opt: false
- system: rpi5
name: Arm Cortex-A76 (Raspberry Pi 5) benchmarks
bench_pmu: PERF
archflags: "-mcpu=cortex-a76 -march=armv8.2-a"
cflags: "-flto -DMLD_FORCE_AARCH64"
cflags: "-DMLD_FORCE_AARCH64"
bench_extra_args: ""
only_no_opt: false
- system: a55
name: Arm Cortex-A55 (Snapdragon 888) benchmarks
bench_pmu: PERF
archflags: "-mcpu=cortex-a55 -march=armv8.2-a"
cflags: "-flto -static -DMLD_FORCE_AARCH64"
cflags: "-static -DMLD_FORCE_AARCH64"
bench_extra_args: -w exec-on-a55
only_no_opt: false
- system: bpi
Expand Down Expand Up @@ -109,43 +109,43 @@ jobs:
ec2_instance_type: t4g.small
ec2_ami: ubuntu-latest (aarch64)
archflags: -mcpu=cortex-a76 -march=armv8.2-a
cflags: "-flto -DMLD_FORCE_AARCH64"
cflags: "-DMLD_FORCE_AARCH64"
perf: PERF
- name: Graviton3
ec2_instance_type: c7g.medium
ec2_ami: ubuntu-latest (aarch64)
archflags: -march=armv8.4-a+sha3
cflags: "-flto -DMLD_FORCE_AARCH64"
cflags: "-DMLD_FORCE_AARCH64"
perf: PERF
- name: Graviton4
ec2_instance_type: c8g.medium
ec2_ami: ubuntu-latest (aarch64)
archflags: -march=armv9-a+sha3
cflags: "-flto -DMLD_FORCE_AARCH64"
cflags: "-DMLD_FORCE_AARCH64"
perf: PERF
- name: AMD EPYC 4th gen (c7a)
ec2_instance_type: c7a.medium
ec2_ami: ubuntu-latest (x86_64)
archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=znver4
cflags: "-flto -DMLD_FORCE_X86_64"
cflags: "-DMLD_FORCE_X86_64"
perf: PMU
- name: Intel Xeon 4th gen (c7i)
ec2_instance_type: c7i.metal-24xl
ec2_ami: ubuntu-latest (x86_64)
archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=sapphirerapids
cflags: "-flto -DMLD_FORCE_X86_64"
cflags: "-DMLD_FORCE_X86_64"
perf: PMU
- name: AMD EPYC 3rd gen (c6a)
ec2_instance_type: c6a.large
ec2_ami: ubuntu-latest (x86_64)
archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=znver3
cflags: "-flto -DMLD_FORCE_X86_64"
cflags: "-DMLD_FORCE_X86_64"
perf: PMU
- name: Intel Xeon 3rd gen (c6i)
ec2_instance_type: c6i.large
ec2_ami: ubuntu-latest (x86_64)
archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=icelake-server
cflags: "-flto -DMLD_FORCE_X86_64"
cflags: "-DMLD_FORCE_X86_64"
perf: PMU
uses: ./.github/workflows/bench_ec2_reusable.yml
if: github.repository_owner == 'pq-code-package' && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main')
Expand Down
31 changes: 31 additions & 0 deletions mldsa/native/x86_64/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#define MLD_USE_NATIVE_NTT_CUSTOM_ORDER
#define MLD_USE_NATIVE_NTT
#define MLD_USE_NATIVE_INTT
#define MLD_USE_NATIVE_POLY_POINTWISE_MONTGOMERY
#define MLD_USE_NATIVE_POLYVECK_POINTWISE_POLY_MONTGOMERY

#if !defined(__ASSEMBLER__)
#include <string.h>
Expand All @@ -34,6 +36,35 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
}

static MLD_INLINE void mld_poly_pointwise_montgomery_native(
int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
{
mld_poly_pointwise_montgomery_asm(c, a, b);
}

#if MLDSA_K == 4
static MLD_INLINE void mld_polyveck_pointwise_poly_montgomery_native(
int32_t r[MLDSA_K * MLDSA_N], const int32_t a[MLDSA_N],
const int32_t v[MLDSA_K * MLDSA_N])
{
mld_polyveck_pointwise_poly_montgomery_k4_asm(r, a, v);
}
#elif MLDSA_K == 6
static MLD_INLINE void mld_polyveck_pointwise_poly_montgomery_native(
int32_t r[MLDSA_K * MLDSA_N], const int32_t a[MLDSA_N],
const int32_t v[MLDSA_K * MLDSA_N])
{
mld_polyveck_pointwise_poly_montgomery_k6_asm(r, a, v);
}
#elif MLDSA_K == 8
static MLD_INLINE void mld_polyveck_pointwise_poly_montgomery_native(
int32_t r[MLDSA_K * MLDSA_N], const int32_t a[MLDSA_N],
const int32_t v[MLDSA_K * MLDSA_N])
{
mld_polyveck_pointwise_poly_montgomery_k8_asm(r, a, v);
}
#endif /* MLDSA_K == 8 */

#endif /* !__ASSEMBLER__ */

#endif /* !MLD_NATIVE_X86_64_META_H */
20 changes: 20 additions & 0 deletions mldsa/native/x86_64/src/arith_native_x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,24 @@ void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
#define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
void mld_nttunpack_avx2(__m256i *r);

#define mld_poly_pointwise_montgomery_asm \
MLD_NAMESPACE(poly_pointwise_montgomery_asm)
void mld_poly_pointwise_montgomery_asm(int32_t *c, const int32_t *a,
const int32_t *b);

#define mld_polyveck_pointwise_poly_montgomery_k4_asm \
MLD_NAMESPACE(polyveck_pointwise_poly_montgomery_k4_asm)
void mld_polyveck_pointwise_poly_montgomery_k4_asm(int32_t *r, const int32_t *a,
const int32_t *v);

#define mld_polyveck_pointwise_poly_montgomery_k6_asm \
MLD_NAMESPACE(polyveck_pointwise_poly_montgomery_k6_asm)
void mld_polyveck_pointwise_poly_montgomery_k6_asm(int32_t *r, const int32_t *a,
const int32_t *v);

#define mld_polyveck_pointwise_poly_montgomery_k8_asm \
MLD_NAMESPACE(polyveck_pointwise_poly_montgomery_k8_asm)
void mld_polyveck_pointwise_poly_montgomery_k8_asm(int32_t *r, const int32_t *a,
const int32_t *v);

#endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
78 changes: 78 additions & 0 deletions mldsa/native/x86_64/src/mld_poly_pointwise_montgomery.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT)

.macro montgomery_reduce_avx2 res, a, qinv, q
// Montgomery reduction implementation using AVX2 instructions
// Input: a (product of coefficients)
// Output: res (reduced result)
// Constants: qinv (QINV = 58728449), q (MLDSA_Q = 8380417)

// 1. t = a & 0xFFFFFFFF (low 32 bits)
// 2. m = t * QINV & 0xFFFFFFFF
vpmulld \qinv, \a, %ymm5 // ymm5 = t * QINV (low 32 bits)

// 3. t = (a + m*Q) >> 32
vpmulld \q, %ymm5, %ymm6 // ymm6 = m * Q (low 32 bits)
vpaddq \a, %ymm6, %ymm7 // ymm7 = a + m*Q
vpsrlq $32, %ymm7, \res // res = (a + m*Q) >> 32
.endm

.text
.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
.balign 4
MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm)
// Save registers
push %rbx
push %r12
push %r13

// Load parameters
// rdi: pointer to output polynomial c
// rsi: pointer to input polynomial a
// rdx: pointer to input polynomial b

// Load constants
movabs $58728449, %rax // QINV = 58728449
vpbroadcastd %eax, %ymm0 // Broadcast QINV to all elements of ymm0

movabs $8380417, %rax // MLDSA_Q = 8380417
vpbroadcastd %eax, %ymm1 // Broadcast MLDSA_Q to all elements of ymm1

// Process 8 coefficients at a time (32 iterations for 256 coefficients)
mov $32, %rbx

loop_start:
// Load 8 coefficients from each polynomial
vmovdqa (%rsi), %ymm2 // Load 8 coefficients from a
vmovdqa (%rdx), %ymm3 // Load 8 coefficients from b

// Multiply coefficients
vpmulld %ymm2, %ymm3, %ymm4 // ymm4 = a[i] * b[i] (low 32 bits)

// Apply Montgomery reduction
montgomery_reduce_avx2 %ymm4, %ymm4, %ymm0, %ymm1

// Store result
vmovdqa %ymm4, (%rdi)

// Advance pointers
add $32, %rsi
add $32, %rdx
add $32, %rdi

// Decrement counter and loop
dec %rbx
jnz loop_start

// Restore registers and return
pop %r13
pop %r12
pop %rbx
ret

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT */
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT)

.text
.global MLD_ASM_NAMESPACE(polyveck_pointwise_poly_montgomery_k4_asm)
.balign 4
MLD_ASM_FN_SYMBOL(polyveck_pointwise_poly_montgomery_k4_asm)
// Save registers
push %rbx
push %r12
push %r13
push %r14

// Load parameters
// rdi: pointer to output vector r
// rsi: pointer to input polynomial a
// rdx: pointer to input vector v

// Save parameters
mov %rdi, %r12 // r
mov %rsi, %r13 // a
mov %rdx, %r14 // v

// Call poly_pointwise_montgomery for each polynomial in the vector

// First polynomial
mov %r12, %rdi // r[0]
mov %r13, %rsi // a
mov %r14, %rdx // v[0]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Second polynomial
lea 1024(%r12), %rdi // r[1]
mov %r13, %rsi // a
lea 1024(%r14), %rdx // v[1]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Third polynomial
lea 2048(%r12), %rdi // r[2]
mov %r13, %rsi // a
lea 2048(%r14), %rdx // v[2]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Fourth polynomial
lea 3072(%r12), %rdi // r[3]
mov %r13, %rsi // a
lea 3072(%r14), %rdx // v[3]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Restore registers and return
pop %r14
pop %r13
pop %r12
pop %rbx
ret

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT */
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

#include "../../../common.h"
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT)

.text
.global MLD_ASM_NAMESPACE(polyveck_pointwise_poly_montgomery_k6_asm)
.balign 4
MLD_ASM_FN_SYMBOL(polyveck_pointwise_poly_montgomery_k6_asm)
// Save registers
push %rbx
push %r12
push %r13
push %r14

// Load parameters
// rdi: pointer to output vector r
// rsi: pointer to input polynomial a
// rdx: pointer to input vector v

// Save parameters
mov %rdi, %r12 // r
mov %rsi, %r13 // a
mov %rdx, %r14 // v

// Call poly_pointwise_montgomery for each polynomial in the vector

// First polynomial
mov %r12, %rdi // r[0]
mov %r13, %rsi // a
mov %r14, %rdx // v[0]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Second polynomial
lea 1024(%r12), %rdi // r[1]
mov %r13, %rsi // a
lea 1024(%r14), %rdx // v[1]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Third polynomial
lea 2048(%r12), %rdi // r[2]
mov %r13, %rsi // a
lea 2048(%r14), %rdx // v[2]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Fourth polynomial
lea 3072(%r12), %rdi // r[3]
mov %r13, %rsi // a
lea 3072(%r14), %rdx // v[3]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Fifth polynomial
lea 4096(%r12), %rdi // r[4]
mov %r13, %rsi // a
lea 4096(%r14), %rdx // v[4]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Sixth polynomial
lea 5120(%r12), %rdi // r[5]
mov %r13, %rsi // a
lea 5120(%r14), %rdx // v[5]
call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)

// Restore registers and return
pop %r14
pop %r13
pop %r12
pop %rbx
ret

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT */
Loading
Loading