pq-code-package · jakemas · Jun 9, 2025 · Jun 17, 2025 · Jun 19, 2025 · Jun 19, 2025
@@ -33,21 +33,21 @@ jobs:
           name: Arm Cortex-A72 (Raspberry Pi 4) benchmarks
           bench_pmu: PMU
           archflags: -mcpu=cortex-a72 -DMLD_SYS_AARCH64_SLOW_BARREL_SHIFTER
-          cflags: "-flto -DMLD_FORCE_AARCH64"
+          cflags: "-DMLD_FORCE_AARCH64"
           bench_extra_args: ""
           only_no_opt: false
         - system: rpi5
           name: Arm Cortex-A76 (Raspberry Pi 5) benchmarks
           bench_pmu: PERF
           archflags: "-mcpu=cortex-a76 -march=armv8.2-a"
-          cflags: "-flto -DMLD_FORCE_AARCH64"
+          cflags: "-DMLD_FORCE_AARCH64"
           bench_extra_args: ""
           only_no_opt: false
         - system: a55
           name: Arm Cortex-A55 (Snapdragon 888) benchmarks
           bench_pmu: PERF
           archflags: "-mcpu=cortex-a55 -march=armv8.2-a"
-          cflags: "-flto -static -DMLD_FORCE_AARCH64"
+          cflags: "-static -DMLD_FORCE_AARCH64"
           bench_extra_args: -w exec-on-a55
           only_no_opt: false
         - system: bpi
@@ -109,43 +109,43 @@ jobs:
             ec2_instance_type: t4g.small
             ec2_ami: ubuntu-latest (aarch64)
             archflags: -mcpu=cortex-a76 -march=armv8.2-a
-            cflags: "-flto -DMLD_FORCE_AARCH64"
+            cflags: "-DMLD_FORCE_AARCH64"
             perf: PERF
           - name: Graviton3
             ec2_instance_type: c7g.medium
             ec2_ami: ubuntu-latest (aarch64)
             archflags: -march=armv8.4-a+sha3
-            cflags: "-flto -DMLD_FORCE_AARCH64"
+            cflags: "-DMLD_FORCE_AARCH64"
             perf: PERF
           - name: Graviton4
             ec2_instance_type: c8g.medium
             ec2_ami: ubuntu-latest (aarch64)
             archflags: -march=armv9-a+sha3
-            cflags: "-flto -DMLD_FORCE_AARCH64"
+            cflags: "-DMLD_FORCE_AARCH64"
             perf: PERF
           - name: AMD EPYC 4th gen (c7a)
             ec2_instance_type: c7a.medium
             ec2_ami: ubuntu-latest (x86_64)
             archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=znver4
-            cflags: "-flto -DMLD_FORCE_X86_64"
+            cflags: "-DMLD_FORCE_X86_64"
             perf: PMU
           - name: Intel Xeon 4th gen (c7i)
             ec2_instance_type: c7i.metal-24xl
             ec2_ami: ubuntu-latest (x86_64)
             archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=sapphirerapids
-            cflags: "-flto -DMLD_FORCE_X86_64"
+            cflags: "-DMLD_FORCE_X86_64"
             perf: PMU
           - name: AMD EPYC 3rd gen (c6a)
             ec2_instance_type: c6a.large
             ec2_ami: ubuntu-latest (x86_64)
             archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=znver3
-            cflags: "-flto -DMLD_FORCE_X86_64"
+            cflags: "-DMLD_FORCE_X86_64"
             perf: PMU
           - name: Intel Xeon 3rd gen (c6i)
             ec2_instance_type: c6i.large
             ec2_ami: ubuntu-latest (x86_64)
             archflags: -mavx2 -mbmi2 -mpopcnt -maes -march=icelake-server
-            cflags: "-flto -DMLD_FORCE_X86_64"
+            cflags: "-DMLD_FORCE_X86_64"
             perf: PMU
     uses: ./.github/workflows/bench_ec2_reusable.yml
     if: github.repository_owner == 'pq-code-package' && (github.event.label.name == 'benchmark' || github.ref == 'refs/heads/main')

@@ -14,6 +14,8 @@
 #define MLD_USE_NATIVE_NTT_CUSTOM_ORDER
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POLY_POINTWISE_MONTGOMERY
+#define MLD_USE_NATIVE_POLYVECK_POINTWISE_POLY_MONTGOMERY
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -34,6 +36,35 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
 }
 
+static MLD_INLINE void mld_poly_pointwise_montgomery_native(
+    int32_t c[MLDSA_N], const int32_t a[MLDSA_N], const int32_t b[MLDSA_N])
+{
+  mld_poly_pointwise_montgomery_asm(c, a, b);
+}
+
+#if MLDSA_K == 4
+static MLD_INLINE void mld_polyveck_pointwise_poly_montgomery_native(
+    int32_t r[MLDSA_K * MLDSA_N], const int32_t a[MLDSA_N],
+    const int32_t v[MLDSA_K * MLDSA_N])
+{
+  mld_polyveck_pointwise_poly_montgomery_k4_asm(r, a, v);
+}
+#elif MLDSA_K == 6
+static MLD_INLINE void mld_polyveck_pointwise_poly_montgomery_native(
+    int32_t r[MLDSA_K * MLDSA_N], const int32_t a[MLDSA_N],
+    const int32_t v[MLDSA_K * MLDSA_N])
+{
+  mld_polyveck_pointwise_poly_montgomery_k6_asm(r, a, v);
+}
+#elif MLDSA_K == 8
+static MLD_INLINE void mld_polyveck_pointwise_poly_montgomery_native(
+    int32_t r[MLDSA_K * MLDSA_N], const int32_t a[MLDSA_N],
+    const int32_t v[MLDSA_K * MLDSA_N])
+{
+  mld_polyveck_pointwise_poly_montgomery_k8_asm(r, a, v);
+}
+#endif /* MLDSA_K == 8 */
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
@@ -19,4 +19,24 @@ void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
 #define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
 void mld_nttunpack_avx2(__m256i *r);
 
+#define mld_poly_pointwise_montgomery_asm \
+  MLD_NAMESPACE(poly_pointwise_montgomery_asm)
+void mld_poly_pointwise_montgomery_asm(int32_t *c, const int32_t *a,
+                                       const int32_t *b);
+
+#define mld_polyveck_pointwise_poly_montgomery_k4_asm \
+  MLD_NAMESPACE(polyveck_pointwise_poly_montgomery_k4_asm)
+void mld_polyveck_pointwise_poly_montgomery_k4_asm(int32_t *r, const int32_t *a,
+                                                   const int32_t *v);
+
+#define mld_polyveck_pointwise_poly_montgomery_k6_asm \
+  MLD_NAMESPACE(polyveck_pointwise_poly_montgomery_k6_asm)
+void mld_polyveck_pointwise_poly_montgomery_k6_asm(int32_t *r, const int32_t *a,
+                                                   const int32_t *v);
+
+#define mld_polyveck_pointwise_poly_montgomery_k8_asm \
+  MLD_NAMESPACE(polyveck_pointwise_poly_montgomery_k8_asm)
+void mld_polyveck_pointwise_poly_montgomery_k8_asm(int32_t *r, const int32_t *a,
+                                                   const int32_t *v);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT)
+
+.macro montgomery_reduce_avx2 res, a, qinv, q
+    // Montgomery reduction implementation using AVX2 instructions
+    // Input: a (product of coefficients)
+    // Output: res (reduced result)
+    // Constants: qinv (QINV = 58728449), q (MLDSA_Q = 8380417)
+
+    // 1. t = a & 0xFFFFFFFF (low 32 bits)
+    // 2. m = t * QINV & 0xFFFFFFFF
+    vpmulld \qinv, \a, %ymm5  // ymm5 = t * QINV (low 32 bits)
+
+    // 3. t = (a + m*Q) >> 32
+    vpmulld \q, %ymm5, %ymm6  // ymm6 = m * Q (low 32 bits)
+    vpaddq \a, %ymm6, %ymm7   // ymm7 = a + m*Q
+    vpsrlq $32, %ymm7, \res   // res = (a + m*Q) >> 32
+.endm
+
+.text
+.global MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(poly_pointwise_montgomery_asm)
+    // Save registers
+    push %rbx
+    push %r12
+    push %r13
+
+    // Load parameters
+    // rdi: pointer to output polynomial c
+    // rsi: pointer to input polynomial a
+    // rdx: pointer to input polynomial b
+
+    // Load constants
+    movabs $58728449, %rax       // QINV = 58728449
+    vpbroadcastd %eax, %ymm0     // Broadcast QINV to all elements of ymm0
+
+    movabs $8380417, %rax        // MLDSA_Q = 8380417
+    vpbroadcastd %eax, %ymm1     // Broadcast MLDSA_Q to all elements of ymm1
+
+    // Process 8 coefficients at a time (32 iterations for 256 coefficients)
+    mov $32, %rbx
+
+loop_start:
+    // Load 8 coefficients from each polynomial
+    vmovdqa (%rsi), %ymm2        // Load 8 coefficients from a
+    vmovdqa (%rdx), %ymm3        // Load 8 coefficients from b
+
+    // Multiply coefficients
+    vpmulld %ymm2, %ymm3, %ymm4  // ymm4 = a[i] * b[i] (low 32 bits)
+
+    // Apply Montgomery reduction
+    montgomery_reduce_avx2 %ymm4, %ymm4, %ymm0, %ymm1
+
+    // Store result
+    vmovdqa %ymm4, (%rdi)
+
+    // Advance pointers
+    add $32, %rsi
+    add $32, %rdx
+    add $32, %rdi
+
+    // Decrement counter and loop
+    dec %rbx
+    jnz loop_start
+
+    // Restore registers and return
+    pop %r13
+    pop %r12
+    pop %rbx
+    ret
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT */
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT)
+
+.text
+.global MLD_ASM_NAMESPACE(polyveck_pointwise_poly_montgomery_k4_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyveck_pointwise_poly_montgomery_k4_asm)
+    // Save registers
+    push %rbx
+    push %r12
+    push %r13
+    push %r14
+
+    // Load parameters
+    // rdi: pointer to output vector r
+    // rsi: pointer to input polynomial a
+    // rdx: pointer to input vector v
+
+    // Save parameters
+    mov %rdi, %r12  // r
+    mov %rsi, %r13  // a
+    mov %rdx, %r14  // v
+
+    // Call poly_pointwise_montgomery for each polynomial in the vector
+
+    // First polynomial
+    mov %r12, %rdi  // r[0]
+    mov %r13, %rsi  // a
+    mov %r14, %rdx  // v[0]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Second polynomial
+    lea 1024(%r12), %rdi  // r[1]
+    mov %r13, %rsi        // a
+    lea 1024(%r14), %rdx  // v[1]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Third polynomial
+    lea 2048(%r12), %rdi  // r[2]
+    mov %r13, %rsi        // a
+    lea 2048(%r14), %rdx  // v[2]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Fourth polynomial
+    lea 3072(%r12), %rdi  // r[3]
+    mov %r13, %rsi        // a
+    lea 3072(%r14), %rdx  // v[3]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Restore registers and return
+    pop %r14
+    pop %r13
+    pop %r12
+    pop %rbx
+    ret
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT */
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+#include "../../../common.h"
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT)
+
+.text
+.global MLD_ASM_NAMESPACE(polyveck_pointwise_poly_montgomery_k6_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(polyveck_pointwise_poly_montgomery_k6_asm)
+    // Save registers
+    push %rbx
+    push %r12
+    push %r13
+    push %r14
+
+    // Load parameters
+    // rdi: pointer to output vector r
+    // rsi: pointer to input polynomial a
+    // rdx: pointer to input vector v
+
+    // Save parameters
+    mov %rdi, %r12  // r
+    mov %rsi, %r13  // a
+    mov %rdx, %r14  // v
+
+    // Call poly_pointwise_montgomery for each polynomial in the vector
+
+    // First polynomial
+    mov %r12, %rdi  // r[0]
+    mov %r13, %rsi  // a
+    mov %r14, %rdx  // v[0]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Second polynomial
+    lea 1024(%r12), %rdi  // r[1]
+    mov %r13, %rsi        // a
+    lea 1024(%r14), %rdx  // v[1]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Third polynomial
+    lea 2048(%r12), %rdi  // r[2]
+    mov %r13, %rsi        // a
+    lea 2048(%r14), %rdx  // v[2]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Fourth polynomial
+    lea 3072(%r12), %rdi  // r[3]
+    mov %r13, %rsi        // a
+    lea 3072(%r14), %rdx  // v[3]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Fifth polynomial
+    lea 4096(%r12), %rdi  // r[4]
+    mov %r13, %rsi        // a
+    lea 4096(%r14), %rdx  // v[4]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Sixth polynomial
+    lea 5120(%r12), %rdi  // r[5]
+    mov %r13, %rsi        // a
+    lea 5120(%r14), %rdx  // v[5]
+    call MLD_ASM_NAMESPACE(poly_pointwise_montgomery_asm)
+
+    // Restore registers and return
+    pop %r14
+    pop %r13
+    pop %r12
+    pop %rbx
+    ret
+
+#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT */