AVX2: Add native implementation of poly_reduce and poly_caddq

jakemas · jakemas · commit c408bba9d879 · 2025-06-24T21:15:45.000-07:00
Signed-off-by: Jake Massimo <jakemas@amazon.com>
diff --git a/mldsa/native/api.h b/mldsa/native/api.h
@@ -80,23 +80,49 @@ set if there are native implementations for NTT and INTT."
  * Arguments:   - int32_t p[MLDSA_N]: pointer to in/output polynomial
  *
  **************************************************/
-static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t p[MLDSA_N])
+static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t p[MLDSA_N]);
 #endif /* MLD_USE_NATIVE_NTT_CUSTOM_ORDER */
 
 
 #if defined(MLD_USE_NATIVE_INTT)
-    /*************************************************
-     * Name:        mld_intt_native
-     *
-     * Description: Computes inverse of negacyclic number-theoretic transform
-     *(NTT) of a polynomial in place.
-     *
-     *              The input polynomial is in bitreversed order.
-     *              The output polynomial is assumed to be in normal order.
-     *
-     * Arguments:   - uint32_t p[MLDSA_N]: pointer to in/output polynomial
-     **************************************************/
-    static MLD_INLINE void mld_intt_native(int16_t p[MLDSA_N])
+/*************************************************
+ * Name:        mld_intt_native
+ *
+ * Description: Computes inverse of negacyclic number-theoretic transform
+ *              (NTT) of a polynomial in place.
+ *
+ *              The input polynomial is in bitreversed order.
+ *              The output polynomial is assumed to be in normal order.
+ *
+ * Arguments:   - int32_t p[MLDSA_N]: pointer to in/output polynomial
+ **************************************************/
+static MLD_INLINE void mld_intt_native(int32_t p[MLDSA_N]);
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if defined(MLD_USE_NATIVE_POLY_REDUCE)
+/*************************************************
+ * Name:        mld_poly_reduce_native
+ *
+ * Description: Inplace reduction of all coefficients of polynomial to
+ *              representative in [-6283009,6283008]. Assumes input
+ *              coefficients to be at most 2^31 - 2^22 - 1 in absolute
+ *              value.
+ *
+ * Arguments:   - int32_t a[MLDSA_N]: pointer to input/output polynomial
+ **************************************************/
+static MLD_INLINE void mld_poly_reduce_native(int32_t a[MLDSA_N]);
+#endif /* MLD_USE_NATIVE_POLY_REDUCE */
+
+#if defined(MLD_USE_NATIVE_POLY_CADDQ)
+/*************************************************
+ * Name:        mld_poly_caddq_native
+ *
+ * Description: For all coefficients of in/out polynomial add Q if
+ *              coefficient is negative.
+ *
+ * Arguments:   - int32_t a[MLDSA_N]: pointer to input/output polynomial
+ **************************************************/
+static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N]);
+#endif /* MLD_USE_NATIVE_POLY_CADDQ */
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
@@ -14,6 +14,8 @@
 #define MLD_USE_NATIVE_NTT_CUSTOM_ORDER
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POLY_REDUCE
+#define MLD_USE_NATIVE_POLY_CADDQ
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -34,6 +36,16 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
 }
 
+static MLD_INLINE void mld_poly_reduce_native(int32_t a[MLDSA_N])
+{
+  mld_poly_reduce_avx2(a); /* in-place reduction via the AVX2 routine */
+}
+
+static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
+{
+  mld_poly_caddq_avx2(a); /* in-place conditional add of Q via AVX2 */
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -19,4 +19,10 @@ void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
 #define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
 void mld_nttunpack_avx2(__m256i *r);
 
+#define mld_poly_reduce_avx2 MLD_NAMESPACE(poly_reduce_avx2)
+void mld_poly_reduce_avx2(int32_t *r); /* defined in reduce_avx2.c */
+
+#define mld_poly_caddq_avx2 MLD_NAMESPACE(poly_caddq_avx2)
+void mld_poly_caddq_avx2(int32_t *r); /* defined in reduce_avx2.c */
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/reduce_avx2.c b/mldsa/native/x86_64/src/reduce_avx2.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <immintrin.h>
+#include <stdint.h>
+#include "../../../reduce.h"
+#include "arith_native_x86_64.h"
+#include "consts.h"
+
+/*************************************************
+ * Name:        mld_poly_reduce_avx2
+ *
+ * Description: Inplace reduction of all coefficients of polynomial to
+ *              representative in [-6283009,6283008]. Assumes input
+ *              coefficients to be at most 2^31 - 2^22 - 1 in absolute value.
+ *
+ * Arguments:   - int32_t *r: 32-byte aligned pointer to in/output polynomial
+ **************************************************/
+void mld_poly_reduce_avx2(int32_t *r)
+{
+  unsigned int i;
+  __m256i f, g;
+  const __m256i q = _mm256_set1_epi32(MLDSA_Q);   /* Q in every lane */
+  const __m256i off = _mm256_set1_epi32(1 << 22); /* rounding offset 2^22 */
+  __m256i *rr = (__m256i *)r;
+
+  for (i = 0; i < MLDSA_N / 8; i++) /* 8 coefficients per 256-bit vector */
+  {
+    f = _mm256_load_si256(&rr[i]);  /* aligned load of 8 coefficients */
+    g = _mm256_add_epi32(f, off);   /* g = f + 2^22 */
+    g = _mm256_srai_epi32(g, 23);   /* g = (f + 2^22) >> 23, arithmetic */
+    g = _mm256_mullo_epi32(g, q);   /* g = g * Q, low 32 bits per lane */
+    f = _mm256_sub_epi32(f, g);     /* f = f - g * Q */
+    _mm256_store_si256(&rr[i], f);  /* aligned store back in place */
+  }
+}
+
+/*************************************************
+ * Name:        mld_poly_caddq_avx2
+ *
+ * Description: For all coefficients of in/out polynomial add Q if
+ *              coefficient is negative.
+ *
+ * Arguments:   - int32_t *r: 32-byte aligned pointer to in/output polynomial
+ **************************************************/
+void mld_poly_caddq_avx2(int32_t *r)
+{
+  unsigned int i;
+  __m256i f, g;
+  const __m256i q = _mm256_set1_epi32(MLDSA_Q); /* Q in every lane */
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i *rr = (__m256i *)r;
+
+  for (i = 0; i < MLDSA_N / 8; i++) /* 8 coefficients per 256-bit vector */
+  {
+    f = _mm256_load_si256(&rr[i]);   /* aligned load of 8 coefficients */
+    g = _mm256_cmpgt_epi32(zero, f); /* all-ones in lanes where f < 0 */
+    g = _mm256_and_si256(g, q);      /* Q where f < 0, otherwise 0 */
+    f = _mm256_add_epi32(f, g);      /* add Q to the negative lanes only */
+    _mm256_store_si256(&rr[i], f);   /* aligned store back in place */
+  }
+}
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(avx2_reduce)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/poly.c b/mldsa/poly.c
@@ -16,11 +16,16 @@
 
 void poly_reduce(poly *a)
 {
+#if !defined(MLD_USE_NATIVE_POLY_REDUCE)
   unsigned int i;
+#endif
   /* TODO: Introduce the following after using inclusive lower bounds in
    * the underlying debug function mld_debug_check_bounds(). */
   /* mld_assert_bound(a->coeffs, MLDSA_N, INT32_MIN, REDUCE_DOMAIN_MAX); */
 
+#if defined(MLD_USE_NATIVE_POLY_REDUCE)
+  mld_poly_reduce_native(a->coeffs);
+#else
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     invariant(i <= MLDSA_N)
@@ -29,15 +34,21 @@ void poly_reduce(poly *a)
   {
     a->coeffs[i] = reduce32(a->coeffs[i]);
   }
+#endif /* !MLD_USE_NATIVE_POLY_REDUCE */
 
   mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE_RANGE_MAX, REDUCE_RANGE_MAX);
 }
 
 void poly_caddq(poly *a)
 {
+#if !defined(MLD_USE_NATIVE_POLY_CADDQ)
   unsigned int i;
+#endif
   mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);
 
+#if defined(MLD_USE_NATIVE_POLY_CADDQ)
+  mld_poly_caddq_native(a->coeffs);
+#else
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     invariant(i <= MLDSA_N)
@@ -47,6 +58,7 @@ void poly_caddq(poly *a)
   {
     a->coeffs[i] = caddq(a->coeffs[i]);
   }
+#endif /* !MLD_USE_NATIVE_POLY_CADDQ */
 
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
 }

Original file line number	Diff line number	Diff line change
`@@ -16,11 +16,16 @@`
`16`	`16`
`17`	`17`	`void poly_reduce(poly *a)`
`18`	`18`	`{`
	`19`	`+#if !defined(MLD_USE_NATIVE_POLY_REDUCE)`
`19`	`20`	`unsigned int i;`
	`21`	`+#endif`
`20`	`22`	`/* TODO: Introduce the following after using inclusive lower bounds in`
`21`	`23`	`* the underlying debug function mld_debug_check_bounds(). */`
`22`	`24`	`/* mld_assert_bound(a->coeffs, MLDSA_N, INT32_MIN, REDUCE_DOMAIN_MAX); */`
`23`	`25`
	`26`	`+#if defined(MLD_USE_NATIVE_POLY_REDUCE)`
	`27`	`+ mld_poly_reduce_native(a->coeffs);`
	`28`	`+#else`
`24`	`29`	`for (i = 0; i < MLDSA_N; ++i)`
`25`	`30`	`__loop__(`
`26`	`31`	`invariant(i <= MLDSA_N)`
`@@ -29,15 +34,21 @@ void poly_reduce(poly *a)`
`29`	`34`	`{`
`30`	`35`	`a->coeffs[i] = reduce32(a->coeffs[i]);`
`31`	`36`	`}`
	`37`	`+#endif /* !MLD_USE_NATIVE_POLY_REDUCE */`
`32`	`38`
`33`	`39`	`mld_assert_bound(a->coeffs, MLDSA_N, -REDUCE_RANGE_MAX, REDUCE_RANGE_MAX);`
`34`	`40`	`}`
`35`	`41`
`36`	`42`	`void poly_caddq(poly *a)`
`37`	`43`	`{`
	`44`	`+#if !defined(MLD_USE_NATIVE_POLY_CADDQ)`
`38`	`45`	`unsigned int i;`
	`46`	`+#endif`
`39`	`47`	`mld_assert_abs_bound(a->coeffs, MLDSA_N, MLDSA_Q);`
`40`	`48`
	`49`	`+#if defined(MLD_USE_NATIVE_POLY_CADDQ)`
	`50`	`+ mld_poly_caddq_native(a->coeffs);`
	`51`	`+#else`
`41`	`52`	`for (i = 0; i < MLDSA_N; ++i)`
`42`	`53`	`__loop__(`
`43`	`54`	`invariant(i <= MLDSA_N)`
`@@ -47,6 +58,7 @@ void poly_caddq(poly *a)`
`47`	`58`	`{`
`48`	`59`	`a->coeffs[i] = caddq(a->coeffs[i]);`
`49`	`60`	`}`
	`61`	`+#endif /* !MLD_USE_NATIVE_POLY_CADDQ */`
`50`	`62`
`51`	`63`	`mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);`
`52`	`64`	`}`