Skip to content

Commit d40c2ae

Browse files
committed
AVX2: Add native implementation of poly_reduce
Signed-off-by: Jake Massimo <[email protected]>
1 parent c408bba commit d40c2ae

File tree

2 files changed

+63
-28
lines changed

2 files changed

+63
-28
lines changed

mldsa/native/x86_64/src/reduce_avx2.S

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright (c) The mldsa-native project authors
3+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4+
*/
5+
6+
/*
7+
* This file is derived from the public domain
8+
* AVX2 Dilithium implementation @[REF_AVX2].
9+
*/
10+
11+
#include "../../../common.h"
12+
13+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
14+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
15+
16+
#include "consts.h"
17+
18+
.text
19+
.balign 16 /* Ensure 16-byte alignment for AVX2 code */
20+
.global MLD_ASM_NAMESPACE(poly_reduce_avx2)
21+
MLD_ASM_FN_SYMBOL(poly_reduce_avx2)
22+
/* Load constants */
23+
vpbroadcastd .Lmldsa_q(%rip), %ymm0 /* q in all lanes */
24+
vpbroadcastd .Loffset(%rip), %ymm1 /* offset (1 << 22) in all lanes */
25+
26+
/* Setup loop counter */
27+
xor %rax, %rax /* byte offset = 0 */
28+
mov $MLDSA_N*4, %rcx /* Total bytes to process (MLDSA_N * sizeof(int32_t)) */
29+
30+
.Lloop:
31+
/* Load 8 int32_t values */
32+
vmovdqa (%rdi,%rax), %ymm2 /* f = _mm256_load_si256(&rr[i]) */
33+
34+
/* Perform reduction */
35+
vpaddd %ymm1, %ymm2, %ymm3 /* g = f + offset */
36+
vpsrad $23, %ymm3, %ymm3 /* g = g >> 23 */
37+
38+
/* Multiply by q using vpmulld */
39+
vpmulld %ymm0, %ymm3, %ymm3 /* g = g * q */
40+
41+
/* Subtract from original value */
42+
vpsubd %ymm3, %ymm2, %ymm2 /* f = f - g */
43+
44+
/* Store result */
45+
vmovdqa %ymm2, (%rdi,%rax) /* _mm256_store_si256(&rr[i], f) */
46+
47+
/* Increment counter and check loop condition */
48+
add $32, %rax /* Move to next 32 bytes (8 int32_t values) */
49+
cmp %rcx, %rax /* Compare with total bytes */
50+
jb .Lloop
51+
52+
/* Return */
53+
ret
54+
55+
/* Constants section */
56+
.section .rodata
57+
.balign 32 /* 32-byte alignment for AVX2 constants */
58+
.Lmldsa_q:
59+
.int MLDSA_Q /* Only need one value for vpbroadcastd */
60+
.Loffset:
61+
.int (1 << 22) /* Only need one value for vpbroadcastd */
62+
63+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED */

mldsa/native/x86_64/src/reduce_avx2.c

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,6 @@
1919
#include "arith_native_x86_64.h"
2020
#include "consts.h"
2121

22-
/*************************************************
23-
* Name: mld_poly_reduce_avx2
24-
*
25-
* Description: Inplace reduction of all coefficients of polynomial to
26-
* representative in [-6283009,6283008]. Assumes input
27-
* coefficients to be at most 2^31 - 2^22 - 1 in absolute value.
28-
*
29-
* Arguments: - int32_t *r: pointer to input/output polynomial
30-
**************************************************/
31-
void mld_poly_reduce_avx2(int32_t *r)
32-
{
33-
unsigned int i;
34-
__m256i f, g;
35-
const __m256i q = _mm256_set1_epi32(MLDSA_Q);
36-
const __m256i off = _mm256_set1_epi32(1 << 22);
37-
__m256i *rr = (__m256i *)r;
38-
39-
for (i = 0; i < MLDSA_N / 8; i++)
40-
{
41-
f = _mm256_load_si256(&rr[i]);
42-
g = _mm256_add_epi32(f, off);
43-
g = _mm256_srai_epi32(g, 23);
44-
g = _mm256_mullo_epi32(g, q);
45-
f = _mm256_sub_epi32(f, g);
46-
_mm256_store_si256(&rr[i], f);
47-
}
48-
}
49-
5022
/*************************************************
5123
* Name: mld_poly_caddq_avx2
5224
*

0 commit comments

Comments
 (0)