Test performance penalty for not using lazy reduction in matrix-vector mul

mkannwischer · mkannwischer · commit 0b28a8a764ad · 2025-06-25T13:02:05.000+08:00
diff --git a/mldsa/poly.c b/mldsa/poly.c
@@ -143,6 +143,19 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
   }
 }
 
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b)
+{
+  unsigned int i;
+
+  for (i = 0; i < MLDSA_N; ++i)
+  __loop__(
+    invariant(i <= MLDSA_N))
+  {
+    c->coeffs[i] += montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]);
+  }
+}
+
+
 void poly_power2round(poly *a1, poly *a0, const poly *a)
 {
   unsigned int i;
diff --git a/mldsa/poly.h b/mldsa/poly.h
@@ -174,6 +174,17 @@ __contract__(
   assigns(memory_slice(c, sizeof(poly)))
 );
 
+
+#define poly_pointwise_acc_montgomery \
+  MLD_NAMESPACE(poly_pointwise_acc_montgomery)
+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b)
+__contract__(
+  requires(memory_no_alias(a, sizeof(poly)))
+  requires(memory_no_alias(b, sizeof(poly)))
+  requires(memory_no_alias(c, sizeof(poly)))
+  assigns(memory_slice(c, sizeof(poly)))
+);
+
 #define poly_power2round MLD_NAMESPACE(poly_power2round)
 /*************************************************
  * Name:        poly_power2round
diff --git a/mldsa/polyvec.c b/mldsa/polyvec.c
@@ -259,32 +259,11 @@ void polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a,
 void polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u,
                                        const polyvecl *v)
 {
-  unsigned int i, j;
-  /* The second input is bounded by 9q. Hence, we can safely accumulate
-   * in 64-bits without intermediate reductions as
-   * MLDSA_L * MLD_NTT_BOUND * INT32_MAX < INT64_MAX
-   * worst case is ML-DSA-87: 7 * 9 * q * 2**31 < 2**63
-   * (likewise for negative values)
-   */
-
-  for (i = 0; i < MLDSA_N; i++)
-  __loop__(
-    assigns(i, j, object_whole(w))
-    invariant(i <= MLDSA_N)
-  )
+  unsigned int i;
+  poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]);
+  for (i = 1; i < MLDSA_L; i++)
   {
-    int64_t t = 0;
-    for (j = 0; j < MLDSA_L; j++)
-    __loop__(
-      assigns(j, t)
-      invariant(j <= MLDSA_L)
-      invariant(t <= -(int64_t)j*INT32_MIN*MLD_NTT_BOUND)
-      invariant(t >= (int64_t)j*INT32_MIN*MLD_NTT_BOUND)
-    )
-    {
-      t += (int64_t)u->vec[j].coeffs[i] * v->vec[j].coeffs[i];
-    }
-    w->coeffs[i] = montgomery_reduce(t);
+    poly_pointwise_acc_montgomery(w, &u->vec[i], &v->vec[i]);
   }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -143,6 +143,19 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)`
`143`	`143`	`}`
`144`	`144`	`}`
`145`	`145`
	`146`	`+void poly_pointwise_acc_montgomery(poly *c, const poly *a, const poly *b)`
	`147`	`+{`
	`148`	`+ unsigned int i;`
	`149`	`+`
	`150`	`+ for (i = 0; i < MLDSA_N; ++i)`
	`151`	`+ __loop__(`
	`152`	`+ invariant(i <= MLDSA_N))`
	`153`	`+ {`
	`154`	`+ c->coeffs[i] += montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]);`
	`155`	`+ }`
	`156`	`+}`
	`157`	`+`
	`158`	`+`
`146`	`159`	`void poly_power2round(poly *a1, poly *a0, const poly *a)`
`147`	`160`	`{`
`148`	`161`	`unsigned int i;`