ml-kem, module-lattice: avoid UDIV in compiled output (#289)

tob-scott-a · web-flow · commit 584f5815a0df · 2026-04-17T09:32:56.000-06:00
When compiling ML-KEM and checking the resulting binary for side-channel
leakage, several false-positive `UDIV` findings appear in the ARM assembly.
These are a mild annoyance, but not security-relevant. This PR updates
module-lattice and ml-kem to avoid the division operators entirely.

As an added bonus, `cargo bench` reports a performance win:

| Operation   | master   | this branch | Δ       | p-value |
|-------------|----------|-------------|---------|---------|
| keygen      | 31.00 µs | 26.57 µs    | −14.08% | < 0.05  |
| encapsulate | 27.80 µs | 22.78 µs    | −18.80% | < 0.05  |
| decapsulate | 34.48 µs | 26.18 µs    | −23.30% | < 0.05  |
| round_trip  | 99.46 µs | 82.35 µs    | −17.27% | < 0.05  |


## Raw criterion output

### master (baseline)

```
keygen                  time:   [30.917 µs 31.003 µs 31.115 µs]
encapsulate             time:   [27.637 µs 27.802 µs 28.046 µs]
decapsulate             time:   [34.279 µs 34.479 µs 34.778 µs]
round_trip              time:   [99.161 µs 99.463 µs 99.854 µs]
```

### ml-kem-undivided (compared against master)

```
keygen                  time:   [26.493 µs 26.574 µs 26.691 µs]
                        change: [−14.429% −14.079% −13.765%] (p = 0.00 < 0.05)
                        Performance has improved.

encapsulate             time:   [22.478 µs 22.781 µs 23.228 µs]
                        change: [−19.472% −18.797% −17.936%] (p = 0.00 < 0.05)
                        Performance has improved.

decapsulate             time:   [26.089 µs 26.185 µs 26.304 µs]
                        change: [−23.952% −23.304% −22.512%] (p = 0.00 < 0.05)
                        Performance has improved.

round_trip              time:   [81.947 µs 82.345 µs 83.057 µs]
                        change: [−17.604% −17.269% −16.761%] (p = 0.00 < 0.05)
                        Performance has improved.
```

## Claude's Interpretation

> [!NOTE]
> Take this with a grain of salt, but it does sound plausible.

- **NTT const-generic layers** (`ntt_layer<LEN, ITERATIONS>` /
  `ntt_inverse_layer<LEN, ITERATIONS>`) are the dominant win. With `LEN` and
  `ITERATIONS` compile-time constants, the inner loops unroll completely and
  LLVM auto-vectorizes the butterfly into NEON (`add.8h`, `sub.8h`, `cmhs.8h`,
  `bic.8h`). In the original form, `(0..256).step_by(2 * len)` carried a
  runtime `UDIV` and blocked unrolling through the outer `for len in [...]`.
- **Decapsulate benefits the most (−23%)** because it runs both `ntt` and
  `ntt_inverse` on the length-`K` vector and also hits the `D = 12`
  `byte_decode` path.
- **Keygen (−14%)** mainly benefits from the forward NTT on the secret and
  error vectors.
- **Encapsulate (−19%)** benefits from the forward NTT on the randomness
  vector and matrix-vector product in the NTT domain.
diff --git a/ml-kem/src/algebra.rs b/ml-kem/src/algebra.rs
@@ -134,26 +134,42 @@ pub(crate) trait Ntt {
     fn ntt(&self) -> Self::Output;
 }
 
+/// One layer of the forward NTT butterfly, applied to `f` in place.
+///
+/// `LEN` is the butterfly half-length and `ITERATIONS = 128 / LEN` is the number of
+/// butterfly groups in the layer. Making both compile-time constants lets the compiler
+/// eliminate the iterator length calculation (`256 / (2 * LEN)`) that `step_by` would
+/// otherwise compute with a `UDIV` instruction. The two are not cross-checked here.
+#[inline(always)]
+fn ntt_layer<const LEN: usize, const ITERATIONS: usize>(f: &mut Array<Elem, U256>, k: &mut usize) {
+    for i in 0..ITERATIONS {
+        let start = i * 2 * LEN; // first index of group `i`; groups are `2 * LEN` apart
+        let zeta = ZETA_POW_BITREV[*k];
+        *k += 1; // one table entry per group, taken in ascending index order
+
+        for j in start..(start + LEN) { // pairs `f[j]` with `f[j + LEN]`
+            let t = zeta * f[j + LEN];
+            f[j + LEN] = f[j] - t;
+            f[j] = f[j] + t;
+        }
+    }
+}
+
 /// Algorithm 9: `NTT`
 impl Ntt for Polynomial {
     type Output = NttPolynomial;
 
     fn ntt(&self) -> NttPolynomial {
         let mut k = 1;
-
         let mut f = self.0;
-        for len in [128, 64, 32, 16, 8, 4, 2] {
-            for start in (0..256).step_by(2 * len) {
-                let zeta = ZETA_POW_BITREV[k];
-                k += 1;
-
-                for j in start..(start + len) {
-                    let t = zeta * f[j + len];
-                    f[j + len] = f[j] - t;
-                    f[j] = f[j] + t;
-                }
-            }
-        }
+
+        ntt_layer::<128, 1>(&mut f, &mut k);
+        ntt_layer::<64, 2>(&mut f, &mut k);
+        ntt_layer::<32, 4>(&mut f, &mut k);
+        ntt_layer::<16, 8>(&mut f, &mut k);
+        ntt_layer::<8, 16>(&mut f, &mut k);
+        ntt_layer::<4, 32>(&mut f, &mut k);
+        ntt_layer::<2, 64>(&mut f, &mut k);
 
         f.into()
     }
@@ -175,26 +191,42 @@ pub(crate) trait NttInverse {
     fn ntt_inverse(&self) -> Self::Output;
 }
 
+/// One layer of the inverse NTT butterfly, applied to `f` in place.
+///
+/// See [`ntt_layer`] for the rationale behind the const generics.
+#[inline(always)]
+fn ntt_inverse_layer<const LEN: usize, const ITERATIONS: usize>(
+    f: &mut Array<Elem, U256>,
+    k: &mut usize,
+) {
+    for i in 0..ITERATIONS {
+        let start = i * 2 * LEN; // first index of group `i`; groups are `2 * LEN` apart
+        let zeta = ZETA_POW_BITREV[*k];
+        *k -= 1; // one table entry per group, taken in descending index order
+
+        for j in start..(start + LEN) { // pairs `f[j]` with `f[j + LEN]`
+            let t = f[j]; // keep the old `f[j]`; the next line overwrites it
+            f[j] = t + f[j + LEN];
+            f[j + LEN] = zeta * (f[j + LEN] - t);
+        }
+    }
+}
+
 /// Algorithm 10: `NTT^{-1}`
 impl NttInverse for NttPolynomial {
     type Output = Polynomial;
 
     fn ntt_inverse(&self) -> Polynomial {
         let mut f: Array<Elem, U256> = self.0.clone();
-
         let mut k = 127;
-        for len in [2, 4, 8, 16, 32, 64, 128] {
-            for start in (0..256).step_by(2 * len) {
-                let zeta = ZETA_POW_BITREV[k];
-                k -= 1;
-
-                for j in start..(start + len) {
-                    let t = f[j];
-                    f[j] = t + f[j + len];
-                    f[j + len] = zeta * (f[j + len] - t);
-                }
-            }
-        }
+
+        ntt_inverse_layer::<2, 64>(&mut f, &mut k);
+        ntt_inverse_layer::<4, 32>(&mut f, &mut k);
+        ntt_inverse_layer::<8, 16>(&mut f, &mut k);
+        ntt_inverse_layer::<16, 8>(&mut f, &mut k);
+        ntt_inverse_layer::<32, 4>(&mut f, &mut k);
+        ntt_inverse_layer::<64, 2>(&mut f, &mut k);
+        ntt_inverse_layer::<128, 1>(&mut f, &mut k);
 
         Elem::new(3303) * &Polynomial::new(f)
     }
diff --git a/module-lattice/src/algebra.rs b/module-lattice/src/algebra.rs
@@ -72,7 +72,13 @@ macro_rules! define_field {
             const BARRETT_MULTIPLIER: Self::LongLong = (1 << Self::BARRETT_SHIFT) / Self::QLL;
 
             fn small_reduce(x: Self::Int) -> Self::Int {
-                if x < Self::Q { x } else { x - Self::Q }
+                // Branchless conditional subtraction: if x >= Q, subtract Q; else
+                // leave x alone. Compilers already emit `csel` here at O2, but the
+                // explicit mask form removes the dependency on optimizer choices
+                // and keeps the generated assembly free of secret-dependent control
+                // flow at every optimization level.
+                let mask = ((x >= Self::Q) as $int).wrapping_neg();
+                x - (Self::Q & mask)
             }
 
             fn barrett_reduce(x: Self::Long) -> Self::Int {
diff --git a/module-lattice/src/encoding.rs b/module-lattice/src/encoding.rs
@@ -130,9 +130,11 @@ pub fn byte_decode<F: Field, D: EncodingSize>(bytes: &EncodedPolynomial<D>) -> D
             let val = F::Int::truncate(x >> (D::USIZE * j));
             vj.0 = val & mask;
 
-            // Special case for FIPS 203
+            // Special case for FIPS 203. For 12-bit values (max 4095) with Q = 3329,
+            // the masked value is always in [0, 2Q), so `small_reduce` is exact and
+            // avoids the hardware UDIV that `% F::Q` would emit.
             if D::USIZE == 12 {
-                vj.0 = vj.0 % F::Q;
+                vj.0 = F::small_reduce(vj.0);
             }
         }
     }

Original file line number	Diff line number	Diff line change
`@@ -130,9 +130,11 @@ pub fn byte_decode<F: Field, D: EncodingSize>(bytes: &EncodedPolynomial<D>) -> D`
`130`	`130`	`let val = F::Int::truncate(x >> (D::USIZE * j));`
`131`	`131`	`vj.0 = val & mask;`
`132`	`132`
`133`		`- // Special case for FIPS 203`
	`133`	`+ // Special case for FIPS 203. For 12-bit values (max 4095) with Q = 3329,`
	`134`	``+ // the masked value is always in [0, 2Q), so `small_reduce` is exact and``
	`135`	``+ // avoids the hardware UDIV that `% F::Q` would emit.``
`134`	`136`	`if D::USIZE == 12 {`
`135`		`- vj.0 = vj.0 % F::Q;`
	`137`	`+ vj.0 = F::small_reduce(vj.0);`
`136`	`138`	`}`
`137`	`139`	`}`
`138`	`140`	`}`