Skip to content

Commit 36d07ac

Browse files
authored
Merge pull request #883 from pq-code-package/hol_light_basemul2
HOL-Light: Add low-level spec and proof for AArch64 basemul
2 parents 123d1d8 + a2884f6 commit 36d07ac

16 files changed

+2106
-25
lines changed

.github/workflows/hol_light.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@ jobs:
4040
needs: ["mlkem_specs.ml", "mlkem_utils.ml"]
4141
- name: mlkem_poly_reduce
4242
needs: ["mlkem_specs.ml", "mlkem_utils.ml"]
43+
- name: mlkem_poly_basemul_acc_montgomery_cached_k2
44+
needs: ["mlkem_specs.ml", "mlkem_utils.ml"]
45+
- name: mlkem_poly_basemul_acc_montgomery_cached_k3
46+
needs: ["mlkem_specs.ml", "mlkem_utils.ml"]
47+
- name: mlkem_poly_basemul_acc_montgomery_cached_k4
48+
needs: ["mlkem_specs.ml", "mlkem_utils.ml"]
4349
- name: keccak_f1600_x1_scalar
4450
needs: ["keccak_specs.ml"]
4551
- name: keccak_f1600_x1_v84a

nix/s2n_bignum/default.nix

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
{ stdenv, fetchFromGitHub, writeText, ... }:
33
stdenv.mkDerivation rec {
44
pname = "s2n_bignum";
5-
version = "ea86b7535c49425b149d7ae88809ed97e3697661";
5+
version = "4ee6cb44e7aab3d48de72a461491b3a70494df35";
66
src = fetchFromGitHub {
77
owner = "awslabs";
88
repo = "s2n-bignum";
99
rev = "${version}";
10-
hash = "sha256-R5x7UT1flnXtDhZssTH+24fHawFOI8KwFmAAQ+x+Nk0=";
10+
hash = "sha256-ay69mWbA/oBffVtEvAU/XBKOajlzukHXUTKYawwn2Ik=";
1111
};
1212
setupHook = writeText "setup-hook.sh" ''
1313
export S2N_BIGNUM_DIR="$1"

proofs/hol_light/arm/Makefile

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -65,15 +65,18 @@ endif
6565
endif
6666
endif
6767

68-
OBJ = mlkem/mlkem_ntt.o \
69-
mlkem/mlkem_intt.o \
70-
mlkem/mlkem_poly_tomont.o \
71-
mlkem/mlkem_poly_mulcache_compute.o \
72-
mlkem/mlkem_poly_reduce.o \
73-
mlkem/keccak_f1600_x1_scalar.o \
74-
mlkem/keccak_f1600_x1_v84a.o \
75-
mlkem/keccak_f1600_x2_v84a.o \
76-
mlkem/keccak_f1600_x4_v8a_v84a_scalar.o \
68+
OBJ = mlkem/mlkem_ntt.o \
69+
mlkem/mlkem_intt.o \
70+
mlkem/mlkem_poly_tomont.o \
71+
mlkem/mlkem_poly_mulcache_compute.o \
72+
mlkem/mlkem_poly_reduce.o \
73+
mlkem/mlkem_poly_basemul_acc_montgomery_cached_k2.o \
74+
mlkem/mlkem_poly_basemul_acc_montgomery_cached_k3.o \
75+
mlkem/mlkem_poly_basemul_acc_montgomery_cached_k4.o \
76+
mlkem/keccak_f1600_x1_scalar.o \
77+
mlkem/keccak_f1600_x1_v84a.o \
78+
mlkem/keccak_f1600_x2_v84a.o \
79+
mlkem/keccak_f1600_x4_v8a_v84a_scalar.o \
7780
mlkem/keccak_f1600_x4_v8a_scalar.o
7881

7982
# According to
Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
/*
2+
* Copyright (c) 2024-2025 The mlkem-native project authors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
// AArch64 re-implementation of the asymmetric base multiplication from:
7+
8+
// Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
9+
// https://eprint.iacr.org/2021/986
10+
// https://github.com/neon-ntt/neon-ntt
11+
12+
13+
/*
14+
* WARNING: This file is auto-derived from the mlkem-native source file
15+
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
16+
*/
17+
18+
19+
.text
20+
.balign 4
21+
#ifdef __APPLE__
22+
.global _PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2
23+
_PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2:
24+
#else
25+
.global PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2
26+
PQCP_MLKEM_NATIVE_MLKEM768_polyvec_basemul_acc_montgomery_cached_asm_k2:
27+
#endif
28+
29+
sub sp, sp, #0x40
30+
stp d8, d9, [sp]
31+
stp d10, d11, [sp, #0x10]
32+
stp d12, d13, [sp, #0x20]
33+
stp d14, d15, [sp, #0x30]
34+
mov w14, #0xd01 // =3329
35+
dup v0.8h, w14
36+
mov w14, #0xcff // =3327
37+
dup v2.8h, w14
38+
add x4, x1, #0x200
39+
add x5, x2, #0x200
40+
add x6, x3, #0x100
41+
mov x13, #0x10 // =16
42+
ldr q9, [x4], #0x20
43+
ldur q5, [x4, #-0x10]
44+
ldr q11, [x5], #0x20
45+
uzp1 v23.8h, v9.8h, v5.8h
46+
uzp2 v9.8h, v9.8h, v5.8h
47+
ldr q5, [x2], #0x20
48+
ldur q7, [x5, #-0x10]
49+
ldur q21, [x2, #-0x10]
50+
uzp2 v10.8h, v11.8h, v7.8h
51+
uzp1 v11.8h, v11.8h, v7.8h
52+
uzp1 v7.8h, v5.8h, v21.8h
53+
uzp2 v5.8h, v5.8h, v21.8h
54+
ldr q21, [x1], #0x20
55+
ldur q25, [x1, #-0x10]
56+
ld1 { v6.8h }, [x3], #16
57+
uzp1 v26.8h, v21.8h, v25.8h
58+
uzp2 v21.8h, v21.8h, v25.8h
59+
smull v25.4s, v26.4h, v5.4h
60+
smull2 v5.4s, v26.8h, v5.8h
61+
smull v19.4s, v26.4h, v7.4h
62+
smull2 v26.4s, v26.8h, v7.8h
63+
smlal v25.4s, v21.4h, v7.4h
64+
smlal2 v5.4s, v21.8h, v7.8h
65+
smlal v19.4s, v21.4h, v6.4h
66+
smlal2 v26.4s, v21.8h, v6.8h
67+
smlal v25.4s, v23.4h, v10.4h
68+
smlal2 v5.4s, v23.8h, v10.8h
69+
smlal v19.4s, v23.4h, v11.4h
70+
smlal2 v26.4s, v23.8h, v11.8h
71+
ld1 { v23.8h }, [x6], #16
72+
smlal v25.4s, v9.4h, v11.4h
73+
smlal2 v5.4s, v9.8h, v11.8h
74+
smlal2 v26.4s, v9.8h, v23.8h
75+
smlal v19.4s, v9.4h, v23.4h
76+
ldr q9, [x4], #0x20
77+
uzp1 v11.8h, v25.8h, v5.8h
78+
uzp1 v23.8h, v19.8h, v26.8h
79+
mul v11.8h, v11.8h, v2.8h
80+
mul v23.8h, v23.8h, v2.8h
81+
ldr q7, [x5], #0x20
82+
smlal2 v5.4s, v11.8h, v0.8h
83+
smlal v25.4s, v11.4h, v0.4h
84+
ldr q11, [x2], #0x20
85+
ldur q21, [x2, #-0x10]
86+
ldur q6, [x4, #-0x10]
87+
uzp1 v17.8h, v11.8h, v21.8h
88+
ldr q10, [x1], #0x20
89+
ldur q29, [x1, #-0x10]
90+
uzp2 v11.8h, v11.8h, v21.8h
91+
uzp1 v13.8h, v9.8h, v6.8h
92+
uzp1 v3.8h, v10.8h, v29.8h
93+
uzp2 v10.8h, v10.8h, v29.8h
94+
smull v12.4s, v3.4h, v11.4h
95+
smull2 v11.4s, v3.8h, v11.8h
96+
ldur q21, [x5, #-0x10]
97+
smlal v12.4s, v10.4h, v17.4h
98+
smlal2 v11.4s, v10.8h, v17.8h
99+
uzp2 v29.8h, v7.8h, v21.8h
100+
uzp1 v15.8h, v7.8h, v21.8h
101+
smlal v12.4s, v13.4h, v29.4h
102+
smlal2 v11.4s, v13.8h, v29.8h
103+
uzp2 v28.8h, v9.8h, v6.8h
104+
smlal2 v26.4s, v23.8h, v0.8h
105+
smlal v12.4s, v28.4h, v15.4h
106+
smlal2 v11.4s, v28.8h, v15.8h
107+
smlal v19.4s, v23.4h, v0.4h
108+
uzp2 v27.8h, v25.8h, v5.8h
109+
smull v23.4s, v3.4h, v17.4h
110+
uzp1 v9.8h, v12.8h, v11.8h
111+
uzp2 v19.8h, v19.8h, v26.8h
112+
mul v14.8h, v9.8h, v2.8h
113+
ld1 { v22.8h }, [x6], #16
114+
zip2 v9.8h, v19.8h, v27.8h
115+
smlal2 v11.4s, v14.8h, v0.8h
116+
ld1 { v4.8h }, [x3], #16
117+
sub x13, x13, #0x2
118+
119+
polyvec_basemul_acc_montgomery_cached_asm_k2_loop:
120+
smull2 v20.4s, v3.8h, v17.8h
121+
ldr q18, [x4], #0x20
122+
ldr q30, [x5], #0x20
123+
smlal2 v20.4s, v10.8h, v4.8h
124+
smlal v12.4s, v14.4h, v0.4h
125+
smlal v23.4s, v10.4h, v4.4h
126+
str q9, [x0, #0x10]
127+
smlal2 v20.4s, v13.8h, v15.8h
128+
ldr q8, [x2], #0x20
129+
smlal v23.4s, v13.4h, v15.4h
130+
smlal2 v20.4s, v28.8h, v22.8h
131+
zip1 v26.8h, v19.8h, v27.8h
132+
ldur q9, [x2, #-0x10]
133+
smlal v23.4s, v28.4h, v22.4h
134+
uzp2 v27.8h, v12.8h, v11.8h
135+
uzp1 v17.8h, v8.8h, v9.8h
136+
uzp2 v4.8h, v8.8h, v9.8h
137+
uzp1 v5.8h, v23.8h, v20.8h
138+
str q26, [x0], #0x20
139+
mul v31.8h, v5.8h, v2.8h
140+
ldur q19, [x4, #-0x10]
141+
ldr q29, [x1], #0x20
142+
ldur q12, [x1, #-0x10]
143+
smlal2 v20.4s, v31.8h, v0.8h
144+
uzp1 v13.8h, v18.8h, v19.8h
145+
uzp1 v3.8h, v29.8h, v12.8h
146+
uzp2 v10.8h, v29.8h, v12.8h
147+
smull v12.4s, v3.4h, v4.4h
148+
smull2 v11.4s, v3.8h, v4.8h
149+
ldur q5, [x5, #-0x10]
150+
smlal v12.4s, v10.4h, v17.4h
151+
smlal2 v11.4s, v10.8h, v17.8h
152+
uzp2 v14.8h, v30.8h, v5.8h
153+
uzp1 v15.8h, v30.8h, v5.8h
154+
smlal v12.4s, v13.4h, v14.4h
155+
smlal2 v11.4s, v13.8h, v14.8h
156+
uzp2 v28.8h, v18.8h, v19.8h
157+
smlal v23.4s, v31.4h, v0.4h
158+
smlal v12.4s, v28.4h, v15.4h
159+
smlal2 v11.4s, v28.8h, v15.8h
160+
ld1 { v22.8h }, [x6], #16
161+
uzp2 v19.8h, v23.8h, v20.8h
162+
uzp1 v1.8h, v12.8h, v11.8h
163+
smull v23.4s, v3.4h, v17.4h
164+
mul v14.8h, v1.8h, v2.8h
165+
zip2 v9.8h, v19.8h, v27.8h
166+
ld1 { v4.8h }, [x3], #16
167+
smlal2 v11.4s, v14.8h, v0.8h
168+
sub x13, x13, #0x1
169+
cbnz x13, polyvec_basemul_acc_montgomery_cached_asm_k2_loop
170+
smull2 v5.4s, v3.8h, v17.8h
171+
smlal v12.4s, v14.4h, v0.4h
172+
smlal v23.4s, v10.4h, v4.4h
173+
str q9, [x0, #0x10]
174+
smlal2 v5.4s, v10.8h, v4.8h
175+
uzp2 v11.8h, v12.8h, v11.8h
176+
zip1 v9.8h, v19.8h, v27.8h
177+
smlal v23.4s, v13.4h, v15.4h
178+
smlal2 v5.4s, v13.8h, v15.8h
179+
str q9, [x0], #0x20
180+
smlal v23.4s, v28.4h, v22.4h
181+
smlal2 v5.4s, v28.8h, v22.8h
182+
uzp1 v9.8h, v23.8h, v5.8h
183+
mul v9.8h, v9.8h, v2.8h
184+
smlal2 v5.4s, v9.8h, v0.8h
185+
smlal v23.4s, v9.4h, v0.4h
186+
uzp2 v9.8h, v23.8h, v5.8h
187+
zip2 v5.8h, v9.8h, v11.8h
188+
zip1 v9.8h, v9.8h, v11.8h
189+
str q5, [x0, #0x10]
190+
str q9, [x0], #0x20
191+
ldp d8, d9, [sp]
192+
ldp d10, d11, [sp, #0x10]
193+
ldp d12, d13, [sp, #0x20]
194+
ldp d14, d15, [sp, #0x30]
195+
add sp, sp, #0x40
196+
ret

0 commit comments

Comments
 (0)