Skip to content

Commit 7b33a4c

Browse files
jammychiou1 authored and mkannwischer committed
Add bounds reasoning comments to AVX2 basemul
Improve some existing comments while we're at it.

Signed-off-by: jammychiou1 <[email protected]>
1 parent 0d1562e commit 7b33a4c

File tree

4 files changed

+109
-10
lines changed

4 files changed

+109
-10
lines changed

dev/x86_64/src/pointwise.S

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ MLD_ASM_FN_SYMBOL(pointwise_avx2)
4848

4949
xor eax, eax
5050
_looptop1:
51+
// Handle 24 = 3*8 coefficients per iteration
52+
5153
// Load
5254
vmovdqa ymm2, [rsi]
5355
vmovdqa ymm4, [rsi + 32]
@@ -61,6 +63,12 @@ _looptop1:
6163
vpsrlq ymm11, ymm10, 32
6264
vpsrlq ymm13, ymm12, 32
6365
vmovshdup ymm15, ymm14
66+
/*
67+
* ymm{i} stores a's coefficients for i in 2...7, and b's coefficients
68+
* for i in 10...15.
69+
*
70+
* Bounds: |ymm{i}| < 9q for i in 2...7, 10...15
71+
*/
6472

6573
// Multiply
6674
vpmuldq ymm2, ymm2, ymm10
@@ -69,6 +77,10 @@ _looptop1:
6977
vpmuldq ymm5, ymm5, ymm13
7078
vpmuldq ymm6, ymm6, ymm14
7179
vpmuldq ymm7, ymm7, ymm15
80+
/*
81+
* Bounds: |ymm{i}| < 81q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX
82+
* for i in 2...7
83+
*/
7284

7385
// Reduce
7486
vpmuldq ymm10, ymm0, ymm2
@@ -89,11 +101,18 @@ _looptop1:
89101
vpsubq ymm5, ymm5, ymm13
90102
vpsubq ymm6, ymm6, ymm14
91103
vpsubq ymm7, ymm7, ymm15
104+
/*
105+
* All coefficients are Montgomery-reduced, resulting in
106+
*
107+
* Bounds: |ymm{i}| < q for i in 2...7
108+
*
109+
* See description of mld_montgomery_reduce() in mldsa/src/reduce.h.
110+
*/
111+
112+
// Store
92113
vpsrlq ymm2, ymm2, 32
93114
vpsrlq ymm4, ymm4, 32
94115
vmovshdup ymm6, ymm6
95-
96-
// Store
97116
vpblendd ymm2, ymm2, ymm3, 0xAA
98117
vpblendd ymm4, ymm4, ymm5, 0xAA
99118
vpblendd ymm6, ymm6, ymm7, 0xAA
@@ -108,6 +127,10 @@ _looptop1:
108127
cmp eax, 10
109128
jb _looptop1
110129

130+
131+
// Handle the last 256 % 24 = 16 = 2*8 coefficients, left over by the loop
132+
133+
// Load
111134
vmovdqa ymm2, [rsi]
112135
vmovdqa ymm4, [rsi + 32]
113136
vmovdqa ymm10, [rdx]
@@ -116,12 +139,22 @@ _looptop1:
116139
vpsrlq ymm5, ymm4, 32
117140
vmovshdup ymm11, ymm10
118141
vmovshdup ymm13, ymm12
142+
/*
143+
* ymm{i} stores a's coefficients for i in 2...5, and b's coefficients
144+
* for i in 10...13.
145+
*
146+
* Bounds: |ymm{i}| < 9q for i in 2...5, 10...13
147+
*/
119148

120149
// Multiply
121150
vpmuldq ymm2, ymm2, ymm10
122151
vpmuldq ymm3, ymm3, ymm11
123152
vpmuldq ymm4, ymm4, ymm12
124153
vpmuldq ymm5, ymm5, ymm13
154+
/*
155+
* Bounds: |ymm{i}| < 81q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX
156+
* for i in 2...5
157+
*/
125158

126159
// Reduce
127160
vpmuldq ymm10, ymm0, ymm2
@@ -136,10 +169,15 @@ _looptop1:
136169
vpsubq ymm3, ymm3, ymm11
137170
vpsubq ymm4, ymm4, ymm12
138171
vpsubq ymm5, ymm5, ymm13
139-
vpsrlq ymm2, ymm2, 32
140-
vmovshdup ymm4, ymm4
172+
/*
173+
* As explained in the loop.
174+
*
175+
* Bounds: |ymm{i}| < q for i in 2...5
176+
*/
141177

142178
// Store
179+
vpsrlq ymm2, ymm2, 32
180+
vmovshdup ymm4, ymm4
143181
vpblendd ymm2, ymm3, ymm2, 0x55
144182
vpblendd ymm4, ymm5, ymm4, 0x55
145183
vmovdqa [rdi], ymm2

dev/x86_64/src/pointwise_acc_l4.S

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,20 @@
3737
vpsrlq ymm9, ymm8, 32
3838
vmovshdup ymm11, ymm10
3939
vmovshdup ymm13, ymm12
40+
/*
41+
* ymm{i} stores a's coefficients for i in 6...9, and b's coefficients
42+
* for i in 10...13.
43+
*
44+
* Bounds: |ymm{i}| < q for i in 6...9
45+
* < 9q for i in 10...13
46+
*/
4047

4148
// Multiply
4249
vpmuldq ymm6, ymm6, ymm10
4350
vpmuldq ymm7, ymm7, ymm11
4451
vpmuldq ymm8, ymm8, ymm12
4552
vpmuldq ymm9, ymm9, ymm13
53+
/* Bounds: |ymm{i}| < 9q^2 for i in 6...9 */
4654
.endm
4755

4856
.macro acc
@@ -80,15 +88,19 @@ _looptop2:
8088
vmovdqa ymm3, ymm7
8189
vmovdqa ymm4, ymm8
8290
vmovdqa ymm5, ymm9
91+
/* Bounds: |ymm{i}| < 9q^2 */
8392

8493
pointwise 1024
8594
acc
95+
/* Bounds: |ymm{i}| < 18q^2 */
8696

8797
pointwise 2048
8898
acc
99+
/* Bounds: |ymm{i}| < 27q^2 */
89100

90101
pointwise 3072
91102
acc
103+
/* Bounds: |ymm{i}| < 36q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX */
92104

93105
// Reduce
94106
vpmuldq ymm6, ymm0, ymm2
@@ -103,10 +115,17 @@ _looptop2:
103115
vpsubq ymm3, ymm3, ymm7
104116
vpsubq ymm4, ymm4, ymm8
105117
vpsubq ymm5, ymm5, ymm9
106-
vpsrlq ymm2, ymm2, 32
107-
vmovshdup ymm4, ymm4
118+
/*
119+
* All coefficients are Montgomery-reduced, resulting in
120+
*
121+
* Bounds: |ymm{i}| < q for i in 2...5
122+
*
123+
* See description of mld_montgomery_reduce() in mldsa/src/reduce.h.
124+
*/
108125

109126
// Store
127+
vpsrlq ymm2, ymm2, 32
128+
vmovshdup ymm4, ymm4
110129
vpblendd ymm2, ymm2, ymm3, 0xAA
111130
vpblendd ymm4, ymm4, ymm5, 0xAA
112131

dev/x86_64/src/pointwise_acc_l5.S

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,20 @@
3737
vpsrlq ymm9, ymm8, 32
3838
vmovshdup ymm11, ymm10
3939
vmovshdup ymm13, ymm12
40+
/*
41+
* ymm{i} stores a's coefficients for i in 6...9, and b's coefficients
42+
* for i in 10...13.
43+
*
44+
* Bounds: |ymm{i}| < q for i in 6...9
45+
* < 9q for i in 10...13
46+
*/
4047

4148
// Multiply
4249
vpmuldq ymm6, ymm6, ymm10
4350
vpmuldq ymm7, ymm7, ymm11
4451
vpmuldq ymm8, ymm8, ymm12
4552
vpmuldq ymm9, ymm9, ymm13
53+
/* Bounds: |ymm{i}| < 9q^2 for i in 6...9 */
4654
.endm
4755

4856
.macro acc
@@ -80,18 +88,23 @@ _looptop2:
8088
vmovdqa ymm3, ymm7
8189
vmovdqa ymm4, ymm8
8290
vmovdqa ymm5, ymm9
91+
/* Bounds: |ymm{i}| < 9q^2 */
8392

8493
pointwise 1024
8594
acc
95+
/* Bounds: |ymm{i}| < 18q^2 */
8696

8797
pointwise 2048
8898
acc
99+
/* Bounds: |ymm{i}| < 27q^2 */
89100

90101
pointwise 3072
91102
acc
103+
/* Bounds: |ymm{i}| < 36q^2 */
92104

93105
pointwise 4096
94106
acc
107+
/* Bounds: |ymm{i}| < 45q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX */
95108

96109
// Reduce
97110
vpmuldq ymm6, ymm0, ymm2
@@ -106,10 +119,17 @@ _looptop2:
106119
vpsubq ymm3, ymm3, ymm7
107120
vpsubq ymm4, ymm4, ymm8
108121
vpsubq ymm5, ymm5, ymm9
109-
vpsrlq ymm2, ymm2, 32
110-
vmovshdup ymm4, ymm4
122+
/*
123+
* All coefficients are Montgomery-reduced, resulting in
124+
*
125+
* Bounds: |ymm{i}| < q for i in 2...5
126+
*
127+
* See description of mld_montgomery_reduce() in mldsa/src/reduce.h.
128+
*/
111129

112130
// Store
131+
vpsrlq ymm2, ymm2, 32
132+
vmovshdup ymm4, ymm4
113133
vpblendd ymm2, ymm2, ymm3, 0xAA
114134
vpblendd ymm4, ymm4, ymm5, 0xAA
115135

dev/x86_64/src/pointwise_acc_l7.S

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,20 @@
3737
vpsrlq ymm9, ymm8, 32
3838
vmovshdup ymm11, ymm10
3939
vmovshdup ymm13, ymm12
40+
/*
41+
* ymm{i} stores a's coefficients for i in 6...9, and b's coefficients
42+
* for i in 10...13.
43+
*
44+
* Bounds: |ymm{i}| < q for i in 6...9
45+
* < 9q for i in 10...13
46+
*/
4047

4148
// Multiply
4249
vpmuldq ymm6, ymm6, ymm10
4350
vpmuldq ymm7, ymm7, ymm11
4451
vpmuldq ymm8, ymm8, ymm12
4552
vpmuldq ymm9, ymm9, ymm13
53+
/* Bounds: |ymm{i}| < 9q^2 for i in 6...9 */
4654
.endm
4755

4856
.macro acc
@@ -80,24 +88,31 @@ _looptop2:
8088
vmovdqa ymm3, ymm7
8189
vmovdqa ymm4, ymm8
8290
vmovdqa ymm5, ymm9
91+
/* Bounds: |ymm{i}| < 9q^2 */
8392

8493
pointwise 1024
8594
acc
95+
/* Bounds: |ymm{i}| < 18q^2 */
8696

8797
pointwise 2048
8898
acc
99+
/* Bounds: |ymm{i}| < 27q^2 */
89100

90101
pointwise 3072
91102
acc
103+
/* Bounds: |ymm{i}| < 36q^2 */
92104

93105
pointwise 4096
94106
acc
107+
/* Bounds: |ymm{i}| < 45q^2 */
95108

96109
pointwise 5120
97110
acc
111+
/* Bounds: |ymm{i}| < 54q^2 */
98112

99113
pointwise 6144
100114
acc
115+
/* Bounds: |ymm{i}| < 63q^2 < MONTGOMERY_REDUCE_STRONG_DOMAIN_MAX */
101116

102117
// Reduce
103118
vpmuldq ymm6, ymm0, ymm2
@@ -112,10 +127,17 @@ _looptop2:
112127
vpsubq ymm3, ymm3, ymm7
113128
vpsubq ymm4, ymm4, ymm8
114129
vpsubq ymm5, ymm5, ymm9
115-
vpsrlq ymm2, ymm2, 32
116-
vmovshdup ymm4, ymm4
130+
/*
131+
* All coefficients are Montgomery-reduced, resulting in
132+
*
133+
* Bounds: |ymm{i}| < q for i in 2...5
134+
*
135+
* See description of mld_montgomery_reduce() in mldsa/src/reduce.h.
136+
*/
117137

118138
// Store
139+
vpsrlq ymm2, ymm2, 32
140+
vmovshdup ymm4, ymm4
119141
vpblendd ymm2, ymm2, ymm3, 0xAA
120142
vpblendd ymm4, ymm4, ymm5, 0xAA
121143

0 commit comments

Comments
 (0)