1
- ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2
- ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3
- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4
- ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
1
+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2
+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3
+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4
+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
5
5
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
6
6
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
7
7
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
8
- ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math - mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
8
+ ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
9
9
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
10
10
11
11
; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
12
12
; are not converted from f16 to f32.
13
- ; GCN-LABEL: {{^}}dotproduct_f16
13
+ ; GCN-LABEL: {{^}}dotproduct_f16_contract
14
14
; GFX900: v_fma_f16
15
15
; GFX900: v_fma_f16
16
16
17
- ; GFX906: v_mul_f16_e32
18
- ; GFX906: v_mul_f16_e32
19
-
20
17
; GFX906-DL-UNSAFE: v_fma_f16
21
18
; GFX10-CONTRACT: v_fmac_f16
22
19
23
20
; GFX906-CONTRACT: v_mac_f16_e32
24
21
; GFX906-DENORM-CONTRACT: v_fma_f16
25
22
; GFX906-DOT10-DISABLED: v_fma_f16
23
+
24
; Half-precision two-lane multiply-accumulate using per-instruction `contract`
; fast-math flags. The lanes of the two <2 x half> inputs are multiplied as
; half (there is no fpext to float) and added to the half accumulator that is
; loaded from, and stored back to, %dst.
define amdgpu_kernel void @dotproduct_f16_contract(ptr addrspace(1) %src1,
                                                   ptr addrspace(1) %src2,
                                                   ptr addrspace(1) nocapture %dst) {
entry:
  %a = load <2 x half>, ptr addrspace(1) %src1
  %b = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input.
  %a.x = extractelement <2 x half> %a, i64 0
  %b.x = extractelement <2 x half> %b, i64 0

  ; Lane 1 of each input.
  %a.y = extractelement <2 x half> %a, i64 1
  %b.y = extractelement <2 x half> %b, i64 1

  ; acc + a.y*b.y + a.x*b.x, every operation marked contractable.
  %prod.y = fmul contract half %a.y, %b.y
  %prod.x = fmul contract half %a.x, %b.x
  %z = load half, ptr addrspace(1) %dst, align 2
  %sum.y = fadd contract half %prod.y, %z
  %sum.xy = fadd contract half %prod.x, %sum.y
  store half %sum.xy, ptr addrspace(1) %dst, align 2
  ret void
}
45
+
46
+ ; GCN-LABEL: {{^}}dotproduct_f16
47
+
48
+ ; GFX906: v_mul_f16_e32
49
+ ; GFX906: v_mul_f16_e32
50
+
26
51
define amdgpu_kernel void @dotproduct_f16 (ptr addrspace (1 ) %src1 ,
27
52
ptr addrspace (1 ) %src2 ,
28
53
ptr addrspace (1 ) nocapture %dst ) {
@@ -45,18 +70,12 @@ entry:
45
70
ret void
46
71
}
47
72
48
-
49
73
; We only want to generate fdot2 if:
50
74
; - vector element of dot product is converted from f16 to f32, and
51
75
; - the vectors are of type <2 x half>, and
52
76
; - "dot10-insts" is enabled
53
77
54
- ; GCN-LABEL: {{^}}dotproduct_f16_f32
55
- ; GFX900: v_mad_mix_f32
56
- ; GFX900: v_mad_mix_f32
57
-
58
- ; GFX906: v_mad_f32
59
- ; GFX906: v_mac_f32_e32
78
+ ; GCN-LABEL: {{^}}dotproduct_f16_f32_contract
60
79
61
80
; GFX906-DL-UNSAFE: v_dot2_f32_f16
62
81
; GFX10-DL-UNSAFE: v_dot2c_f32_f16
@@ -65,6 +84,39 @@ entry:
65
84
66
85
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
67
86
; GFX906-DOT10-DISABLED: v_fma_mix_f32
87
; Two-lane dot product with per-instruction `contract` flags: each lane of the
; two <2 x half> inputs is extended to float, corresponding lanes are
; multiplied, and both products are added to the float accumulator that is
; loaded from, and stored back to, %dst.
define amdgpu_kernel void @dotproduct_f16_f32_contract(ptr addrspace(1) %src1,
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
entry:
  %a = load <2 x half>, ptr addrspace(1) %src1
  %b = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened to float.
  %a.x = extractelement <2 x half> %a, i64 0
  %a.x.f32 = fpext half %a.x to float
  %b.x = extractelement <2 x half> %b, i64 0
  %b.x.f32 = fpext half %b.x to float

  ; Lane 1 of each input, widened to float.
  %a.y = extractelement <2 x half> %a, i64 1
  %a.y.f32 = fpext half %a.y to float
  %b.y = extractelement <2 x half> %b, i64 1
  %b.y.f32 = fpext half %b.y to float

  ; acc + a.y*b.y + a.x*b.x, every operation marked contractable.
  %prod.y = fmul contract float %a.y.f32, %b.y.f32
  %prod.x = fmul contract float %a.x.f32, %b.x.f32
  %z = load float, ptr addrspace(1) %dst, align 4
  %sum.y = fadd contract float %prod.y, %z
  %sum.xy = fadd contract float %prod.x, %sum.y
  store float %sum.xy, ptr addrspace(1) %dst, align 4
  ret void
}
112
+
113
+ ; GCN-LABEL: {{^}}dotproduct_f16_f32
114
+ ; GFX900: v_mad_mix_f32
115
+ ; GFX900: v_mad_mix_f32
116
+
117
+ ; GFX906: v_mad_f32
118
+ ; GFX906: v_mac_f32_e32
119
+
68
120
define amdgpu_kernel void @dotproduct_f16_f32 (ptr addrspace (1 ) %src1 ,
69
121
ptr addrspace (1 ) %src2 ,
70
122
ptr addrspace (1 ) nocapture %dst ) {
@@ -96,19 +148,46 @@ entry:
96
148
; - the vectors are of type <2 x half>, and
97
149
; - "dot10-insts" is enabled
98
150
151
+ ; GCN-LABEL: {{^}}dotproduct_diffvecorder_contract
152
+ ; GFX906-DL-UNSAFE: v_dot2_f32_f16
153
+ ; GFX10-DL-UNSAFE: v_dot2c_f32_f16
154
+
155
+ ; GFX906-CONTRACT: v_dot2_f32_f16
156
+ ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
157
+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32
158
; Same two-lane float dot product as the plain contract variant, except that
; the lane-1 multiply takes its operands in the opposite vector order
; (src2 lane first, then src1 lane). All fmul/fadd carry `contract`.
define amdgpu_kernel void @dotproduct_diffvecorder_contract(ptr addrspace(1) %src1,
                                                            ptr addrspace(1) %src2,
                                                            ptr addrspace(1) nocapture %dst) {
entry:
  %a = load <2 x half>, ptr addrspace(1) %src1
  %b = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened to float.
  %a.x = extractelement <2 x half> %a, i64 0
  %a.x.f32 = fpext half %a.x to float
  %b.x = extractelement <2 x half> %b, i64 0
  %b.x.f32 = fpext half %b.x to float

  ; Lane 1 of each input, widened to float.
  %a.y = extractelement <2 x half> %a, i64 1
  %a.y.f32 = fpext half %a.y to float
  %b.y = extractelement <2 x half> %b, i64 1
  %b.y.f32 = fpext half %b.y to float

  ; Note the swapped operand order on the lane-1 product: b.y * a.y.
  %prod.y = fmul contract float %b.y.f32, %a.y.f32
  %prod.x = fmul contract float %a.x.f32, %b.x.f32
  %z = load float, ptr addrspace(1) %dst, align 4
  %sum.y = fadd contract float %prod.y, %z
  %sum.xy = fadd contract float %prod.x, %sum.y
  store float %sum.xy, ptr addrspace(1) %dst, align 4
  ret void
}
183
+
99
184
; GCN-LABEL: {{^}}dotproduct_diffvecorder
100
185
; GFX900: v_mad_mix_f32
101
186
; GFX900: v_mad_mix_f32
102
187
103
188
; GFX906: v_mad_f32
104
189
; GFX906: v_mac_f32_e32
105
190
106
- ; GFX906-DL-UNSAFE: v_dot2_f32_f16
107
- ; GFX10-DL-UNSAFE: v_dot2c_f32_f16
108
-
109
- ; GFX906-CONTRACT: v_dot2_f32_f16
110
- ; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
111
- ; GFX906-DOT10-DISABLED: v_fma_mix_f32
112
191
define amdgpu_kernel void @dotproduct_diffvecorder (ptr addrspace (1 ) %src1 ,
113
192
ptr addrspace (1 ) %src2 ,
114
193
ptr addrspace (1 ) nocapture %dst ) {
@@ -136,17 +215,45 @@ entry:
136
215
}
137
216
138
217
; Tests to make sure dot product is not generated when the vectors are not of type <2 x half>.
139
- ; GCN-LABEL: {{^}}dotproduct_v4f16
140
- ; GFX900: v_mad_mix_f32
141
-
142
- ; GFX906: v_mad_f32
143
- ; GFX906: v_mac_f32_e32
218
+ ; GCN-LABEL: {{^}}dotproduct_v4f16_contract
144
219
145
220
; GCN-DL-UNSAFE: v_fma_mix_f32
146
221
147
222
; GFX906-CONTRACT: v_fma_mix_f32
148
223
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
149
224
; GFX906-DOT10-DISABLED: v_fma_mix_f32
225
; Dot-product-shaped expression over lanes 0 and 1 of two <4 x half> inputs,
; using per-instruction `contract` flags. Each extracted lane is extended to
; float, corresponding lanes are multiplied, and both products are added to
; the float accumulator that is loaded from, and stored back to, %dst.
;
; Every fmul and fadd carries `contract`, matching the other *_contract
; functions in this file, so the only thing that distinguishes this case from
; the <2 x half> dot product is the source vector type.
define amdgpu_kernel void @dotproduct_v4f16_contract(ptr addrspace(1) %src1,
                                                     ptr addrspace(1) %src2,
                                                     ptr addrspace(1) nocapture %dst) {
entry:
  %src1.vec = load <4 x half>, ptr addrspace(1) %src1
  %src2.vec = load <4 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened to float.
  %src1.el1 = extractelement <4 x half> %src1.vec, i64 0
  %csrc1.el1 = fpext half %src1.el1 to float
  %src2.el1 = extractelement <4 x half> %src2.vec, i64 0
  %csrc2.el1 = fpext half %src2.el1 to float

  ; Lane 1 of each input, widened to float.
  %src1.el2 = extractelement <4 x half> %src1.vec, i64 1
  %csrc1.el2 = fpext half %src1.el2 to float
  %src2.el2 = extractelement <4 x half> %src2.vec, i64 1
  %csrc2.el2 = fpext half %src2.el2 to float

  ; acc + el2*el2 + el1*el1. %mul1 now carries `contract` like %mul2; without
  ; it the lane-0 product was the only non-contractable operation in the
  ; *_contract functions.
  %mul2 = fmul contract float %csrc1.el2, %csrc2.el2
  %mul1 = fmul contract float %csrc1.el1, %csrc2.el1
  %acc = load float, ptr addrspace(1) %dst, align 4
  %acc1 = fadd contract float %mul2, %acc
  %acc2 = fadd contract float %mul1, %acc1
  store float %acc2, ptr addrspace(1) %dst, align 4
  ret void
}
250
+
251
+ ; GCN-LABEL: {{^}}dotproduct_v4f16
252
+ ; GFX900: v_mad_mix_f32
253
+
254
+ ; GFX906: v_mad_f32
255
+ ; GFX906: v_mac_f32_e32
256
+
150
257
define amdgpu_kernel void @dotproduct_v4f16 (ptr addrspace (1 ) %src1 ,
151
258
ptr addrspace (1 ) %src2 ,
152
259
ptr addrspace (1 ) nocapture %dst ) {
@@ -173,18 +280,46 @@ entry:
173
280
ret void
174
281
}
175
282
283
+ ; GCN-LABEL: {{^}}NotAdotproductContract
284
+
285
+ ; GCN-DL-UNSAFE: v_fma_mix_f32
286
+
287
+ ; GFX906-CONTRACT: v_fma_mix_f32
288
+ ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
289
+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32
290
; Multiply-accumulate that is deliberately not a dot product of %src1 and
; %src2: each product multiplies the two lanes of a single input vector
; (a.y*a.x and b.x*b.y). All fmul/fadd carry `contract`; the result is added
; to the float accumulator that is loaded from, and stored back to, %dst.
define amdgpu_kernel void @NotAdotproductContract(ptr addrspace(1) %src1,
                                                  ptr addrspace(1) %src2,
                                                  ptr addrspace(1) nocapture %dst) {
entry:
  %a = load <2 x half>, ptr addrspace(1) %src1
  %b = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened to float.
  %a.x = extractelement <2 x half> %a, i64 0
  %a.x.f32 = fpext half %a.x to float
  %b.x = extractelement <2 x half> %b, i64 0
  %b.x.f32 = fpext half %b.x to float

  ; Lane 1 of each input, widened to float.
  %a.y = extractelement <2 x half> %a, i64 1
  %a.y.f32 = fpext half %a.y to float
  %b.y = extractelement <2 x half> %b, i64 1
  %b.y.f32 = fpext half %b.y to float

  ; Both factors of each product come from the same vector.
  %prod.a = fmul contract float %a.y.f32, %a.x.f32
  %prod.b = fmul contract float %b.x.f32, %b.y.f32
  %z = load float, ptr addrspace(1) %dst, align 4
  %sum.a = fadd contract float %prod.a, %z
  %sum.ab = fadd contract float %prod.b, %sum.a
  store float %sum.ab, ptr addrspace(1) %dst, align 4
  ret void
}
315
+
176
316
; GCN-LABEL: {{^}}NotAdotproduct
177
317
; GFX900: v_mad_mix_f32
178
318
; GFX900: v_mad_mix_f32
179
319
180
320
; GFX906: v_mad_f32
181
321
; GFX906: v_mac_f32_e32
182
322
183
- ; GCN-DL-UNSAFE: v_fma_mix_f32
184
-
185
- ; GFX906-CONTRACT: v_fma_mix_f32
186
- ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
187
- ; GFX906-DOT10-DISABLED: v_fma_mix_f32
188
323
define amdgpu_kernel void @NotAdotproduct (ptr addrspace (1 ) %src1 ,
189
324
ptr addrspace (1 ) %src2 ,
190
325
ptr addrspace (1 ) nocapture %dst ) {
@@ -211,18 +346,46 @@ entry:
211
346
ret void
212
347
}
213
348
349
+ ; GCN-LABEL: {{^}}Diff_Idx_NotAdotproductContract
350
+
351
+ ; GCN-DL-UNSAFE: v_fma_mix_f32
352
+
353
+ ; GFX906-CONTRACT: v_fma_mix_f32
354
+ ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
355
+ ; GFX906-DOT10-DISABLED: v_fma_mix_f32
356
; Multiply-accumulate that is deliberately not a dot product: each product
; pairs mismatched lane indices across the two inputs (a.y*b.x and a.x*b.y).
; All fmul/fadd carry `contract`; the result is added to the float
; accumulator that is loaded from, and stored back to, %dst.
define amdgpu_kernel void @Diff_Idx_NotAdotproductContract(ptr addrspace(1) %src1,
                                                           ptr addrspace(1) %src2,
                                                           ptr addrspace(1) nocapture %dst) {
entry:
  %a = load <2 x half>, ptr addrspace(1) %src1
  %b = load <2 x half>, ptr addrspace(1) %src2

  ; Lane 0 of each input, widened to float.
  %a.x = extractelement <2 x half> %a, i64 0
  %a.x.f32 = fpext half %a.x to float
  %b.x = extractelement <2 x half> %b, i64 0
  %b.x.f32 = fpext half %b.x to float

  ; Lane 1 of each input, widened to float.
  %a.y = extractelement <2 x half> %a, i64 1
  %a.y.f32 = fpext half %a.y to float
  %b.y = extractelement <2 x half> %b, i64 1
  %b.y.f32 = fpext half %b.y to float

  ; Cross-lane products: lane 1 of src1 with lane 0 of src2, and vice versa.
  %prod.yx = fmul contract float %a.y.f32, %b.x.f32
  %prod.xy = fmul contract float %a.x.f32, %b.y.f32
  %z = load float, ptr addrspace(1) %dst, align 4
  %sum.yx = fadd contract float %prod.yx, %z
  %sum.all = fadd contract float %prod.xy, %sum.yx
  store float %sum.all, ptr addrspace(1) %dst, align 4
  ret void
}
381
+
214
382
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
215
383
; GFX900: v_mad_mix_f32
216
384
; GFX900: v_mad_mix_f32
217
385
218
386
; GFX906: v_mad_f32
219
387
; GFX906: v_mac_f32_e32
220
388
221
- ; GCN-DL-UNSAFE: v_fma_mix_f32
222
-
223
- ; GFX906-CONTRACT: v_fma_mix_f32
224
- ; GFX906-DENORM-CONTRACT: v_fma_mix_f32
225
- ; GFX906-DOT10-DISABLED: v_fma_mix_f32
226
389
define amdgpu_kernel void @Diff_Idx_NotAdotproduct (ptr addrspace (1 ) %src1 ,
227
390
ptr addrspace (1 ) %src2 ,
228
391
ptr addrspace (1 ) nocapture %dst ) {
0 commit comments