Skip to content

Commit 901e139

Browse files
authored
[SelectionDAG] Remove UnsafeFPMath check in visitFADDForFMACombine (#127770)
As requested in #127488, remove the reference to `Options.UnsafeFPMath` in `visitFADDForFMACombine`: the option should be obsolete, and `AllowFPOpFusion` already covers the same case.
1 parent 948cc91 commit 901e139

File tree

11 files changed

+329
-110
lines changed

11 files changed

+329
-110
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16772,8 +16772,8 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
1677216772
if (!HasFMAD && !HasFMA)
1677316773
return SDValue();
1677416774

16775-
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
16776-
Options.UnsafeFPMath || HasFMAD);
16775+
bool AllowFusionGlobally =
16776+
Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
1677716777
// If the addition is not contractable, do not combine.
1677816778
if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
1677916779
return SDValue();
@@ -17982,6 +17982,7 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
1798217982
SDValue N2 = N->getOperand(2);
1798317983
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
1798417984
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
17985+
ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
1798517986
EVT VT = N->getValueType(0);
1798617987
SDLoc DL(N);
1798717988
const TargetOptions &Options = DAG.getTarget().Options;
@@ -18011,11 +18012,17 @@ template <class MatchContextClass> SDValue DAGCombiner::visitFMA(SDNode *N) {
1801118012
}
1801218013

1801318014
// FIXME: use fast math flags instead of Options.UnsafeFPMath
18014-
if (Options.UnsafeFPMath) {
18015-
if (N0CFP && N0CFP->isZero())
18016-
return N2;
18017-
if (N1CFP && N1CFP->isZero())
18018-
return N2;
18015+
// TODO: Finally migrate away from global TargetOptions.
18016+
if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
18017+
(Options.NoNaNsFPMath && Options.NoInfsFPMath) ||
18018+
(N->getFlags().hasNoNaNs() && N->getFlags().hasNoInfs())) {
18019+
if (Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros() ||
18020+
(N2CFP && !N2CFP->isExactlyValue(-0.0))) {
18021+
if (N0CFP && N0CFP->isZero())
18022+
return N2;
18023+
if (N1CFP && N1CFP->isZero())
18024+
return N2;
18025+
}
1801918026
}
1802018027

1802118028
// FIXME: Support splat of constant.
Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=arm64 -fp-contract=fast -o - %s | FileCheck %s
2+
; RUN: llc -mtriple=arm64 -o - %s | FileCheck %s
33

44

55
; Make sure we don't try to fold an fneg into +0.0, creating an illegal constant
66
; -0.0. It's also good, though not essential, that we don't resort to a litpool.
77
define double @test_fms_fold(double %a, double %b) {
88
; CHECK-LABEL: test_fms_fold:
99
; CHECK: // %bb.0:
10-
; CHECK-NEXT: movi d2, #0000000000000000
11-
; CHECK-NEXT: fmul d1, d1, d2
12-
; CHECK-NEXT: fnmsub d0, d0, d2, d1
10+
; CHECK-NEXT: movi {{d[0-9]+}}, #0000000000000000
1311
; CHECK-NEXT: ret
14-
%mul = fmul double %a, 0.000000e+00
15-
%mul1 = fmul double %b, 0.000000e+00
12+
%mul = fmul fast double %a, 0.000000e+00
13+
%mul1 = fmul fast double %b, 0.000000e+00
1614
%sub = fsub double %mul, %mul1
1715
ret double %sub
1816
}

llvm/test/CodeGen/AMDGPU/fdot2.ll

Lines changed: 200 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,53 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
1+
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT
55
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906
66
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT
77
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT
8-
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
8+
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -mattr="+dot7-insts,-dot10-insts" -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DOT10-DISABLED
99
; (fadd (fmul S1.x, S2.x), (fadd (fmul S1.y, S2.y), z)) -> (fdot2 S1, S2, z)
1010

1111
; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions
1212
; are not converted from f16 to f32.
13-
; GCN-LABEL: {{^}}dotproduct_f16
13+
; GCN-LABEL: {{^}}dotproduct_f16_contract
1414
; GFX900: v_fma_f16
1515
; GFX900: v_fma_f16
1616

17-
; GFX906: v_mul_f16_e32
18-
; GFX906: v_mul_f16_e32
19-
2017
; GFX906-DL-UNSAFE: v_fma_f16
2118
; GFX10-CONTRACT: v_fmac_f16
2219

2320
; GFX906-CONTRACT: v_mac_f16_e32
2421
; GFX906-DENORM-CONTRACT: v_fma_f16
2522
; GFX906-DOT10-DISABLED: v_fma_f16
23+
24+
define amdgpu_kernel void @dotproduct_f16_contract(ptr addrspace(1) %src1,
25+
ptr addrspace(1) %src2,
26+
ptr addrspace(1) nocapture %dst) {
27+
entry:
28+
%src1.vec = load <2 x half>, ptr addrspace(1) %src1
29+
%src2.vec = load <2 x half>, ptr addrspace(1) %src2
30+
31+
%src1.el1 = extractelement <2 x half> %src1.vec, i64 0
32+
%src2.el1 = extractelement <2 x half> %src2.vec, i64 0
33+
34+
%src1.el2 = extractelement <2 x half> %src1.vec, i64 1
35+
%src2.el2 = extractelement <2 x half> %src2.vec, i64 1
36+
37+
%mul2 = fmul contract half %src1.el2, %src2.el2
38+
%mul1 = fmul contract half %src1.el1, %src2.el1
39+
%acc = load half, ptr addrspace(1) %dst, align 2
40+
%acc1 = fadd contract half %mul2, %acc
41+
%acc2 = fadd contract half %mul1, %acc1
42+
store half %acc2, ptr addrspace(1) %dst, align 2
43+
ret void
44+
}
45+
46+
; GCN-LABEL: {{^}}dotproduct_f16
47+
48+
; GFX906: v_mul_f16_e32
49+
; GFX906: v_mul_f16_e32
50+
2651
define amdgpu_kernel void @dotproduct_f16(ptr addrspace(1) %src1,
2752
ptr addrspace(1) %src2,
2853
ptr addrspace(1) nocapture %dst) {
@@ -45,18 +70,12 @@ entry:
4570
ret void
4671
}
4772

48-
4973
; We only want to generate fdot2 if:
5074
; - vector element of dot product is converted from f16 to f32, and
5175
; - the vectors are of type <2 x half>, and
5276
; - "dot10-insts" is enabled
5377

54-
; GCN-LABEL: {{^}}dotproduct_f16_f32
55-
; GFX900: v_mad_mix_f32
56-
; GFX900: v_mad_mix_f32
57-
58-
; GFX906: v_mad_f32
59-
; GFX906: v_mac_f32_e32
78+
; GCN-LABEL: {{^}}dotproduct_f16_f32_contract
6079

6180
; GFX906-DL-UNSAFE: v_dot2_f32_f16
6281
; GFX10-DL-UNSAFE: v_dot2c_f32_f16
@@ -65,6 +84,39 @@ entry:
6584

6685
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
6786
; GFX906-DOT10-DISABLED: v_fma_mix_f32
87+
define amdgpu_kernel void @dotproduct_f16_f32_contract(ptr addrspace(1) %src1,
88+
ptr addrspace(1) %src2,
89+
ptr addrspace(1) nocapture %dst) {
90+
entry:
91+
%src1.vec = load <2 x half>, ptr addrspace(1) %src1
92+
%src2.vec = load <2 x half>, ptr addrspace(1) %src2
93+
94+
%src1.el1 = extractelement <2 x half> %src1.vec, i64 0
95+
%csrc1.el1 = fpext half %src1.el1 to float
96+
%src2.el1 = extractelement <2 x half> %src2.vec, i64 0
97+
%csrc2.el1 = fpext half %src2.el1 to float
98+
99+
%src1.el2 = extractelement <2 x half> %src1.vec, i64 1
100+
%csrc1.el2 = fpext half %src1.el2 to float
101+
%src2.el2 = extractelement <2 x half> %src2.vec, i64 1
102+
%csrc2.el2 = fpext half %src2.el2 to float
103+
104+
%mul2 = fmul contract float %csrc1.el2, %csrc2.el2
105+
%mul1 = fmul contract float %csrc1.el1, %csrc2.el1
106+
%acc = load float, ptr addrspace(1) %dst, align 4
107+
%acc1 = fadd contract float %mul2, %acc
108+
%acc2 = fadd contract float %mul1, %acc1
109+
store float %acc2, ptr addrspace(1) %dst, align 4
110+
ret void
111+
}
112+
113+
; GCN-LABEL: {{^}}dotproduct_f16_f32
114+
; GFX900: v_mad_mix_f32
115+
; GFX900: v_mad_mix_f32
116+
117+
; GFX906: v_mad_f32
118+
; GFX906: v_mac_f32_e32
119+
68120
define amdgpu_kernel void @dotproduct_f16_f32(ptr addrspace(1) %src1,
69121
ptr addrspace(1) %src2,
70122
ptr addrspace(1) nocapture %dst) {
@@ -96,19 +148,46 @@ entry:
96148
; - the vectors are of type <2 x half>, and
97149
; - "dot10-insts" is enabled
98150

151+
; GCN-LABEL: {{^}}dotproduct_diffvecorder_contract
152+
; GFX906-DL-UNSAFE: v_dot2_f32_f16
153+
; GFX10-DL-UNSAFE: v_dot2c_f32_f16
154+
155+
; GFX906-CONTRACT: v_dot2_f32_f16
156+
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
157+
; GFX906-DOT10-DISABLED: v_fma_mix_f32
158+
define amdgpu_kernel void @dotproduct_diffvecorder_contract(ptr addrspace(1) %src1,
159+
ptr addrspace(1) %src2,
160+
ptr addrspace(1) nocapture %dst) {
161+
entry:
162+
%src1.vec = load <2 x half>, ptr addrspace(1) %src1
163+
%src2.vec = load <2 x half>, ptr addrspace(1) %src2
164+
165+
%src1.el1 = extractelement <2 x half> %src1.vec, i64 0
166+
%csrc1.el1 = fpext half %src1.el1 to float
167+
%src2.el1 = extractelement <2 x half> %src2.vec, i64 0
168+
%csrc2.el1 = fpext half %src2.el1 to float
169+
170+
%src1.el2 = extractelement <2 x half> %src1.vec, i64 1
171+
%csrc1.el2 = fpext half %src1.el2 to float
172+
%src2.el2 = extractelement <2 x half> %src2.vec, i64 1
173+
%csrc2.el2 = fpext half %src2.el2 to float
174+
175+
%mul2 = fmul contract float %csrc2.el2, %csrc1.el2
176+
%mul1 = fmul contract float %csrc1.el1, %csrc2.el1
177+
%acc = load float, ptr addrspace(1) %dst, align 4
178+
%acc1 = fadd contract float %mul2, %acc
179+
%acc2 = fadd contract float %mul1, %acc1
180+
store float %acc2, ptr addrspace(1) %dst, align 4
181+
ret void
182+
}
183+
99184
; GCN-LABEL: {{^}}dotproduct_diffvecorder
100185
; GFX900: v_mad_mix_f32
101186
; GFX900: v_mad_mix_f32
102187

103188
; GFX906: v_mad_f32
104189
; GFX906: v_mac_f32_e32
105190

106-
; GFX906-DL-UNSAFE: v_dot2_f32_f16
107-
; GFX10-DL-UNSAFE: v_dot2c_f32_f16
108-
109-
; GFX906-CONTRACT: v_dot2_f32_f16
110-
; GFX906-DENORM-CONTRACT: v_dot2_f32_f16
111-
; GFX906-DOT10-DISABLED: v_fma_mix_f32
112191
define amdgpu_kernel void @dotproduct_diffvecorder(ptr addrspace(1) %src1,
113192
ptr addrspace(1) %src2,
114193
ptr addrspace(1) nocapture %dst) {
@@ -136,17 +215,45 @@ entry:
136215
}
137216

138217
; Tests to make sure dot product is not generated when the vectors are not of <2 x half>.
139-
; GCN-LABEL: {{^}}dotproduct_v4f16
140-
; GFX900: v_mad_mix_f32
141-
142-
; GFX906: v_mad_f32
143-
; GFX906: v_mac_f32_e32
218+
; GCN-LABEL: {{^}}dotproduct_v4f16_contract
144219

145220
; GCN-DL-UNSAFE: v_fma_mix_f32
146221

147222
; GFX906-CONTRACT: v_fma_mix_f32
148223
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
149224
; GFX906-DOT10-DISABLED: v_fma_mix_f32
225+
define amdgpu_kernel void @dotproduct_v4f16_contract(ptr addrspace(1) %src1,
226+
ptr addrspace(1) %src2,
227+
ptr addrspace(1) nocapture %dst) {
228+
entry:
229+
%src1.vec = load <4 x half>, ptr addrspace(1) %src1
230+
%src2.vec = load <4 x half>, ptr addrspace(1) %src2
231+
232+
%src1.el1 = extractelement <4 x half> %src1.vec, i64 0
233+
%csrc1.el1 = fpext half %src1.el1 to float
234+
%src2.el1 = extractelement <4 x half> %src2.vec, i64 0
235+
%csrc2.el1 = fpext half %src2.el1 to float
236+
237+
%src1.el2 = extractelement <4 x half> %src1.vec, i64 1
238+
%csrc1.el2 = fpext half %src1.el2 to float
239+
%src2.el2 = extractelement <4 x half> %src2.vec, i64 1
240+
%csrc2.el2 = fpext half %src2.el2 to float
241+
242+
%mul2 = fmul contract float %csrc1.el2, %csrc2.el2
243+
%mul1 = fmul float %csrc1.el1, %csrc2.el1
244+
%acc = load float, ptr addrspace(1) %dst, align 4
245+
%acc1 = fadd contract float %mul2, %acc
246+
%acc2 = fadd contract float %mul1, %acc1
247+
store float %acc2, ptr addrspace(1) %dst, align 4
248+
ret void
249+
}
250+
251+
; GCN-LABEL: {{^}}dotproduct_v4f16
252+
; GFX900: v_mad_mix_f32
253+
254+
; GFX906: v_mad_f32
255+
; GFX906: v_mac_f32_e32
256+
150257
define amdgpu_kernel void @dotproduct_v4f16(ptr addrspace(1) %src1,
151258
ptr addrspace(1) %src2,
152259
ptr addrspace(1) nocapture %dst) {
@@ -173,18 +280,46 @@ entry:
173280
ret void
174281
}
175282

283+
; GCN-LABEL: {{^}}NotAdotproductContract
284+
285+
; GCN-DL-UNSAFE: v_fma_mix_f32
286+
287+
; GFX906-CONTRACT: v_fma_mix_f32
288+
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
289+
; GFX906-DOT10-DISABLED: v_fma_mix_f32
290+
define amdgpu_kernel void @NotAdotproductContract(ptr addrspace(1) %src1,
291+
ptr addrspace(1) %src2,
292+
ptr addrspace(1) nocapture %dst) {
293+
entry:
294+
%src1.vec = load <2 x half>, ptr addrspace(1) %src1
295+
%src2.vec = load <2 x half>, ptr addrspace(1) %src2
296+
297+
%src1.el1 = extractelement <2 x half> %src1.vec, i64 0
298+
%csrc1.el1 = fpext half %src1.el1 to float
299+
%src2.el1 = extractelement <2 x half> %src2.vec, i64 0
300+
%csrc2.el1 = fpext half %src2.el1 to float
301+
302+
%src1.el2 = extractelement <2 x half> %src1.vec, i64 1
303+
%csrc1.el2 = fpext half %src1.el2 to float
304+
%src2.el2 = extractelement <2 x half> %src2.vec, i64 1
305+
%csrc2.el2 = fpext half %src2.el2 to float
306+
307+
%mul2 = fmul contract float %csrc1.el2, %csrc1.el1
308+
%mul1 = fmul contract float %csrc2.el1, %csrc2.el2
309+
%acc = load float, ptr addrspace(1) %dst, align 4
310+
%acc1 = fadd contract float %mul2, %acc
311+
%acc2 = fadd contract float %mul1, %acc1
312+
store float %acc2, ptr addrspace(1) %dst, align 4
313+
ret void
314+
}
315+
176316
; GCN-LABEL: {{^}}NotAdotproduct
177317
; GFX900: v_mad_mix_f32
178318
; GFX900: v_mad_mix_f32
179319

180320
; GFX906: v_mad_f32
181321
; GFX906: v_mac_f32_e32
182322

183-
; GCN-DL-UNSAFE: v_fma_mix_f32
184-
185-
; GFX906-CONTRACT: v_fma_mix_f32
186-
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
187-
; GFX906-DOT10-DISABLED: v_fma_mix_f32
188323
define amdgpu_kernel void @NotAdotproduct(ptr addrspace(1) %src1,
189324
ptr addrspace(1) %src2,
190325
ptr addrspace(1) nocapture %dst) {
@@ -211,18 +346,46 @@ entry:
211346
ret void
212347
}
213348

349+
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproductContract
350+
351+
; GCN-DL-UNSAFE: v_fma_mix_f32
352+
353+
; GFX906-CONTRACT: v_fma_mix_f32
354+
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
355+
; GFX906-DOT10-DISABLED: v_fma_mix_f32
356+
define amdgpu_kernel void @Diff_Idx_NotAdotproductContract(ptr addrspace(1) %src1,
357+
ptr addrspace(1) %src2,
358+
ptr addrspace(1) nocapture %dst) {
359+
entry:
360+
%src1.vec = load <2 x half>, ptr addrspace(1) %src1
361+
%src2.vec = load <2 x half>, ptr addrspace(1) %src2
362+
363+
%src1.el1 = extractelement <2 x half> %src1.vec, i64 0
364+
%csrc1.el1 = fpext half %src1.el1 to float
365+
%src2.el1 = extractelement <2 x half> %src2.vec, i64 0
366+
%csrc2.el1 = fpext half %src2.el1 to float
367+
368+
%src1.el2 = extractelement <2 x half> %src1.vec, i64 1
369+
%csrc1.el2 = fpext half %src1.el2 to float
370+
%src2.el2 = extractelement <2 x half> %src2.vec, i64 1
371+
%csrc2.el2 = fpext half %src2.el2 to float
372+
373+
%mul2 = fmul contract float %csrc1.el2, %csrc2.el1
374+
%mul1 = fmul contract float %csrc1.el1, %csrc2.el2
375+
%acc = load float, ptr addrspace(1) %dst, align 4
376+
%acc1 = fadd contract float %mul2, %acc
377+
%acc2 = fadd contract float %mul1, %acc1
378+
store float %acc2, ptr addrspace(1) %dst, align 4
379+
ret void
380+
}
381+
214382
; GCN-LABEL: {{^}}Diff_Idx_NotAdotproduct
215383
; GFX900: v_mad_mix_f32
216384
; GFX900: v_mad_mix_f32
217385

218386
; GFX906: v_mad_f32
219387
; GFX906: v_mac_f32_e32
220388

221-
; GCN-DL-UNSAFE: v_fma_mix_f32
222-
223-
; GFX906-CONTRACT: v_fma_mix_f32
224-
; GFX906-DENORM-CONTRACT: v_fma_mix_f32
225-
; GFX906-DOT10-DISABLED: v_fma_mix_f32
226389
define amdgpu_kernel void @Diff_Idx_NotAdotproduct(ptr addrspace(1) %src1,
227390
ptr addrspace(1) %src2,
228391
ptr addrspace(1) nocapture %dst) {

0 commit comments

Comments
 (0)