Skip to content

Commit 62651dd

Browse files
authored
[SYCL][CUDA][libclc] Add bf16 builtins and optimize half builtins for fma, fmin, fmax and fmax (#5724)
For functions fma, fmin, fmax and fmax adds bf16 builtins to libclc and optimizes half builtins to use half instructions if supported by the device. This PR also contains some changes (everything in clang folder) that have been merged in upstream llvm since last pulldown and are required for building it. There are parts of (something went wrong when merging these, so only parts were merged at first. The changes in this PR are the remainder): https://reviews.llvm.org/D118977 https://reviews.llvm.org/D117887 https://reviews.llvm.org/D119157 Tests for half changes are in intel/llvm-test-suite#880. Tests for bf16 implementations will be added together with adding support for these to runtime in future PRs.
1 parent 647249c commit 62651dd

File tree

9 files changed

+655
-18
lines changed

9 files changed

+655
-18
lines changed

clang/include/clang/Basic/BuiltinsNVPTX.def

Lines changed: 110 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
1818
#endif
1919

20+
#pragma push_macro("SM_53")
2021
#pragma push_macro("SM_70")
2122
#pragma push_macro("SM_72")
2223
#pragma push_macro("SM_75")
@@ -30,7 +31,9 @@
3031

3132
#pragma push_macro("SM_60")
3233
#define SM_60 "sm_60|sm_61|sm_62|" SM_70
34+
#define SM_53 "sm_53|" SM_60
3335

36+
#pragma push_macro("PTX42")
3437
#pragma push_macro("PTX60")
3538
#pragma push_macro("PTX61")
3639
#pragma push_macro("PTX63")
@@ -53,6 +56,7 @@
5356
#define PTX63 "ptx63|" PTX64
5457
#define PTX61 "ptx61|" PTX63
5558
#define PTX60 "ptx60|" PTX61
59+
#define PTX42 "ptx42|" PTX60
5660

5761
#pragma push_macro("AND")
5862
#define AND(a, b) "(" a "),(" b ")"
@@ -110,13 +114,89 @@ BUILTIN(__nvvm_prmt, "UiUiUiUi", "")
110114

111115
// Min Max
112116

113-
BUILTIN(__nvvm_fmax_ftz_f, "fff", "")
114-
BUILTIN(__nvvm_fmax_f, "fff", "")
115-
BUILTIN(__nvvm_fmin_ftz_f, "fff", "")
116-
BUILTIN(__nvvm_fmin_f, "fff", "")
117+
TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70))
118+
TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70))
119+
TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70))
120+
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70))
121+
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
122+
TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
123+
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
124+
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "",
125+
AND(SM_86, PTX72))
126+
TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
127+
TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
128+
TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
129+
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
130+
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "",
131+
AND(SM_86, PTX72))
132+
TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "",
133+
AND(SM_86, PTX72))
134+
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
135+
AND(SM_86, PTX72))
136+
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
137+
AND(SM_86, PTX72))
138+
TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80, PTX70))
139+
TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70))
140+
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72))
141+
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "UsUsUs", "",
142+
AND(SM_86, PTX72))
143+
TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
144+
TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
145+
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
146+
AND(SM_86, PTX72))
147+
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
148+
AND(SM_86, PTX72))
149+
BUILTIN(__nvvm_fmin_f, "fff", "")
150+
BUILTIN(__nvvm_fmin_ftz_f, "fff", "")
151+
TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70))
152+
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70))
153+
TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
154+
TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
155+
TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
156+
TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
157+
BUILTIN(__nvvm_fmin_d, "ddd", "")
117158

159+
TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80, PTX70))
160+
TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70))
161+
TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70))
162+
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70))
163+
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
164+
TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
165+
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
166+
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "",
167+
AND(SM_86, PTX72))
168+
TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
169+
TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
170+
TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
171+
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
172+
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "",
173+
AND(SM_86, PTX72))
174+
TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "",
175+
AND(SM_86, PTX72))
176+
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
177+
AND(SM_86, PTX72))
178+
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
179+
AND(SM_86, PTX72))
180+
TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80, PTX70))
181+
TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70))
182+
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72))
183+
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "UsUsUs", "",
184+
AND(SM_86, PTX72))
185+
TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
186+
TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
187+
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
188+
AND(SM_86, PTX72))
189+
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
190+
AND(SM_86, PTX72))
191+
BUILTIN(__nvvm_fmax_f, "fff", "")
192+
BUILTIN(__nvvm_fmax_ftz_f, "fff", "")
193+
TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70))
194+
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70))
195+
TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
196+
TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
197+
TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
198+
TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
118199
BUILTIN(__nvvm_fmax_d, "ddd", "")
119-
BUILTIN(__nvvm_fmin_d, "ddd", "")
120200

121201
// Multiplication
122202

@@ -228,6 +308,22 @@ TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
228308

229309
// Fma
230310

311+
TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42))
312+
TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42))
313+
TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42))
314+
TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42))
315+
TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70))
316+
TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70))
317+
TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
318+
TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
319+
TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
320+
TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
321+
TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70))
322+
TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70))
323+
TARGET_BUILTIN(__nvvm_fma_rn_bf16, "UsUsUsUs", "", AND(SM_80, PTX70))
324+
TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "UsUsUsUs", "", AND(SM_80, PTX70))
325+
TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70))
326+
TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70))
231327
BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
232328
BUILTIN(__nvvm_fma_rn_f, "ffff", "")
233329
BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "")
@@ -2309,15 +2405,24 @@ TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
23092405
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
23102406
TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))
23112407

2408+
2409+
// bf16, bf16x2 abs, neg
2410+
TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70))
2411+
TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
2412+
TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70))
2413+
TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
2414+
23122415
#undef BUILTIN
23132416
#undef TARGET_BUILTIN
23142417
#pragma pop_macro("AND")
2418+
#pragma pop_macro("SM_53")
23152419
#pragma pop_macro("SM_60")
23162420
#pragma pop_macro("SM_70")
23172421
#pragma pop_macro("SM_72")
23182422
#pragma pop_macro("SM_75")
23192423
#pragma pop_macro("SM_80")
23202424
#pragma pop_macro("SM_86")
2425+
#pragma pop_macro("PTX42")
23212426
#pragma pop_macro("PTX60")
23222427
#pragma pop_macro("PTX61")
23232428
#pragma pop_macro("PTX63")
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
// REQUIRES: nvptx-registered-target
2+
//
3+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
4+
// RUN: sm_75 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
5+
// RUN: -emit-llvm -o - -x cuda %s \
6+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM75 %s
7+
8+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
9+
// RUN: sm_80 -target-feature +ptx70 -fcuda-is-device -fnative-half-type -S \
10+
// RUN: -emit-llvm -o - -x cuda %s \
11+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s
12+
13+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
14+
// RUN: -target-cpu sm_80 -target-feature +ptx70 -fcuda-is-device \
15+
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \
16+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 %s
17+
18+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
19+
// RUN: sm_86 -target-feature +ptx72 -fcuda-is-device -fnative-half-type -S \
20+
// RUN: -emit-llvm -o - -x cuda %s \
21+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s
22+
23+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
24+
// RUN: -target-cpu sm_86 -target-feature +ptx72 -fcuda-is-device \
25+
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \
26+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 %s
27+
28+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu \
29+
// RUN: sm_53 -target-feature +ptx42 -fcuda-is-device -fnative-half-type -S \
30+
// RUN: -emit-llvm -o - -x cuda %s \
31+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s
32+
33+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown \
34+
// RUN: -target-cpu sm_53 -target-feature +ptx42 -fcuda-is-device \
35+
// RUN: -fnative-half-type -S -emit-llvm -o - -x cuda %s \
36+
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX42_SM53 %s
37+
38+
#define __device__ __attribute__((device))
39+
40+
__device__ void nvvm_ex2_sm75() {
41+
#if __CUDA_ARCH__ >= 750
42+
// CHECK_PTX70_SM75: call half @llvm.nvvm.ex2.approx.f16
43+
__nvvm_ex2_approx_f16(0.1f16);
44+
// CHECK_PTX70_SM75: call <2 x half> @llvm.nvvm.ex2.approx.f16x2
45+
__nvvm_ex2_approx_f16x2({0.1f16, 0.7f16});
46+
#endif
47+
// CHECK: ret void
48+
}
49+
50+
// CHECK-LABEL: nvvm_min_max_sm80
51+
__device__ void nvvm_min_max_sm80() {
52+
#if __CUDA_ARCH__ >= 800
53+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.f16
54+
__nvvm_fmin_f16(0.1f16, 0.1f16);
55+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.f16
56+
__nvvm_fmin_ftz_f16(0.1f16, 0.1f16);
57+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.nan.f16
58+
__nvvm_fmin_nan_f16(0.1f16, 0.1f16);
59+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmin.ftz.nan.f16
60+
__nvvm_fmin_ftz_nan_f16(0.1f16, 0.1f16);
61+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.f16x2
62+
__nvvm_fmin_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
63+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.f16x2
64+
__nvvm_fmin_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
65+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.nan.f16x2
66+
__nvvm_fmin_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
67+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmin.ftz.nan.f16x2
68+
__nvvm_fmin_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
69+
70+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.f16
71+
__nvvm_fmax_f16(0.1f16, 0.1f16);
72+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.f16
73+
__nvvm_fmax_ftz_f16(0.1f16, 0.1f16);
74+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.nan.f16
75+
__nvvm_fmax_nan_f16(0.1f16, 0.1f16);
76+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fmax.ftz.nan.f16
77+
__nvvm_fmax_ftz_nan_f16(0.1f16, 0.1f16);
78+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.f16x2
79+
__nvvm_fmax_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
80+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.f16x2
81+
__nvvm_fmax_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
82+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.nan.f16x2
83+
__nvvm_fmax_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
84+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fmax.ftz.nan.f16x2
85+
__nvvm_fmax_ftz_nan_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
86+
#endif
87+
// CHECK: ret void
88+
}
89+
90+
// CHECK-LABEL: nvvm_fma_f16_f16x2_sm80
91+
__device__ void nvvm_fma_f16_f16x2_sm80() {
92+
#if __CUDA_ARCH__ >= 800
93+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.relu.f16
94+
__nvvm_fma_rn_relu_f16(0.1f16, 0.1f16, 0.1f16);
95+
// CHECK_PTX70_SM80: call half @llvm.nvvm.fma.rn.ftz.relu.f16
96+
__nvvm_fma_rn_ftz_relu_f16(0.1f16, 0.1f16, 0.1f16);
97+
98+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.relu.f16x2
99+
__nvvm_fma_rn_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
100+
{0.1f16, 0.7f16});
101+
// CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.fma.rn.ftz.relu.f16x2
102+
__nvvm_fma_rn_ftz_relu_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
103+
{0.1f16, 0.7f16});
104+
#endif
105+
// CHECK: ret void
106+
}
107+
108+
// CHECK-LABEL: nvvm_fma_f16_f16x2_sm53
109+
__device__ void nvvm_fma_f16_f16x2_sm53() {
110+
#if __CUDA_ARCH__ >= 530
111+
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.f16
112+
__nvvm_fma_rn_f16(0.1f16, 0.1f16, 0.1f16);
113+
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.f16
114+
__nvvm_fma_rn_ftz_f16(0.1f16, 0.1f16, 0.1f16);
115+
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.sat.f16
116+
__nvvm_fma_rn_sat_f16(0.1f16, 0.1f16, 0.1f16);
117+
// CHECK_PTX42_SM53: call half @llvm.nvvm.fma.rn.ftz.sat.f16
118+
__nvvm_fma_rn_ftz_sat_f16(0.1f16, 0.1f16, 0.1f16);
119+
120+
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.f16x2
121+
__nvvm_fma_rn_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
122+
{0.1f16, 0.7f16});
123+
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.f16x2
124+
__nvvm_fma_rn_ftz_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
125+
{0.1f16, 0.7f16});
126+
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.sat.f16x2
127+
__nvvm_fma_rn_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
128+
{0.1f16, 0.7f16});
129+
// CHECK_PTX42_SM53: call <2 x half> @llvm.nvvm.fma.rn.ftz.sat.f16x2
130+
__nvvm_fma_rn_ftz_sat_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16},
131+
{0.1f16, 0.7f16});
132+
#endif
133+
// CHECK: ret void
134+
}
135+
136+
// CHECK-LABEL: nvvm_min_max_sm86
137+
__device__ void nvvm_min_max_sm86() {
138+
#if __CUDA_ARCH__ >= 860
139+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.xorsign.abs.f16
140+
__nvvm_fmin_xorsign_abs_f16(0.1f16, 0.1f16);
141+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.xorsign.abs.f16
142+
__nvvm_fmin_ftz_xorsign_abs_f16(0.1f16, 0.1f16);
143+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.nan.xorsign.abs.f16
144+
__nvvm_fmin_nan_xorsign_abs_f16(0.1f16, 0.1f16);
145+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16
146+
__nvvm_fmin_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16);
147+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.xorsign.abs.f16x2
148+
__nvvm_fmin_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
149+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.xorsign.abs.f16x2
150+
__nvvm_fmin_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
151+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.nan.xorsign.abs.f16x2
152+
__nvvm_fmin_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
153+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f16x2
154+
__nvvm_fmin_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
155+
156+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.xorsign.abs.f16
157+
__nvvm_fmax_xorsign_abs_f16(0.1f16, 0.1f16);
158+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.xorsign.abs.f16
159+
__nvvm_fmax_ftz_xorsign_abs_f16(0.1f16, 0.1f16);
160+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.nan.xorsign.abs.f16
161+
__nvvm_fmax_nan_xorsign_abs_f16(0.1f16, 0.1f16);
162+
// CHECK_PTX72_SM86: call half @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16
163+
__nvvm_fmax_ftz_nan_xorsign_abs_f16(0.1f16, 0.1f16);
164+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.xorsign.abs.f16x2
165+
__nvvm_fmax_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
166+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.xorsign.abs.f16x2
167+
__nvvm_fmax_ftz_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
168+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.nan.xorsign.abs.f16x2
169+
__nvvm_fmax_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
170+
// CHECK_PTX72_SM86: call <2 x half> @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f16x2
171+
__nvvm_fmax_ftz_nan_xorsign_abs_f16x2({0.1f16, 0.7f16}, {0.1f16, 0.7f16});
172+
#endif
173+
// CHECK: ret void
174+
}

0 commit comments

Comments
 (0)