Skip to content

Commit 7e105fb

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_tanh_f32 on gfx1250 (#149360)
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
1 parent e68efed commit 7e105fb

27 files changed

+779
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
669669
TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
670670
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
671671

672+
TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts")
672673
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
673674
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
674675
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
503503
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
504504
return Builder.CreateCall(F, { Src });
505505
}
506+
case AMDGPU::BI__builtin_amdgcn_tanhf:
506507
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
507508
return emitBuiltinWithOneOverloadedType<1>(*this, E,
508509
Intrinsic::amdgcn_tanh);

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
109109
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
111+
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32"
112112

113113
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
114114

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ void test_s_wait_tensorcnt() {
4242
__builtin_amdgcn_s_wait_tensorcnt(0);
4343
}
4444

45+
// CHECK-LABEL: @test_tanh_f32(
46+
// CHECK-NEXT: entry:
47+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
48+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5)
49+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
50+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
51+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
52+
// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
53+
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4
54+
// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]])
55+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
56+
// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4
57+
// CHECK-NEXT: ret void
58+
//
59+
void test_tanh_f32(global float* out, float a)
60+
{
61+
*out = __builtin_amdgcn_tanhf(a);
62+
}
63+
4564
// CHECK-LABEL: @test_tanh_bf16(
4665
// CHECK-NEXT: entry:
4766
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
11181118
"Has v_bitop3_b32/v_bitop3_b16 instructions"
11191119
>;
11201120

1121+
def FeatureTanhInsts : SubtargetFeature<"tanh-insts",
1122+
"HasTanhInsts",
1123+
"true",
1124+
"Has v_tanh_f32/f16 instructions"
1125+
>;
1126+
11211127
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
11221128
"HasTransposeLoadF4F6Insts",
11231129
"true",
@@ -1979,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19791985
FeatureScalarDwordx3Loads,
19801986
FeatureDPPSrc1SGPR,
19811987
FeatureBitOp3Insts,
1988+
FeatureTanhInsts,
19821989
FeatureTransposeLoadF4F6Insts,
19831990
FeatureBF16TransInsts,
19841991
FeatureBF16ConversionInsts,
@@ -2703,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
27032710
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
27042711
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
27052712

2713+
def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">,
2714+
AssemblerPredicate<(all_of FeatureTanhInsts)>;
2715+
27062716
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
27072717
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
27082718

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
234234
bool HasRestrictedSOffset = false;
235235
bool Has64BitLiterals = false;
236236
bool HasBitOp3Insts = false;
237+
bool HasTanhInsts = false;
237238
bool HasTransposeLoadF4F6Insts = false;
238239
bool HasPrngInst = false;
239240
bool HasBVHDualAndBVH8Insts = false;
@@ -1380,6 +1381,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13801381
return HasMinimum3Maximum3F16;
13811382
}
13821383

1384+
bool hasTanhInsts() const { return HasTanhInsts; }
1385+
13831386
bool hasAddPC64Inst() const { return GFX1250Insts; }
13841387

13851388
bool hasMinimum3Maximum3PKF16() const {

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
366366
let TRANS = 1, SchedRW = [WriteTrans32] in {
367367
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
368368
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
369+
370+
let SubtargetPredicate = HasTanhInsts in
371+
defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", VOP_F32_F32, int_amdgcn_tanh>;
369372
} // End TRANS = 1, SchedRW = [WriteTrans32]
370373

371374
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
@@ -1138,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
11381141

11391142
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
11401143

1144+
defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
11411145
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
11421146
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
11431147
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
443443
Features["gfx1250-insts"] = true;
444444
Features["bitop3-insts"] = true;
445445
Features["prng-inst"] = true;
446+
Features["tanh-insts"] = true;
446447
Features["transpose-load-f4f6-insts"] = true;
447448
Features["bf16-trans-insts"] = true;
448449
Features["fp8-conversion-insts"] = true;

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,92 @@
77
; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select.
88
; FIXME: GlobalISel does not work with bf16
99

10+
declare float @llvm.amdgcn.tanh.f32(float) #0
1011
declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0
1112

13+
define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 {
14+
; SDAG-REAL16-LABEL: tanh_f32:
15+
; SDAG-REAL16: ; %bb.0:
16+
; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
17+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
18+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
19+
; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, s2
20+
; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1]
21+
; SDAG-REAL16-NEXT: s_endpgm
22+
;
23+
; SDAG-FAKE16-LABEL: tanh_f32:
24+
; SDAG-FAKE16: ; %bb.0:
25+
; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
26+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
27+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
28+
; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, s2
29+
; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
30+
; SDAG-FAKE16-NEXT: s_endpgm
31+
%tanh = call float @llvm.amdgcn.tanh.f32(float %src) #0
32+
store float %tanh, ptr addrspace(1) %out, align 4
33+
ret void
34+
}
35+
36+
; TODO: Really these should be constant folded
37+
define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 {
38+
; SDAG-REAL16-LABEL: tanh_f32_constant_4.0:
39+
; SDAG-REAL16: ; %bb.0:
40+
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
41+
; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 4.0
42+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
43+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
44+
; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1]
45+
; SDAG-REAL16-NEXT: s_endpgm
46+
;
47+
; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0:
48+
; SDAG-FAKE16: ; %bb.0:
49+
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
50+
; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 4.0
51+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
52+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
53+
; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
54+
; SDAG-FAKE16-NEXT: s_endpgm
55+
%tanh = call float @llvm.amdgcn.tanh.f32(float 4.0) #0
56+
store float %tanh, ptr addrspace(1) %out, align 4
57+
ret void
58+
}
59+
60+
define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 {
61+
; SDAG-REAL16-LABEL: tanh_f32_constant_100.0:
62+
; SDAG-REAL16: ; %bb.0:
63+
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
64+
; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 0x42c80000
65+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
66+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
67+
; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1]
68+
; SDAG-REAL16-NEXT: s_endpgm
69+
;
70+
; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0:
71+
; SDAG-FAKE16: ; %bb.0:
72+
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
73+
; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 0x42c80000
74+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
75+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
76+
; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1]
77+
; SDAG-FAKE16-NEXT: s_endpgm
78+
%tanh = call float @llvm.amdgcn.tanh.f32(float 100.0) #0
79+
store float %tanh, ptr addrspace(1) %out, align 4
80+
ret void
81+
}
82+
83+
define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 {
84+
; SDAG-REAL16-LABEL: tanh_undef_f32:
85+
; SDAG-REAL16: ; %bb.0:
86+
; SDAG-REAL16-NEXT: s_endpgm
87+
;
88+
; SDAG-FAKE16-LABEL: tanh_undef_f32:
89+
; SDAG-FAKE16: ; %bb.0:
90+
; SDAG-FAKE16-NEXT: s_endpgm
91+
%tanh = call float @llvm.amdgcn.tanh.f32(float undef)
92+
store float %tanh, ptr addrspace(1) %out, align 4
93+
ret void
94+
}
95+
1296
define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
1397
; SDAG-REAL16-LABEL: tanh_bf16:
1498
; SDAG-REAL16: ; %bb.0:

llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,51 @@ v_mov_b64 v[4:5], 0.5
2828
v_mov_b64 v[254:255], 0xaf123456
2929
// GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00]
3030

31+
v_tanh_f32 v5, v1
32+
// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e]
33+
34+
v_tanh_f32 v5, v255
35+
// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e]
36+
37+
v_tanh_f32 v5, s1
38+
// GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e]
39+
40+
v_tanh_f32 v5, s105
41+
// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e]
42+
43+
v_tanh_f32 v5, vcc_lo
44+
// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e]
45+
46+
v_tanh_f32 v5, vcc_hi
47+
// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e]
48+
49+
v_tanh_f32 v5, ttmp15
50+
// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e]
51+
52+
v_tanh_f32 v5, m0
53+
// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e]
54+
55+
v_tanh_f32 v5, exec_lo
56+
// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e]
57+
58+
v_tanh_f32 v5, exec_hi
59+
// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e]
60+
61+
v_tanh_f32 v5, null
62+
// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e]
63+
64+
v_tanh_f32 v5, -1
65+
// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e]
66+
67+
v_tanh_f32 v5, 0.5
68+
// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e]
69+
70+
v_tanh_f32 v5, src_scc
71+
// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e]
72+
73+
v_tanh_f32 v255, 0xaf123456
74+
// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
75+
3176
v_tanh_bf16 v5, v1
3277
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
3378

0 commit comments

Comments
 (0)