Skip to content

Commit 1247fdd

Browse files
[SimplifyCFG] Relax cttz cost check in simplifySwitchOfPowersOfTwo
We should be able to allow the `simplifySwitchOfPowersOfTwo` transform to take place, as, on recent X86 targets, the weighted latency-size cost of the `cttz` intrinsic appears to be 2. This favours computing trailing zeroes and indexing into a smaller value table over generating a jump table with an indirect branch, which overall should be more efficient.
1 parent c5972da commit 1247fdd

File tree

2 files changed

+45
-10
lines changed

2 files changed

+45
-10
lines changed

llvm/lib/Transforms/Utils/SimplifyCFG.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7198,8 +7198,10 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
71987198
/// will be transformed to:
71997199
/// switch (count_trailing_zeros(C)) { case 0: case 1: case 6: case 7: }
72007200
///
7201-
/// This transformation allows better lowering and could allow transforming into
7202-
/// a lookup table.
7201+
/// This transformation allows better lowering and may transform the switch
7202+
/// instruction into a sequence of bit manipulation and a smaller
7203+
/// log2(C)-indexed value table (instead of traditionally emitting a load of the
7204+
/// address of the jump target, and indirectly jumping to it).
72037205
static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
72047206
const DataLayout &DL,
72057207
const TargetTransformInfo &TTI) {
@@ -7211,17 +7213,15 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
72117213
!DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
72127214
return false;
72137215

7214-
const auto CttzIntrinsicCost = TTI.getIntrinsicInstrCost(
7215-
IntrinsicCostAttributes(Intrinsic::cttz, CondTy,
7216-
{Condition, ConstantInt::getTrue(Context)}),
7217-
TTI::TCK_SizeAndLatency);
7218-
7219-
if (CttzIntrinsicCost > TTI::TCC_Basic)
7220-
// Inserting intrinsic is too expensive.
7216+
// Ensure trailing zeroes count intrinsic emission is not too expensive.
7217+
IntrinsicCostAttributes Attrs(Intrinsic::cttz, CondTy,
7218+
{Condition, ConstantInt::getTrue(Context)});
7219+
if (TTI.getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) >
7220+
TTI::TCC_Basic * 2)
72217221
return false;
72227222

72237223
// Only bother with this optimization if there are more than 3 switch cases.
7224-
// SDAG will only bother creating jump tables for 4 or more cases.
7224+
// SDAG will start emitting jump tables for 4 or more cases.
72257225
if (SI->getNumCases() < 4)
72267226
return false;
72277227

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -passes='simplifycfg<switch-to-lookup>' -simplifycfg-require-and-preserve-domtree=1 -S < %s | FileCheck %s
3+
4+
target triple = "x86_64-unknown-linux-gnu"
5+
6+
define i32 @switch_of_powers_two(i32 %arg) {
7+
; CHECK-LABEL: define i32 @switch_of_powers_two(
8+
; CHECK-SAME: i32 [[ARG:%.*]]) {
9+
; CHECK-NEXT: [[ENTRY:.*:]]
10+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.cttz.i32(i32 [[ARG]], i1 true)
11+
; CHECK-NEXT: [[SWITCH_GEP:%.*]] = getelementptr inbounds [7 x i32], ptr @switch.table.switch_of_powers_two, i32 0, i32 [[TMP0]]
12+
; CHECK-NEXT: [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
13+
; CHECK-NEXT: ret i32 [[SWITCH_LOAD]]
14+
;
15+
entry:
16+
switch i32 %arg, label %default_case [
17+
i32 1, label %bb1
18+
i32 8, label %bb2
19+
i32 16, label %bb3
20+
i32 32, label %bb4
21+
i32 64, label %bb5
22+
]
23+
24+
25+
default_case: unreachable
26+
bb1: br label %return
27+
bb2: br label %return
28+
bb3: br label %return
29+
bb4: br label %return
30+
bb5: br label %return
31+
32+
return:
33+
%phi = phi i32 [ 3, %bb1 ], [ 2, %bb2 ], [ 1, %bb3 ], [ 0, %bb4 ], [ 42, %bb5 ]
34+
ret i32 %phi
35+
}

0 commit comments

Comments
 (0)