Skip to content

Commit 9fed480

Browse files
authored
[BOLT] Explicitly check for returns when extending call continuation profile (#143295)
Call continuation logic relies on assumptions about fall-through origin: - the branch is external to the function, - fall-through start is at the beginning of the block, - the block is not an entry point or a landing pad. Leverage trace information to explicitly check whether the origin is a return instruction, and defer to checks above only in case of DSO-external branch source. This covers both regular and BAT cases, addressing call continuation fall-through undercounting in the latter mode, which improves BAT profile quality metrics. For example, for one large binary: - CFG discontinuity 21.83% -> 0.00%, - CFG flow imbalance 10.77%/100.00% -> 3.40%/13.82% (weighted/worst) - CG flow imbalance 8.49% —> 8.49%. Depends on #143289. Test Plan: updated callcont-fallthru.s
1 parent 816ab1a commit 9fed480

File tree

3 files changed

+90
-65
lines changed

3 files changed

+90
-65
lines changed

bolt/include/bolt/Profile/DataAggregator.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ class DataAggregator : public DataReader {
132132
/// and use them later for processing and assigning profile.
133133
std::unordered_map<Trace, TakenBranchInfo, TraceHash> TraceMap;
134134
std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
135+
/// Pre-populated addresses of returns, coming from pre-aggregated data or
136+
/// disassembly. Used to disambiguate call-continuation fall-throughs.
137+
std::unordered_set<uint64_t> Returns;
135138
std::unordered_map<uint64_t, uint64_t> BasicSamples;
136139
std::vector<PerfMemSample> MemSamples;
137140

@@ -204,8 +207,8 @@ class DataAggregator : public DataReader {
204207
/// Return a vector of offsets corresponding to a trace in a function
205208
/// if the trace is valid, std::nullopt otherwise.
206209
std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
207-
getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
208-
uint64_t Count) const;
210+
getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count,
211+
bool IsReturn) const;
209212

210213
/// Record external entry into the function \p BF.
211214
///
@@ -265,11 +268,14 @@ class DataAggregator : public DataReader {
265268
uint64_t From, uint64_t To, uint64_t Count,
266269
uint64_t Mispreds);
267270

271+
/// Checks if \p Addr corresponds to a return instruction.
272+
bool checkReturn(uint64_t Addr);
273+
268274
/// Register a \p Branch.
269275
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
270276

271277
/// Register a trace between two LBR entries supplied in execution order.
272-
bool doTrace(const Trace &Trace, uint64_t Count);
278+
bool doTrace(const Trace &Trace, uint64_t Count, bool IsReturn);
273279

274280
/// Parser helpers
275281
/// Return false if we exhausted our parser buffer and finished parsing

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 41 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -730,50 +730,54 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
730730
return true;
731731
}
732732

733+
bool DataAggregator::checkReturn(uint64_t Addr) {
734+
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
735+
if (llvm::is_contained(Returns, Addr))
736+
return true;
737+
738+
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
739+
if (!Func)
740+
return false;
741+
742+
const uint64_t Offset = Addr - Func->getAddress();
743+
if (Func->hasInstructions()
744+
? isReturn(Func->getInstructionAtOffset(Offset))
745+
: isReturn(Func->disassembleInstructionAtOffset(Offset))) {
746+
Returns.emplace(Addr);
747+
return true;
748+
}
749+
return false;
750+
}
751+
733752
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
734753
uint64_t Mispreds) {
735-
// Returns whether \p Offset in \p Func contains a return instruction.
736-
auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
737-
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
738-
return Func.hasInstructions()
739-
? isReturn(Func.getInstructionAtOffset(Offset))
740-
: isReturn(Func.disassembleInstructionAtOffset(Offset));
741-
};
742-
743754
// Mutates \p Addr to an offset into the containing function, performing BAT
744755
// offset translation and parent lookup.
745756
//
746-
// Returns the containing function (or BAT parent) and whether the address
747-
// corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
757+
// Returns the containing function (or BAT parent).
748758
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
749759
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
750760
if (!Func) {
751761
Addr = 0;
752-
return std::pair{Func, false};
762+
return Func;
753763
}
754764

755765
Addr -= Func->getAddress();
756766

757-
bool IsRet = IsFrom && checkReturn(*Func, Addr);
758-
759767
if (BAT)
760768
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
761769

762770
if (BinaryFunction *ParentFunc = getBATParentFunction(*Func))
763-
Func = ParentFunc;
771+
return ParentFunc;
764772

765-
return std::pair{Func, IsRet};
773+
return Func;
766774
};
767775

768-
auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
769-
auto [ToFunc, _] = handleAddress(To, /*IsFrom*/ false);
776+
BinaryFunction *FromFunc = handleAddress(From, /*IsFrom*/ true);
777+
BinaryFunction *ToFunc = handleAddress(To, /*IsFrom*/ false);
770778
if (!FromFunc && !ToFunc)
771779
return false;
772780

773-
// Ignore returns.
774-
if (IsReturn)
775-
return true;
776-
777781
// Treat recursive control transfers as inter-branches.
778782
if (FromFunc == ToFunc && To != 0) {
779783
recordBranch(*FromFunc, From, To, Count, Mispreds);
@@ -783,7 +787,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
783787
return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds);
784788
}
785789

786-
bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
790+
bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count,
791+
bool IsReturn) {
787792
const uint64_t From = Trace.From, To = Trace.To;
788793
BinaryFunction *FromFunc = getBinaryFunctionContainingAddress(From);
789794
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(To);
@@ -808,8 +813,8 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
808813
const uint64_t FuncAddress = FromFunc->getAddress();
809814
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
810815
BAT && BAT->isBATFunction(FuncAddress)
811-
? BAT->getFallthroughsInTrace(FuncAddress, From, To)
812-
: getFallthroughsInTrace(*FromFunc, Trace, Count);
816+
? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To)
817+
: getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn);
813818
if (!FTs) {
814819
LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n');
815820
NumInvalidTraces += Count;
@@ -831,7 +836,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count) {
831836

832837
std::optional<SmallVector<std::pair<uint64_t, uint64_t>, 16>>
833838
DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
834-
uint64_t Count) const {
839+
uint64_t Count, bool IsReturn) const {
835840
SmallVector<std::pair<uint64_t, uint64_t>, 16> Branches;
836841

837842
BinaryContext &BC = BF.getBinaryContext();
@@ -865,9 +870,13 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace,
865870

866871
// Adjust FromBB if the first LBR is a return from the last instruction in
867872
// the previous block (that instruction should be a call).
868-
if (Trace.Branch != Trace::FT_ONLY && !BF.containsAddress(Trace.Branch) &&
869-
From == FromBB->getOffset() && !FromBB->isEntryPoint() &&
870-
!FromBB->isLandingPad()) {
873+
if (IsReturn) {
874+
if (From)
875+
FromBB = BF.getBasicBlockContainingOffset(From - 1);
876+
else
877+
LLVM_DEBUG(dbgs() << "return to the function start: " << Trace << '\n');
878+
} else if (Trace.Branch == Trace::EXTERNAL && From == FromBB->getOffset() &&
879+
!FromBB->isEntryPoint() && !FromBB->isLandingPad()) {
871880
const BinaryBasicBlock *PrevBB =
872881
BF.getLayout().getBlock(FromBB->getIndex() - 1);
873882
if (PrevBB->getSuccessor(FromBB->getLabel())) {
@@ -1557,11 +1566,13 @@ void DataAggregator::processBranchEvents() {
15571566
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
15581567

15591568
for (const auto &[Trace, Info] : Traces) {
1560-
if (Trace.Branch != Trace::FT_ONLY &&
1569+
bool IsReturn = checkReturn(Trace.Branch);
1570+
// Ignore returns.
1571+
if (!IsReturn && Trace.Branch != Trace::FT_ONLY &&
15611572
Trace.Branch != Trace::FT_EXTERNAL_ORIGIN)
15621573
doBranch(Trace.Branch, Trace.From, Info.TakenCount, Info.MispredCount);
15631574
if (Trace.To != Trace::BR_ONLY)
1564-
doTrace(Trace, Info.TakenCount);
1575+
doTrace(Trace, Info.TakenCount, IsReturn);
15651576
}
15661577
printBranchSamplesDiagnostics();
15671578
}

bolt/test/X86/callcont-fallthru.s

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,29 +4,43 @@
44
# RUN: %clang %cflags -fpic -shared -xc /dev/null -o %t.so
55
## Link against a DSO to ensure PLT entries.
66
# RUN: %clangxx %cxxflags %s %t.so -o %t -Wl,-q -nostdlib
7-
# RUN: link_fdata %s %t %t.pat PREAGGT1
8-
# RUN: link_fdata %s %t %t.pat2 PREAGGT2
9-
# RUN-DISABLED: link_fdata %s %t %t.patplt PREAGGPLT
7+
# Trace to a call continuation, not a landing pad/entry point
8+
# RUN: link_fdata %s %t %t.pa-base PREAGG-BASE
9+
# Trace from a return to a landing pad/entry point call continuation
10+
# RUN: link_fdata %s %t %t.pa-ret PREAGG-RET
11+
# Trace from an external location to a landing pad/entry point call continuation
12+
# RUN: link_fdata %s %t %t.pa-ext PREAGG-EXT
13+
# RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT
1014

1115
# RUN: llvm-strip --strip-unneeded %t -o %t.strip
1216
# RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh
1317

1418
## Check pre-aggregated traces attach call continuation fallthrough count
15-
# RUN: llvm-bolt %t.noeh --pa -p %t.pat -o %t.out \
16-
# RUN: --print-cfg --print-only=main | FileCheck %s
17-
18-
## Check pre-aggregated traces don't attach call continuation fallthrough count
19-
## to secondary entry point (unstripped)
20-
# RUN: llvm-bolt %t --pa -p %t.pat2 -o %t.out \
21-
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
22-
## Check pre-aggregated traces don't attach call continuation fallthrough count
23-
## to landing pad (stripped, LP)
24-
# RUN: llvm-bolt %t.strip --pa -p %t.pat2 -o %t.out \
25-
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
19+
## in the basic case (not an entry point, not a landing pad).
20+
# RUN: llvm-bolt %t.noeh --pa -p %t.pa-base -o %t.out \
21+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-BASE
22+
23+
## Check pre-aggregated traces from a return attach call continuation
24+
## fallthrough count to secondary entry point (unstripped)
25+
# RUN: llvm-bolt %t --pa -p %t.pa-ret -o %t.out \
26+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
27+
## Check pre-aggregated traces from a return attach call continuation
28+
## fallthrough count to landing pad (stripped, landing pad)
29+
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ret -o %t.out \
30+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-ATTACH
31+
32+
## Check pre-aggregated traces from external location don't attach call
33+
## continuation fallthrough count to secondary entry point (unstripped)
34+
# RUN: llvm-bolt %t --pa -p %t.pa-ext -o %t.out \
35+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
36+
## Check pre-aggregated traces from external location don't attach call
37+
## continuation fallthrough count to landing pad (stripped, landing pad)
38+
# RUN: llvm-bolt %t.strip --pa -p %t.pa-ext -o %t.out \
39+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK-SKIP
2640

2741
## Check pre-aggregated traces don't report zero-sized PLT fall-through as
2842
## invalid trace
29-
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.patplt -o %t.out | FileCheck %s \
43+
# RUN-DISABLED: llvm-bolt %t.strip --pa -p %t.pa-plt -o %t.out | FileCheck %s \
3044
# RUN-DISABLED: --check-prefix=CHECK-PLT
3145
# CHECK-PLT: traces mismatching disassembled function contents: 0
3246

@@ -56,11 +70,11 @@ main:
5670
Ltmp0_br:
5771
callq puts@PLT
5872
## Check PLT traces are accepted
59-
# PREAGGPLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
73+
# PREAGG-PLT: T #Ltmp0_br# #puts@plt# #puts@plt# 3
6074
## Target is an external-origin call continuation
61-
# PREAGGT1: T X:0 #Ltmp1# #Ltmp4_br# 2
62-
# CHECK: callq puts@PLT
63-
# CHECK-NEXT: count: 2
75+
# PREAGG-BASE: T X:0 #Ltmp1# #Ltmp4_br# 2
76+
# CHECK-BASE: callq puts@PLT
77+
# CHECK-BASE-NEXT: count: 2
6478

6579
Ltmp1:
6680
movq -0x10(%rbp), %rax
@@ -71,24 +85,18 @@ Ltmp4:
7185
cmpl $0x0, -0x14(%rbp)
7286
Ltmp4_br:
7387
je Ltmp0
74-
# CHECK2: je .Ltmp0
75-
# CHECK2-NEXT: count: 3
7688

7789
movl $0xa, -0x18(%rbp)
7890
callq foo
7991
## Target is a binary-local call continuation
80-
# PREAGGT1: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
81-
# CHECK: callq foo
82-
# CHECK-NEXT: count: 1
83-
84-
## PLT call continuation fallthrough spanning the call
85-
# CHECK2: callq foo
86-
# CHECK2-NEXT: count: 3
87-
92+
# PREAGG-RET: T #Lfoo_ret# #Ltmp3# #Ltmp3_br# 1
8893
## Target is a secondary entry point (unstripped) or a landing pad (stripped)
89-
# PREAGGT2: T X:0 #Ltmp3# #Ltmp3_br# 2
90-
# CHECK3: callq foo
91-
# CHECK3-NEXT: count: 0
94+
# PREAGG-EXT: T X:0 #Ltmp3# #Ltmp3_br# 1
95+
96+
# CHECK-ATTACH: callq foo
97+
# CHECK-ATTACH-NEXT: count: 1
98+
# CHECK-SKIP: callq foo
99+
# CHECK-SKIP-NEXT: count: 0
92100

93101
Ltmp3:
94102
cmpl $0x0, -0x18(%rbp)

0 commit comments

Comments
 (0)