
[BOLT] Impute missing trace fall-through #145258

Open · wants to merge 3 commits into base: users/aaupov/spr/main.bolt-impute-missing-trace-fall-through
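
In short, as the diff below shows: the data aggregator gains an --impute-trace-fall-through option. When it is enabled, traces whose fall-through end was never recorded (Trace::BR_ONLY, i.e. LBR top-of-stack entries) get a synthesized end address: the count-weighted average fall-through length already accumulated for the same branch, or, for a branch only ever seen without a fall-through, a zero- or one-byte placeholder depending on whether the instruction at the trace start is a call/return/unconditional jump. To support this, Returns changes from an unordered_set to an unordered_map<uint64_t, bool> so that a shared memoizing helper (testAndSet) can back both the existing return check and the new unconditional-jump check. Small standalone sketches of these pieces follow the respective file diffs below.
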
6 changes: 6 additions & 0 deletions bolt/include/bolt/Core/MCPlusBuilder.h
@@ -430,6 +430,12 @@ class MCPlusBuilder {
return Analysis->isIndirectBranch(Inst);
}

bool IsUnconditionalJump(const MCInst &Inst) const {
const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
// barrier captures returns and unconditional branches
return Desc.isCall() || Desc.isBarrier();
}

/// Returns true if the instruction is memory indirect call or jump
virtual bool isBranchOnMem(const MCInst &Inst) const {
llvm_unreachable("not implemented");
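
A minimal standalone model of the new predicate follows (hypothetical ToyDesc type, not LLVM's MCInstrDesc). Despite the name, calls also satisfy the check, and "barrier" covers returns and unconditional branches, so among branch instructions only a conditional branch returns false and can fall through to the next address.

```cpp
#include <cassert>

struct ToyDesc {
  bool IsCall = false;
  bool IsBarrier = false; // true for returns and unconditional branches
};

static bool isUnconditionalJump(const ToyDesc &D) {
  // Mirrors IsUnconditionalJump() above: the instruction cannot fall through
  // to the next address if it is a call or a barrier.
  return D.IsCall || D.IsBarrier;
}

int main() {
  assert(!isUnconditionalJump({false, false})); // conditional branch, e.g. je
  assert(isUnconditionalJump({false, true}));   // ret or jmp (barrier)
  assert(isUnconditionalJump({true, false}));   // call
  return 0;
}
```
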
43 changes: 42 additions & 1 deletion bolt/include/bolt/Profile/DataAggregator.h
@@ -137,7 +137,7 @@ class DataAggregator : public DataReader {
std::vector<std::pair<Trace, TakenBranchInfo>> Traces;
/// Pre-populated addresses of returns, coming from pre-aggregated data or
/// disassembly. Used to disambiguate call-continuation fall-throughs.
std::unordered_set<uint64_t> Returns;
std::unordered_map<uint64_t, bool> Returns;
std::unordered_map<uint64_t, uint64_t> BasicSamples;
std::vector<PerfMemSample> MemSamples;

@@ -499,6 +499,10 @@ class DataAggregator : public DataReader {
/// If \p FileBuildID has no match, then issue an error and exit.
void processFileBuildID(StringRef FileBuildID);

/// Infer missing fall-throughs for branch-only traces (LBR top-of-stack
/// entries).
void imputeFallThroughs();

/// Debugging dump methods
void dump() const;
void dump(const PerfBranchSample &Sample) const;
@@ -510,6 +514,43 @@
void printBasicSamplesDiagnostics(uint64_t OutOfRangeSamples) const;
void printBranchStacksDiagnostics(uint64_t IgnoredSamples) const;

/// Get instruction at \p Addr either from containing binary function or
/// disassemble in-place, and invoke \p Callback on resulting MCInst.
/// Returns the result of the callback or nullopt.
template <typename T>
std::optional<T>
testInstructionAt(const uint64_t Addr,
std::function<T(const MCInst &)> Callback) const {
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
if (!Func)
return std::nullopt;
const uint64_t Offset = Addr - Func->getAddress();
if (Func->hasInstructions()) {
if (auto *MI = Func->getInstructionAtOffset(Offset))
return Callback(*MI);
} else {
if (auto MI = Func->disassembleInstructionAtOffset(Offset))
return Callback(*MI);
}
return std::nullopt;
}

/// Apply \p Callback to the instruction at \p Addr, and memoize the result
/// in a \p Map.
template <typename T>
std::optional<T> testAndSet(const uint64_t Addr,
std::function<T(const MCInst &)> Callback,
std::unordered_map<uint64_t, T> &Map) {
auto It = Map.find(Addr);
if (It != Map.end())
return It->second;
if (std::optional<T> Res = testInstructionAt<T>(Addr, Callback)) {
Map.emplace(Addr, *Res);
return *Res;
}
return std::nullopt;
}

public:
/// If perf.data was collected without build ids, the buildid-list may contain
/// incomplete entries. Return true if the buffer containing
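
A rough standalone analogue of the memoization these two helpers implement (hypothetical ToyInst and decodeAt stand in for MCInst and testInstructionAt's function-or-disassembly lookup): the first query for an address decodes the instruction and caches the callback's answer, later queries are plain hash lookups, and addresses that cannot be decoded stay uncached and yield nullopt, which is why the callers in DataAggregator.cpp use value_or().

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <unordered_map>

struct ToyInst {
  bool IsReturn = false;
};

// Stand-in for testInstructionAt()'s "find or disassemble" step.
std::optional<ToyInst> decodeAt(uint64_t Addr) {
  std::cout << "decoding 0x" << std::hex << Addr << "\n"; // once per address
  if (Addr == 0x401000)
    return ToyInst{true};
  return std::nullopt;
}

template <typename T>
std::optional<T> testAndSet(uint64_t Addr,
                            std::function<T(const ToyInst &)> Callback,
                            std::unordered_map<uint64_t, T> &Map) {
  if (auto It = Map.find(Addr); It != Map.end())
    return It->second; // cache hit: no re-decoding
  if (std::optional<ToyInst> MI = decodeAt(Addr)) {
    T Res = Callback(*MI);
    Map.emplace(Addr, Res); // memoize for later traces touching this address
    return Res;
  }
  return std::nullopt; // address not decodable: caller picks a default
}

int main() {
  std::unordered_map<uint64_t, bool> Returns;
  auto isReturn = [](const ToyInst &MI) { return MI.IsReturn; };
  // The second query never calls decodeAt() again.
  bool First = testAndSet<bool>(0x401000, isReturn, Returns).value_or(false);
  bool Second = testAndSet<bool>(0x401000, isReturn, Returns).value_or(false);
  std::cout << First << Second << "\n"; // prints "11"
  return 0;
}
```
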
75 changes: 57 additions & 18 deletions bolt/lib/Profile/DataAggregator.cpp
@@ -77,6 +77,11 @@ FilterPID("pid",
cl::Optional,
cl::cat(AggregatorCategory));

static cl::opt<bool> ImputeTraceFallthrough(
"impute-trace-fall-through",
cl::desc("impute missing fall-throughs for branch-only traces"),
cl::Optional, cl::cat(AggregatorCategory));

static cl::opt<bool>
IgnoreBuildID("ignore-build-id",
cl::desc("continue even if build-ids in input binary and perf.data mismatch"),
@@ -529,6 +534,49 @@ void DataAggregator::parsePerfData(BinaryContext &BC) {
deleteTempFiles();
}

void DataAggregator::imputeFallThroughs() {
if (Traces.empty())
return;

std::pair PrevBranch(Trace::EXTERNAL, Trace::EXTERNAL);
uint64_t AggregateCount = 0;
uint64_t AggregateFallthroughSize = 0;
uint64_t InferredTraces = 0;

// Helper map with whether the instruction is a call/ret/unconditional branch
std::unordered_map<uint64_t, bool> IsUncondJumpMap;
auto checkUncondJump = [&](const uint64_t Addr) {
auto isUncondJump = [&](const MCInst &MI) -> bool {
return BC->MIB->IsUnconditionalJump(MI);
};
return testAndSet<bool>(Addr, isUncondJump, IsUncondJumpMap).value_or(true);
};

for (auto &[Trace, Info] : Traces) {
if (Trace.From == Trace::EXTERNAL)
continue;
std::pair CurrentBranch(Trace.Branch, Trace.From);
if (Trace.To == Trace::BR_ONLY) {
uint64_t InferredBytes = PrevBranch == CurrentBranch
? AggregateFallthroughSize / AggregateCount
: !checkUncondJump(Trace.From);
Trace.To = Trace.From + InferredBytes;
LLVM_DEBUG(dbgs() << "imputed " << Trace << " (" << InferredBytes
<< " bytes)\n");
++InferredTraces;
} else {
if (CurrentBranch != PrevBranch)
AggregateCount = AggregateFallthroughSize = 0;
if (Trace.To != Trace::EXTERNAL)
AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount;
AggregateCount += Info.TakenCount;
}
PrevBranch = CurrentBranch;
}
if (opts::Verbosity >= 1)
outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n";
}

Error DataAggregator::preprocessProfile(BinaryContext &BC) {
this->BC = &BC;

@@ -541,6 +589,9 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
// Sort parsed traces for faster processing.
llvm::sort(Traces, llvm::less_first());

if (opts::ImputeTraceFallthrough)
imputeFallThroughs();

if (opts::HeatmapMode) {
if (std::error_code EC = printLBRHeatMap())
return errorCodeToError(EC);
@@ -742,22 +793,10 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
}

bool DataAggregator::checkReturn(uint64_t Addr) {
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
if (llvm::is_contained(Returns, Addr))
return true;

BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
if (!Func)
return false;

const uint64_t Offset = Addr - Func->getAddress();
if (Func->hasInstructions()
? isReturn(Func->getInstructionAtOffset(Offset))
: isReturn(Func->disassembleInstructionAtOffset(Offset))) {
Returns.emplace(Addr);
return true;
}
return false;
auto isReturn = [&](const MCInst &MI) -> bool {
return BC->MIB->isReturn(MI);
};
return testAndSet<bool>(Addr, isReturn, Returns).value_or(false);
}

bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
@@ -1347,7 +1386,7 @@ std::error_code DataAggregator::parseAggregatedLBREntry() {
if (!Addr[0]->Offset)
Addr[0]->Offset = Trace::FT_EXTERNAL_RETURN;
else
Returns.emplace(Addr[0]->Offset);
Returns.emplace(Addr[0]->Offset, true);
}

/// Record a trace.
@@ -1608,7 +1647,7 @@ void DataAggregator::processBranchEvents() {
NamedRegionTimer T("processBranch", "Processing branch events",
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);

Returns.emplace(Trace::FT_EXTERNAL_RETURN);
Returns.emplace(Trace::FT_EXTERNAL_RETURN, true);
for (const auto &[Trace, Info] : Traces) {
bool IsReturn = checkReturn(Trace.Branch);
// Ignore returns.
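
To make the heuristic concrete, here is a toy walkthrough of the imputeFallThroughs() logic on made-up data (hypothetical ToyTrace and cannotFallThrough, not BOLT's Trace/TakenBranchInfo or checkUncondJump). Traces are assumed sorted by (Branch, From, To) with the branch-only entries last, as in the data below: a branch-only trace that follows measured traces for the same branch gets their count-weighted average fall-through length, while a branch seen only without a fall-through gets a zero- or one-byte placeholder depending on whether the instruction at the trace start is a call/return/unconditional jump.

```cpp
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

constexpr uint64_t BR_ONLY = ~0ULL; // fall-through end was not recorded

struct ToyTrace {
  uint64_t Branch, From, To;
  uint64_t TakenCount;
};

// Stand-in for checkUncondJump(): true if the instruction at Addr is a
// call/return/unconditional branch and thus cannot start a fall-through.
bool cannotFallThrough(uint64_t Addr) { return Addr == 0x2000; }

int main() {
  // Sorted by (Branch, From, To); branch-only entries come last here.
  std::vector<ToyTrace> Traces = {
      {0x1100, 0x1000, 0x1010, 8},  // measured fall-through: 16 bytes
      {0x1100, 0x1000, 0x1020, 8},  // measured fall-through: 32 bytes
      {0x1100, 0x1000, BR_ONLY, 4}, // same branch: impute the average, 24 bytes
      {0x2100, 0x2000, BR_ONLY, 4}, // branch never measured: 0/1-byte fallback
  };

  std::pair<uint64_t, uint64_t> Prev{~0ULL, ~0ULL};
  uint64_t Count = 0, Size = 0;
  for (ToyTrace &T : Traces) {
    std::pair<uint64_t, uint64_t> Cur{T.Branch, T.From};
    if (T.To == BR_ONLY) {
      uint64_t Inferred =
          Prev == Cur ? Size / Count : !cannotFallThrough(T.From);
      T.To = T.From + Inferred;
      std::cout << std::hex << "imputed 0x" << T.From << "-0x" << T.To << "\n";
    } else {
      if (Cur != Prev)
        Count = Size = 0;
      Size += (T.To - T.From) * T.TakenCount; // weight by trace count
      Count += T.TakenCount;
    }
    Prev = Cur;
  }
  // Prints: imputed 0x1000-0x1018 (24-byte average) and 0x2000-0x2000.
  return 0;
}
```
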