Skip to content

Commit 4f676c2

Browse files
author
Walter Erquinigo
committedJul 26, 2022
[trace][intel pt] Introduce wall clock time for each trace item
- Decouple TSCs from trace items - Turn TSCs into events just like CPUs. The new name is HW clock tick, which could be reused by other vendors. - Add a GetWallTime that returns the wall time that the trace plug-in can infer for each trace item. - For intel pt, we are doing the following interpolation: if an instruction takes less than 1 TSC, we use that duration, otherwise, we assume the instruction took 1 TSC. This helps us avoid having to handle context switches, changes to kernel, idle times, decoding errors, etc. We are just trying to show some approximation and not the real data. For the real data, TSCs are the way to go. Besides that, we are making sure that no two trace items will give the same interpolation value. Finally, we are using as time 0 the time at which tracing started. Sample output: ``` (lldb) r Process 750047 launched: '/home/wallace/a.out' (x86_64) Process 750047 stopped * thread #1, name = 'a.out', stop reason = breakpoint 1.1 frame #0: 0x0000000000402479 a.out`main at main.cpp:29:20 26 }; 27 28 int main() { -> 29 std::vector<int> vvv; 30 for (int i = 0; i < 100; i++) 31 vvv.push_back(i); 32 (lldb) process trace start -s 64kb -t --per-cpu (lldb) b 60 Breakpoint 2: where = a.out`main + 1689 at main.cpp:60:23, address = 0x0000000000402afe (lldb) c Process 750047 resuming Process 750047 stopped * thread #1, name = 'a.out', stop reason = breakpoint 2.1 frame #0: 0x0000000000402afe a.out`main at main.cpp:60:23 57 map<int, int> m; 58 m[3] = 4; 59 -> 60 map<string, string> m2; 61 m2["5"] = "6"; 62 63 std::vector<std::string> vs = {"2", "3"}; (lldb) thread trace dump instructions -t -f -e thread #1: tid = 750047 0: [379567.000 ns] (event) HW clock tick [48599428476224707] 1: [379569.000 ns] (event) CPU core changed [new CPU=2] 2: [390487.000 ns] (event) HW clock tick [48599428476246495] 3: [1602508.000 ns] (event) HW clock tick [48599428478664855] 4: [1662745.000 ns] (event) HW clock tick [48599428478785046] libc.so.6`malloc 5: 
[1662746.995 ns] 0x00007ffff7176660 endbr64 6: [1662748.991 ns] 0x00007ffff7176664 movq 0x32387d(%rip), %rax ; + 408 7: [1662750.986 ns] 0x00007ffff717666b pushq %r12 8: [1662752.981 ns] 0x00007ffff717666d pushq %rbp 9: [1662754.977 ns] 0x00007ffff717666e pushq %rbx 10: [1662756.972 ns] 0x00007ffff717666f movq (%rax), %rax 11: [1662758.967 ns] 0x00007ffff7176672 testq %rax, %rax 12: [1662760.963 ns] 0x00007ffff7176675 jne 0x9c7e0 ; <+384> 13: [1662762.958 ns] 0x00007ffff717667b leaq 0x17(%rdi), %rax 14: [1662764.953 ns] 0x00007ffff717667f cmpq $0x1f, %rax 15: [1662766.949 ns] 0x00007ffff7176683 ja 0x9c730 ; <+208> 16: [1662768.944 ns] 0x00007ffff7176730 andq $-0x10, %rax 17: [1662770.939 ns] 0x00007ffff7176734 cmpq $-0x41, %rax 18: [1662772.935 ns] 0x00007ffff7176738 seta %dl 19: [1662774.930 ns] 0x00007ffff717673b jmp 0x9c690 ; <+48> 20: [1662776.925 ns] 0x00007ffff7176690 cmpq %rdi, %rax 21: [1662778.921 ns] 0x00007ffff7176693 jb 0x9c7b0 ; <+336> 22: [1662780.916 ns] 0x00007ffff7176699 testb %dl, %dl 23: [1662782.911 ns] 0x00007ffff717669b jne 0x9c7b0 ; <+336> 24: [1662784.906 ns] 0x00007ffff71766a1 movq 0x3236c0(%rip), %r12 ; + 24 (lldb) thread trace dump instructions -t -f -e -J -c 4 [ { "id": 0, "timestamp_ns": "379567.000000", "event": "HW clock tick", "hwClock": 48599428476224707 }, { "id": 1, "timestamp_ns": "379569.000000", "event": "CPU core changed", "cpuId": 2 }, { "id": 2, "timestamp_ns": "390487.000000", "event": "HW clock tick", "hwClock": 48599428476246495 }, { "id": 3, "timestamp_ns": "1602508.000000", "event": "HW clock tick", "hwClock": 48599428478664855 }, { "id": 4, "timestamp_ns": "1662745.000000", "event": "HW clock tick", "hwClock": 48599428478785046 }, { "id": 5, "timestamp_ns": "1662746.995324", "loadAddress": "0x7ffff7176660", "module": "libc.so.6", "symbol": "malloc", "mnemonic": "endbr64" }, { "id": 6, "timestamp_ns": "1662748.990648", "loadAddress": "0x7ffff7176664", "module": "libc.so.6", "symbol": "malloc", "mnemonic": "movq" }, { 
"id": 7, "timestamp_ns": "1662750.985972", "loadAddress": "0x7ffff717666b", "module": "libc.so.6", "symbol": "malloc", "mnemonic": "pushq" }, { "id": 8, "timestamp_ns": "1662752.981296", "loadAddress": "0x7ffff717666d", "module": "libc.so.6", "symbol": "malloc", "mnemonic": "pushq" } ] ``` Differential Revision: https://reviews.llvm.org/D130054
1 parent dcd0946 commit 4f676c2

23 files changed

+606
-291
lines changed
 

‎lldb/include/lldb/Target/TraceCursor.h

+22-14
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ class TraceCursor {
215215
/// of this cursor.
216216
ExecutionContextRef &GetExecutionContextRef();
217217

218-
/// Instruction, event or error information
218+
/// Trace item information (instructions, errors and events)
219219
/// \{
220220

221221
/// \return
@@ -255,27 +255,35 @@ class TraceCursor {
255255
/// The load address of the instruction the cursor is pointing at.
256256
virtual lldb::addr_t GetLoadAddress() const = 0;
257257

258-
/// Get the hardware counter of a given type associated with the current
259-
/// instruction. Each architecture might support different counters. It might
260-
/// happen that only some instructions of an entire trace have a given counter
261-
/// associated with them.
262-
///
263-
/// \param[in] counter_type
264-
/// The counter type.
265-
/// \return
266-
/// The value of the counter or \b llvm::None if not available.
267-
virtual llvm::Optional<uint64_t>
268-
GetCounter(lldb::TraceCounter counter_type) const = 0;
269-
270258
/// Get the CPU associated with the current trace item.
271259
///
272260
/// This call might not be O(1), so it's suggested to invoke this method
273-
/// whenever a cpu change event is fired.
261+
/// whenever an eTraceEventCPUChanged event is fired.
274262
///
275263
/// \return
276264
/// The requested CPU id, or \a llvm::None if this information is
277265
/// not available for the current item.
278266
virtual llvm::Optional<lldb::cpu_id_t> GetCPU() const = 0;
267+
268+
/// Get the last hardware clock value that was emitted before the current
269+
/// trace item.
270+
///
271+
/// This call might not be O(1), so it's suggested to invoke this method
272+
/// whenever an eTraceEventHWClockTick event is fired.
273+
///
274+
/// \return
275+
/// The requested HW clock value, or \a llvm::None if this information is
276+
/// not available for the current item.
277+
virtual llvm::Optional<uint64_t> GetHWClock() const = 0;
278+
279+
/// Get the approximate wall clock time in nanoseconds at which the current
280+
/// trace item was executed. Each trace plug-in has a different definition for
281+
/// what time 0 means.
282+
///
283+
/// \return
284+
/// The approximate wall clock time for the trace item, or \a llvm::None
285+
/// if not available.
286+
virtual llvm::Optional<double> GetWallClockTime() const = 0;
279287
/// \}
280288

281289
protected:

‎lldb/include/lldb/Target/TraceDumper.h

+4-3
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,9 @@ struct TraceDumperOptions {
2929
bool json = false;
3030
/// When dumping in JSON format, pretty print the output.
3131
bool pretty_print_json = false;
32-
/// For each instruction, print the corresponding timestamp counter if
32+
/// For each trace item, print the corresponding timestamp in nanoseconds if
3333
/// available.
34-
bool show_tsc = false;
34+
bool show_timestamps = false;
3535
/// Dump the events that happened between instructions.
3636
bool show_events = false;
3737
/// For each instruction, print the instruction kind.
@@ -61,7 +61,8 @@ class TraceDumper {
6161
struct TraceItem {
6262
lldb::user_id_t id;
6363
lldb::addr_t load_address;
64-
llvm::Optional<uint64_t> tsc;
64+
llvm::Optional<double> timestamp;
65+
llvm::Optional<uint64_t> hw_clock;
6566
llvm::Optional<llvm::StringRef> error;
6667
llvm::Optional<lldb::TraceEvent> event;
6768
llvm::Optional<SymbolInfo> symbol_info;

‎lldb/include/lldb/Utility/TraceIntelPTGDBRemotePackets.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,8 @@ struct LinuxPerfZeroTscConversion {
9191
/// nanoseconds) is defined by the kernel at boot time and has no particularly
9292
/// useful meaning. On the other hand, this value is constant for an entire
9393
/// trace session.
94-
// See 'time_zero' section of
95-
// https://man7.org/linux/man-pages/man2/perf_event_open.2.html
94+
/// See 'time_zero' section of
95+
/// https://man7.org/linux/man-pages/man2/perf_event_open.2.html
9696
///
9797
/// \param[in] tsc
9898
/// The TSC value to be converted.

‎lldb/include/lldb/lldb-enumerations.h

+2-6
Original file line numberDiff line numberDiff line change
@@ -1159,12 +1159,6 @@ enum SaveCoreStyle {
11591159
eSaveCoreStackOnly = 3,
11601160
};
11611161

1162-
// Type of counter values associated with instructions in a trace.
1163-
enum TraceCounter {
1164-
// Timestamp counter, like the one offered by Intel CPUs (TSC).
1165-
eTraceCounterTSC = 0,
1166-
};
1167-
11681162
/// Events that might happen during a trace session.
11691163
enum TraceEvent {
11701164
/// Tracing was disabled for some time due to a software trigger
@@ -1174,6 +1168,8 @@ enum TraceEvent {
11741168
/// Event due to CPU change for a thread. This event is also fired when
11751169
/// suddenly it's not possible to identify the cpu of a given thread.
11761170
eTraceEventCPUChanged,
1171+
/// Event due to a CPU HW clock tick
1172+
eTraceEventHWClockTick,
11771173
};
11781174

11791175
// Enum used to identify which kind of item a \a TraceCursor is pointing at

‎lldb/source/Commands/CommandObjectThread.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -2162,7 +2162,7 @@ class CommandObjectTraceDumpInstructions : public CommandObjectParsed {
21622162
break;
21632163
}
21642164
case 't': {
2165-
m_dumper_options.show_tsc = true;
2165+
m_dumper_options.show_timestamps = true;
21662166
break;
21672167
}
21682168
case 'e': {

‎lldb/source/Commands/Options.td

+6-4
Original file line numberDiff line numberDiff line change
@@ -1146,15 +1146,17 @@ let Command = "thread trace dump instructions" in {
11461146
Desc<"Dump in simple JSON format.">;
11471147
def thread_trace_dump_instructions_pretty_print: Option<"pretty-json", "J">,
11481148
Group<1>,
1149-
Desc<"Dump in JSON format but pretty printing the output for easier readability.">;
1149+
Desc<"Dump in JSON format but pretty printing the output for easier "
1150+
"readability.">;
11501151
def thread_trace_dump_instructions_show_kind : Option<"kind", "k">, Group<1>,
11511152
Desc<"Show instruction control flow kind. Refer to the enum "
11521153
"`InstructionControlFlowKind` for a list of control flow kind. "
11531154
"As an important note, far jumps, far calls and far returns often indicate "
11541155
"calls to and from kernel.">;
1155-
def thread_trace_dump_instructions_show_tsc : Option<"tsc", "t">, Group<1>,
1156-
Desc<"For each instruction, print the corresponding timestamp counter if "
1157-
"available.">;
1156+
def thread_trace_dump_instructions_show_timestamps: Option<"time", "t">,
1157+
Group<1>,
1158+
Desc<"For each trace item, print the corresponding wall clock timestamp "
1159+
"if available.">;
11581160
def thread_trace_dump_instructions_show_events : Option<"events", "e">,
11591161
Group<1>,
11601162
Desc<"Dump the events that happened during the execution of the target.">;

‎lldb/source/Plugins/Trace/intel-pt/DecodedThread.cpp

+111-87
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,65 @@ void IntelPTError::log(llvm::raw_ostream &OS) const {
4444
OS << formatv(": {0:x+16}", m_address);
4545
}
4646

47-
int64_t DecodedThread::GetItemsCount() const {
48-
return static_cast<int64_t>(m_item_kinds.size());
47+
bool DecodedThread::TSCRange::InRange(uint64_t item_index) const {
48+
return item_index >= first_item_index &&
49+
item_index < first_item_index + items_count;
50+
}
51+
52+
bool DecodedThread::NanosecondsRange::InRange(uint64_t item_index) const {
53+
return item_index >= first_item_index &&
54+
item_index < first_item_index + items_count;
55+
}
56+
57+
double DecodedThread::NanosecondsRange::GetInterpolatedTime(
58+
uint64_t item_index, uint64_t begin_of_time_nanos,
59+
const LinuxPerfZeroTscConversion &tsc_conversion) const {
60+
uint64_t items_since_last_tsc = item_index - first_item_index;
61+
62+
auto interpolate = [&](uint64_t next_range_start_ns) {
63+
if (next_range_start_ns == nanos) {
64+
// If the resolution of the conversion formula is bad enough to consider
65+
// these two timestamps as equal, then we just increase the next one by 1
66+
// for correction
67+
next_range_start_ns++;
68+
}
69+
long double item_duration =
70+
static_cast<long double>(items_count) / (next_range_start_ns - nanos);
71+
return (nanos - begin_of_time_nanos) + items_since_last_tsc * item_duration;
72+
};
73+
74+
if (!next_range) {
75+
// If this is the last TSC range, we have to extrapolate. In this case,
76+
// we assume that each instruction took one TSC, which is what an
77+
// instruction would take if no parallelism is achieved and the frequency
78+
// multiplier is 1.
79+
return interpolate(tsc_conversion.ToNanos(tsc + items_count));
80+
}
81+
if (items_count < (next_range->tsc - tsc)) {
82+
// If the number of items in this range is less than the total TSC duration
83+
// of this range, i.e. each instruction taking longer than 1 TSC, then we
84+
// can assume that something else happened between these TSCs (e.g. a
85+
// context switch, change to kernel, decoding errors, etc). In this case, we
86+
// also assume that each instruction took 1 TSC. A proper way to improve
87+
// this would be to analyze the next events in the trace looking for context
88+
// switches or trace disablement events, but for now, as we only want an
89+
// approximation, we keep it simple. We are also guaranteed that the time in
90+
// nanos of the next range is different to the current one, just because of
91+
// the definition of a NanosecondsRange.
92+
return interpolate(
93+
std::min(tsc_conversion.ToNanos(tsc + items_count), next_range->nanos));
94+
}
95+
96+
// In this case, each item took less than 1 TSC, so some parallelism was
97+
// achieved, which is an indication that we didn't suffer from any kind of
98+
// interruption.
99+
return interpolate(next_range->nanos);
49100
}
50101

51-
lldb::addr_t DecodedThread::GetInstructionLoadAddress(size_t item_index) const {
102+
uint64_t DecodedThread::GetItemsCount() const { return m_item_kinds.size(); }
103+
104+
lldb::addr_t
105+
DecodedThread::GetInstructionLoadAddress(uint64_t item_index) const {
52106
return m_item_data[item_index].load_address;
53107
}
54108

@@ -58,33 +112,69 @@ DecodedThread::TraceItemStorage &
58112
DecodedThread::CreateNewTraceItem(lldb::TraceItemKind kind) {
59113
m_item_kinds.push_back(kind);
60114
m_item_data.emplace_back();
115+
if (m_last_tsc)
116+
(*m_last_tsc)->second.items_count++;
117+
if (m_last_nanoseconds)
118+
(*m_last_nanoseconds)->second.items_count++;
61119
return m_item_data.back();
62120
}
63121

64-
void DecodedThread::NotifyTsc(uint64_t tsc) {
65-
if (!m_last_tsc || *m_last_tsc != tsc) {
66-
m_timestamps.emplace(m_item_kinds.size(), tsc);
67-
m_last_tsc = tsc;
122+
void DecodedThread::NotifyTsc(TSC tsc) {
123+
if (m_last_tsc && (*m_last_tsc)->second.tsc == tsc)
124+
return;
125+
126+
m_last_tsc =
127+
m_tscs.emplace(GetItemsCount(), TSCRange{tsc, 0, GetItemsCount()}).first;
128+
129+
if (m_tsc_conversion) {
130+
uint64_t nanos = m_tsc_conversion->ToNanos(tsc);
131+
if (!m_last_nanoseconds || (*m_last_nanoseconds)->second.nanos != nanos) {
132+
m_last_nanoseconds =
133+
m_nanoseconds
134+
.emplace(GetItemsCount(), NanosecondsRange{nanos, tsc, nullptr, 0,
135+
GetItemsCount()})
136+
.first;
137+
if (*m_last_nanoseconds != m_nanoseconds.begin()) {
138+
auto prev_range = prev(*m_last_nanoseconds);
139+
prev_range->second.next_range = &(*m_last_nanoseconds)->second;
140+
}
141+
}
68142
}
143+
AppendEvent(lldb::eTraceEventHWClockTick);
69144
}
70145

71146
void DecodedThread::NotifyCPU(lldb::cpu_id_t cpu_id) {
72147
if (!m_last_cpu || *m_last_cpu != cpu_id) {
73-
m_cpus.emplace(m_item_kinds.size(), cpu_id);
148+
m_cpus.emplace(GetItemsCount(), cpu_id);
74149
m_last_cpu = cpu_id;
75150
AppendEvent(lldb::eTraceEventCPUChanged);
76151
}
77152
}
78153

79154
Optional<lldb::cpu_id_t>
80-
DecodedThread::GetCPUByIndex(uint64_t insn_index) const {
81-
// Could possibly optimize the search
82-
auto it = m_cpus.upper_bound(insn_index);
155+
DecodedThread::GetCPUByIndex(uint64_t item_index) const {
156+
auto it = m_cpus.upper_bound(item_index);
83157
if (it == m_cpus.begin())
84158
return None;
85159
return prev(it)->second;
86160
}
87161

162+
Optional<DecodedThread::TSCRange>
163+
DecodedThread::GetTSCRangeByIndex(uint64_t item_index) const {
164+
auto next_it = m_tscs.upper_bound(item_index);
165+
if (next_it == m_tscs.begin())
166+
return None;
167+
return prev(next_it)->second;
168+
}
169+
170+
Optional<DecodedThread::NanosecondsRange>
171+
DecodedThread::GetNanosecondsRangeByIndex(uint64_t item_index) {
172+
auto next_it = m_nanoseconds.upper_bound(item_index);
173+
if (next_it == m_nanoseconds.begin())
174+
return None;
175+
return prev(next_it)->second;
176+
}
177+
88178
void DecodedThread::AppendEvent(lldb::TraceEvent event) {
89179
CreateNewTraceItem(lldb::eTraceItemKindEvent).event = event;
90180
m_events_stats.RecordEvent(event);
@@ -134,90 +224,24 @@ void DecodedThread::EventsStats::RecordEvent(lldb::TraceEvent event) {
134224
total_count++;
135225
}
136226

137-
Optional<DecodedThread::TscRange> DecodedThread::CalculateTscRange(
138-
size_t insn_index,
139-
const Optional<DecodedThread::TscRange> &hint_range) const {
140-
// We first try to check the given hint range in case we are traversing the
141-
// trace in short jumps. If that fails, then we do the more expensive
142-
// arbitrary lookup.
143-
if (hint_range) {
144-
Optional<TscRange> candidate_range;
145-
if (insn_index < hint_range->GetStartInstructionIndex())
146-
candidate_range = hint_range->Prev();
147-
else if (insn_index > hint_range->GetEndInstructionIndex())
148-
candidate_range = hint_range->Next();
149-
else
150-
candidate_range = hint_range;
151-
152-
if (candidate_range && candidate_range->InRange(insn_index))
153-
return candidate_range;
154-
}
155-
// Now we do a more expensive lookup
156-
auto it = m_timestamps.upper_bound(insn_index);
157-
if (it == m_timestamps.begin())
158-
return None;
159-
160-
return TscRange(--it, *this);
161-
}
162-
163-
lldb::TraceItemKind DecodedThread::GetItemKindByIndex(size_t item_index) const {
227+
lldb::TraceItemKind
228+
DecodedThread::GetItemKindByIndex(uint64_t item_index) const {
164229
return static_cast<lldb::TraceItemKind>(m_item_kinds[item_index]);
165230
}
166231

167-
const char *DecodedThread::GetErrorByIndex(size_t item_index) const {
232+
const char *DecodedThread::GetErrorByIndex(uint64_t item_index) const {
168233
return m_item_data[item_index].error;
169234
}
170235

171-
DecodedThread::DecodedThread(ThreadSP thread_sp) : m_thread_sp(thread_sp) {}
172-
173-
lldb::TraceCursorUP DecodedThread::CreateNewCursor() {
174-
return std::make_unique<TraceCursorIntelPT>(m_thread_sp, shared_from_this());
175-
}
236+
DecodedThread::DecodedThread(
237+
ThreadSP thread_sp,
238+
const llvm::Optional<LinuxPerfZeroTscConversion> &tsc_conversion)
239+
: m_thread_sp(thread_sp), m_tsc_conversion(tsc_conversion) {}
176240

177241
size_t DecodedThread::CalculateApproximateMemoryUsage() const {
178242
return sizeof(TraceItemStorage) * m_item_data.size() +
179243
sizeof(uint8_t) * m_item_kinds.size() +
180-
(sizeof(size_t) + sizeof(uint64_t)) * m_timestamps.size() +
181-
(sizeof(size_t) + sizeof(lldb::cpu_id_t)) * m_cpus.size();
182-
}
183-
184-
DecodedThread::TscRange::TscRange(std::map<size_t, uint64_t>::const_iterator it,
185-
const DecodedThread &decoded_thread)
186-
: m_it(it), m_decoded_thread(&decoded_thread) {
187-
auto next_it = m_it;
188-
++next_it;
189-
m_end_index = (next_it == m_decoded_thread->m_timestamps.end())
190-
? std::numeric_limits<uint64_t>::max()
191-
: next_it->first - 1;
192-
}
193-
194-
size_t DecodedThread::TscRange::GetTsc() const { return m_it->second; }
195-
196-
size_t DecodedThread::TscRange::GetStartInstructionIndex() const {
197-
return m_it->first;
198-
}
199-
200-
size_t DecodedThread::TscRange::GetEndInstructionIndex() const {
201-
return m_end_index;
202-
}
203-
204-
bool DecodedThread::TscRange::InRange(size_t insn_index) const {
205-
return GetStartInstructionIndex() <= insn_index &&
206-
insn_index <= GetEndInstructionIndex();
207-
}
208-
209-
Optional<DecodedThread::TscRange> DecodedThread::TscRange::Next() const {
210-
auto next_it = m_it;
211-
++next_it;
212-
if (next_it == m_decoded_thread->m_timestamps.end())
213-
return None;
214-
return TscRange(next_it, *m_decoded_thread);
215-
}
216-
217-
Optional<DecodedThread::TscRange> DecodedThread::TscRange::Prev() const {
218-
if (m_it == m_decoded_thread->m_timestamps.begin())
219-
return None;
220-
auto prev_it = m_it;
221-
--prev_it;
222-
return TscRange(prev_it, *m_decoded_thread);
244+
(sizeof(uint64_t) + sizeof(TSC)) * m_tscs.size() +
245+
(sizeof(uint64_t) + sizeof(uint64_t)) * m_nanoseconds.size() +
246+
(sizeof(uint64_t) + sizeof(lldb::cpu_id_t)) * m_cpus.size();
223247
}

0 commit comments

Comments
 (0)
Please sign in to comment.