Skip to content

Commit 7d0db1e

Browse files
committed
Auto merge of #2647 - saethlin:current-span, r=RalfJung
Track local frames incrementally during execution rust-lang/miri#2646 currently introduces a performance regression. This change removes that regression, and provides a minor perf improvement. The existing lazy strategy for tracking the span we want to display is as efficient as it is only because we often create a `CurrentSpan` then never call `.get()`. Most of the calls to the `before_memory_read` and `before_memory_write` hooks do not create any event that we store in `AllocHistory`. But data races are totally different, any memory read or write may race, so every call to those hooks needs to access to the current local span. So this changes to a strategy where we update some state in a `Thread` and `FrameExtra` incrementally, upon entering and existing each function call. Before: ``` Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/backtraces/Cargo.toml Time (mean ± σ): 5.532 s ± 0.022 s [User: 5.444 s, System: 0.073 s] Range (min … max): 5.516 s … 5.569 s 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/mse/Cargo.toml Time (mean ± σ): 831.4 ms ± 3.0 ms [User: 783.8 ms, System: 46.7 ms] Range (min … max): 828.7 ms … 836.1 ms 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde1/Cargo.toml Time (mean ± σ): 1.975 s ± 0.021 s [User: 1.914 s, System: 0.059 s] Range (min … max): 1.939 s … 1.990 s 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde2/Cargo.toml Time (mean ± σ): 4.060 s ± 0.051 s [User: 3.983 s, System: 0.071 s] Range (min … max): 3.972 s … 4.100 s 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/slice-get-unchecked/Cargo.toml Time (mean ± σ): 784.9 ms ± 8.2 ms [User: 746.5 ms, System: 37.7 ms] Range (min … max): 772.9 ms … 793.3 ms 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/unicode/Cargo.toml Time (mean ± σ): 1.679 s ± 0.006 s [User: 1.623 s, System: 0.055 s] Range (min … max): 1.673 s … 1.687 s 5 runs ``` After: ``` Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/backtraces/Cargo.toml Time (mean ± σ): 5.330 s ± 0.037 s [User: 5.232 s, System: 0.084 s] Range (min … max): 5.280 s … 5.383 s 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/mse/Cargo.toml Time (mean ± σ): 818.9 ms ± 3.7 ms [User: 776.8 ms, System: 41.3 ms] Range (min … max): 813.5 ms … 822.5 ms 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde1/Cargo.toml Time (mean ± σ): 1.927 s ± 0.011 s [User: 1.864 s, System: 0.061 s] Range (min … max): 1.917 s … 1.945 s 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde2/Cargo.toml Time (mean ± σ): 3.974 s ± 0.020 s [User: 3.893 s, System: 0.076 s] Range (min … max): 3.956 s … 4.004 s 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/slice-get-unchecked/Cargo.toml Time (mean ± σ): 780.0 ms ± 5.3 ms [User: 740.3 ms, System: 39.0 ms] Range (min … max): 771.2 ms … 784.5 ms 5 runs Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/unicode/Cargo.toml Time (mean ± σ): 1.643 s ± 0.007 s [User: 1.584 s, System: 0.058 s] Range (min … max): 1.635 s … 1.654 s 5 runs ``` (This change is marginal, but the point is that it avoids a much more significant regression)
2 parents fe81522 + 726b9d0 commit 7d0db1e

File tree

6 files changed

+153
-161
lines changed

6 files changed

+153
-161
lines changed

src/tools/miri/src/concurrency/thread.rs

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,13 @@ pub struct Thread<'mir, 'tcx> {
118118
/// The virtual call stack.
119119
stack: Vec<Frame<'mir, 'tcx, Provenance, FrameData<'tcx>>>,
120120

121+
/// The index of the topmost user-relevant frame in `stack`. This field must contain
122+
/// the value produced by `get_top_user_relevant_frame`.
123+
/// The `None` state here represents
124+
/// This field is a cache to reduce how often we call that method. The cache is manually
125+
/// maintained inside `MiriMachine::after_stack_push` and `MiriMachine::after_stack_pop`.
126+
top_user_relevant_frame: Option<usize>,
127+
121128
/// The join status.
122129
join_status: ThreadJoinStatus,
123130

@@ -147,6 +154,40 @@ impl<'mir, 'tcx> Thread<'mir, 'tcx> {
147154
fn thread_name(&self) -> &[u8] {
148155
if let Some(ref thread_name) = self.thread_name { thread_name } else { b"<unnamed>" }
149156
}
157+
158+
/// Return the top user-relevant frame, if there is one.
159+
/// Note that the choice to return `None` here when there is no user-relevant frame is part of
160+
/// justifying the optimization that only pushes of user-relevant frames require updating the
161+
/// `top_user_relevant_frame` field.
162+
fn compute_top_user_relevant_frame(&self) -> Option<usize> {
163+
self.stack
164+
.iter()
165+
.enumerate()
166+
.rev()
167+
.find_map(|(idx, frame)| if frame.extra.is_user_relevant { Some(idx) } else { None })
168+
}
169+
170+
/// Re-compute the top user-relevant frame from scratch.
171+
pub fn recompute_top_user_relevant_frame(&mut self) {
172+
self.top_user_relevant_frame = self.compute_top_user_relevant_frame();
173+
}
174+
175+
/// Set the top user-relevant frame to the given value. Must be equal to what
176+
/// `get_top_user_relevant_frame` would return!
177+
pub fn set_top_user_relevant_frame(&mut self, frame_idx: usize) {
178+
debug_assert_eq!(Some(frame_idx), self.compute_top_user_relevant_frame());
179+
self.top_user_relevant_frame = Some(frame_idx);
180+
}
181+
182+
/// Returns the topmost frame that is considered user-relevant, or the
183+
/// top of the stack if there is no such frame, or `None` if the stack is empty.
184+
pub fn top_user_relevant_frame(&self) -> Option<usize> {
185+
debug_assert_eq!(self.top_user_relevant_frame, self.compute_top_user_relevant_frame());
186+
// This can be called upon creation of an allocation. We create allocations while setting up
187+
// parts of the Rust runtime when we do not have any stack frames yet, so we need to handle
188+
// empty stacks.
189+
self.top_user_relevant_frame.or_else(|| self.stack.len().checked_sub(1))
190+
}
150191
}
151192

152193
impl<'mir, 'tcx> std::fmt::Debug for Thread<'mir, 'tcx> {
@@ -167,6 +208,7 @@ impl<'mir, 'tcx> Default for Thread<'mir, 'tcx> {
167208
state: ThreadState::Enabled,
168209
thread_name: None,
169210
stack: Vec::new(),
211+
top_user_relevant_frame: None,
170212
join_status: ThreadJoinStatus::Joinable,
171213
panic_payload: None,
172214
last_error: None,
@@ -184,8 +226,15 @@ impl<'mir, 'tcx> Thread<'mir, 'tcx> {
184226

185227
impl VisitTags for Thread<'_, '_> {
186228
fn visit_tags(&self, visit: &mut dyn FnMut(SbTag)) {
187-
let Thread { panic_payload, last_error, stack, state: _, thread_name: _, join_status: _ } =
188-
self;
229+
let Thread {
230+
panic_payload,
231+
last_error,
232+
stack,
233+
top_user_relevant_frame: _,
234+
state: _,
235+
thread_name: _,
236+
join_status: _,
237+
} = self;
189238

190239
panic_payload.visit_tags(visit);
191240
last_error.visit_tags(visit);
@@ -414,7 +463,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> {
414463
}
415464

416465
/// Get a shared borrow of the currently active thread.
417-
fn active_thread_ref(&self) -> &Thread<'mir, 'tcx> {
466+
pub fn active_thread_ref(&self) -> &Thread<'mir, 'tcx> {
418467
&self.threads[self.active_thread]
419468
}
420469

src/tools/miri/src/helpers.rs

Lines changed: 20 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -936,78 +936,42 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
936936
}
937937

938938
impl<'mir, 'tcx> MiriMachine<'mir, 'tcx> {
939-
pub fn current_span(&self) -> CurrentSpan<'_, 'mir, 'tcx> {
940-
CurrentSpan { current_frame_idx: None, machine: self }
941-
}
942-
}
943-
944-
/// A `CurrentSpan` should be created infrequently (ideally once) per interpreter step. It does
945-
/// nothing on creation, but when `CurrentSpan::get` is called, searches the current stack for the
946-
/// topmost frame which corresponds to a local crate, and returns the current span in that frame.
947-
/// The result of that search is cached so that later calls are approximately free.
948-
#[derive(Clone)]
949-
pub struct CurrentSpan<'a, 'mir, 'tcx> {
950-
current_frame_idx: Option<usize>,
951-
machine: &'a MiriMachine<'mir, 'tcx>,
952-
}
953-
954-
impl<'a, 'mir: 'a, 'tcx: 'a + 'mir> CurrentSpan<'a, 'mir, 'tcx> {
955-
pub fn machine(&self) -> &'a MiriMachine<'mir, 'tcx> {
956-
self.machine
957-
}
958-
959-
/// Get the current span, skipping non-local frames.
939+
/// Get the current span in the topmost function which is workspace-local and not
940+
/// `#[track_caller]`.
960941
/// This function is backed by a cache, and can be assumed to be very fast.
961-
pub fn get(&mut self) -> Span {
962-
let idx = self.current_frame_idx();
963-
self.stack().get(idx).map(Frame::current_span).unwrap_or(rustc_span::DUMMY_SP)
942+
/// It will work even when the stack is empty.
943+
pub fn current_span(&self) -> Span {
944+
self.top_user_relevant_frame()
945+
.map(|frame_idx| self.stack()[frame_idx].current_span())
946+
.unwrap_or(rustc_span::DUMMY_SP)
964947
}
965948

966949
/// Returns the span of the *caller* of the current operation, again
967950
/// walking down the stack to find the closest frame in a local crate, if the caller of the
968951
/// current operation is not in a local crate.
969952
/// This is useful when we are processing something which occurs on function-entry and we want
970953
/// to point at the call to the function, not the function definition generally.
971-
pub fn get_caller(&mut self) -> Span {
954+
pub fn caller_span(&self) -> Span {
972955
// We need to go down at least to the caller (len - 2), or however
973-
// far we have to go to find a frame in a local crate.
974-
let local_frame_idx = self.current_frame_idx();
975-
let stack = self.stack();
976-
let idx = cmp::min(local_frame_idx, stack.len().saturating_sub(2));
977-
stack.get(idx).map(Frame::current_span).unwrap_or(rustc_span::DUMMY_SP)
956+
// far we have to go to find a frame in a local crate which is also not #[track_caller].
957+
let frame_idx = self.top_user_relevant_frame().unwrap();
958+
let frame_idx = cmp::min(frame_idx, self.stack().len().checked_sub(2).unwrap());
959+
self.stack()[frame_idx].current_span()
978960
}
979961

980962
fn stack(&self) -> &[Frame<'mir, 'tcx, Provenance, machine::FrameData<'tcx>>] {
981-
self.machine.threads.active_thread_stack()
963+
self.threads.active_thread_stack()
982964
}
983965

984-
fn current_frame_idx(&mut self) -> usize {
985-
*self
986-
.current_frame_idx
987-
.get_or_insert_with(|| Self::compute_current_frame_index(self.machine))
966+
fn top_user_relevant_frame(&self) -> Option<usize> {
967+
self.threads.active_thread_ref().top_user_relevant_frame()
988968
}
989969

990-
// Find the position of the inner-most frame which is part of the crate being
991-
// compiled/executed, part of the Cargo workspace, and is also not #[track_caller].
992-
#[inline(never)]
993-
fn compute_current_frame_index(machine: &MiriMachine<'_, '_>) -> usize {
994-
machine
995-
.threads
996-
.active_thread_stack()
997-
.iter()
998-
.enumerate()
999-
.rev()
1000-
.find_map(|(idx, frame)| {
1001-
let def_id = frame.instance.def_id();
1002-
if (def_id.is_local() || machine.local_crates.contains(&def_id.krate))
1003-
&& !frame.instance.def.requires_caller_location(machine.tcx)
1004-
{
1005-
Some(idx)
1006-
} else {
1007-
None
1008-
}
1009-
})
1010-
.unwrap_or(0)
970+
/// This is the source of truth for the `is_user_relevant` flag in our `FrameExtra`.
971+
pub fn is_user_relevant(&self, frame: &Frame<'mir, 'tcx, Provenance>) -> bool {
972+
let def_id = frame.instance.def_id();
973+
(def_id.is_local() || self.local_crates.contains(&def_id.krate))
974+
&& !frame.instance.def.requires_caller_location(self.tcx)
1011975
}
1012976
}
1013977

src/tools/miri/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ pub use crate::diagnostics::{
9797
pub use crate::eval::{
9898
create_ecx, eval_entry, AlignmentCheck, BacktraceStyle, IsolatedOp, MiriConfig, RejectOpWith,
9999
};
100-
pub use crate::helpers::{CurrentSpan, EvalContextExt as _};
100+
pub use crate::helpers::EvalContextExt as _;
101101
pub use crate::intptrcast::ProvenanceMode;
102102
pub use crate::machine::{
103103
AllocExtra, FrameData, MiriInterpCx, MiriInterpCxExt, MiriMachine, MiriMemoryKind,

src/tools/miri/src/machine.rs

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,18 @@ pub struct FrameData<'tcx> {
5050
/// for the start of this frame. When we finish executing this frame,
5151
/// we use this to register a completed event with `measureme`.
5252
pub timing: Option<measureme::DetachedTiming>,
53+
54+
/// Indicates whether a `Frame` is part of a workspace-local crate and is also not
55+
/// `#[track_caller]`. We compute this once on creation and store the result, as an
56+
/// optimization.
57+
/// This is used by `MiriMachine::current_span` and `MiriMachine::caller_span`
58+
pub is_user_relevant: bool,
5359
}
5460

5561
impl<'tcx> std::fmt::Debug for FrameData<'tcx> {
5662
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
5763
// Omitting `timing`, it does not support `Debug`.
58-
let FrameData { stacked_borrows, catch_unwind, timing: _ } = self;
64+
let FrameData { stacked_borrows, catch_unwind, timing: _, is_user_relevant: _ } = self;
5965
f.debug_struct("FrameData")
6066
.field("stacked_borrows", stacked_borrows)
6167
.field("catch_unwind", catch_unwind)
@@ -65,7 +71,7 @@ impl<'tcx> std::fmt::Debug for FrameData<'tcx> {
6571

6672
impl VisitTags for FrameData<'_> {
6773
fn visit_tags(&self, visit: &mut dyn FnMut(SbTag)) {
68-
let FrameData { catch_unwind, stacked_borrows, timing: _ } = self;
74+
let FrameData { catch_unwind, stacked_borrows, timing: _, is_user_relevant: _ } = self;
6975

7076
catch_unwind.visit_tags(visit);
7177
stacked_borrows.visit_tags(visit);
@@ -895,13 +901,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
895901

896902
let alloc = alloc.into_owned();
897903
let stacks = ecx.machine.stacked_borrows.as_ref().map(|stacked_borrows| {
898-
Stacks::new_allocation(
899-
id,
900-
alloc.size(),
901-
stacked_borrows,
902-
kind,
903-
ecx.machine.current_span(),
904-
)
904+
Stacks::new_allocation(id, alloc.size(), stacked_borrows, kind, &ecx.machine)
905905
});
906906
let race_alloc = ecx.machine.data_race.as_ref().map(|data_race| {
907907
data_race::AllocExtra::new_allocation(
@@ -1016,8 +1016,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
10161016
prov_extra,
10171017
range,
10181018
machine.stacked_borrows.as_ref().unwrap(),
1019-
machine.current_span(),
1020-
&machine.threads,
1019+
machine,
10211020
)?;
10221021
}
10231022
if let Some(weak_memory) = &alloc_extra.weak_memory {
@@ -1048,8 +1047,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
10481047
prov_extra,
10491048
range,
10501049
machine.stacked_borrows.as_ref().unwrap(),
1051-
machine.current_span(),
1052-
&machine.threads,
1050+
machine,
10531051
)?;
10541052
}
10551053
if let Some(weak_memory) = &alloc_extra.weak_memory {
@@ -1083,8 +1081,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
10831081
prove_extra,
10841082
range,
10851083
machine.stacked_borrows.as_ref().unwrap(),
1086-
machine.current_span(),
1087-
&machine.threads,
1084+
machine,
10881085
)
10891086
} else {
10901087
Ok(())
@@ -1126,7 +1123,9 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
11261123
stacked_borrows: stacked_borrows.map(|sb| sb.borrow_mut().new_frame(&ecx.machine)),
11271124
catch_unwind: None,
11281125
timing,
1126+
is_user_relevant: ecx.machine.is_user_relevant(&frame),
11291127
};
1128+
11301129
Ok(frame.with_extra(extra))
11311130
}
11321131

@@ -1174,6 +1173,13 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
11741173

11751174
#[inline(always)]
11761175
fn after_stack_push(ecx: &mut InterpCx<'mir, 'tcx, Self>) -> InterpResult<'tcx> {
1176+
if ecx.frame().extra.is_user_relevant {
1177+
// We just pushed a local frame, so we know that the topmost local frame is the topmost
1178+
// frame. If we push a non-local frame, there's no need to do anything.
1179+
let stack_len = ecx.active_thread_stack().len();
1180+
ecx.active_thread_mut().set_top_user_relevant_frame(stack_len - 1);
1181+
}
1182+
11771183
if ecx.machine.stacked_borrows.is_some() { ecx.retag_return_place() } else { Ok(()) }
11781184
}
11791185

@@ -1183,6 +1189,13 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
11831189
mut frame: Frame<'mir, 'tcx, Provenance, FrameData<'tcx>>,
11841190
unwinding: bool,
11851191
) -> InterpResult<'tcx, StackPopJump> {
1192+
if frame.extra.is_user_relevant {
1193+
// All that we store is whether or not the frame we just removed is local, so now we
1194+
// have no idea where the next topmost local frame is. So we recompute it.
1195+
// (If this ever becomes a bottleneck, we could have `push` store the previous
1196+
// user-relevant frame and restore that here.)
1197+
ecx.active_thread_mut().recompute_top_user_relevant_frame();
1198+
}
11861199
let timing = frame.extra.timing.take();
11871200
if let Some(stacked_borrows) = &ecx.machine.stacked_borrows {
11881201
stacked_borrows.borrow_mut().end_call(&frame.extra);

0 commit comments

Comments
 (0)