Skip to content

Commit 314da6c

Browse files
committed
Exit cleanly in debug build, abruptly in release build
In order to mitigate the risks of deadlock during shutdown in production builds, this adds back the behavior of exiting the process abruptly after writing metrics files. This also offers better performance by letting the OS deal with cleaning up memory and resources. This limits the behavior of unwinding all the way up to main() and joining the threads to debug builds. The policy could be replaced by an independent flag (if someone thought it important to run Valgrind on release builds, e.g. because it was too slow using it with debug ones).
1 parent ace3ce0 commit 314da6c

File tree

5 files changed

+105
-37
lines changed

5 files changed

+105
-37
lines changed

src/api_server/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,10 @@ impl ApiServer {
280280
Ok(ParsedRequest::GetMMDS) => Some(self.get_mmds()),
281281
Ok(ParsedRequest::PatchMMDS(value)) => Some(self.patch_mmds(value)),
282282
Ok(ParsedRequest::PutMMDS(value)) => Some(self.put_mmds(value)),
283+
284+
#[cfg(debug_assertions)]
283285
Ok(ParsedRequest::ShutdownInternal) => None,
286+
284287
Err(e) => {
285288
error!("{}", e);
286289
Some(e.into())

src/api_server/src/parsed_request.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ pub(crate) enum ParsedRequest {
3131
PatchMMDS(Value),
3232
PutMMDS(Value),
3333
Sync(Box<VmmAction>),
34+
35+
#[cfg(debug_assertions)]
3436
ShutdownInternal, // !!! not an API, used by shutdown to thread::join the API thread
3537
}
3638

@@ -58,7 +60,16 @@ impl ParsedRequest {
5860

5961
match (request.method(), path, request.body.as_ref()) {
6062
(Method::Get, "", None) => parse_get_instance_info(),
61-
(Method::Get, "shutdown-internal", None) => Ok(ParsedRequest::ShutdownInternal),
63+
64+
#[cfg(debug_assertions)]
65+
(Method::Get, "shutdown-internal", None) => {
66+
//
67+
// This isn't a user-facing API, and was added solely to facilitate clean shutdowns.
68+
// Calling it manually will cause problems, so only enable it in debug builds.
69+
//
70+
Ok(ParsedRequest::ShutdownInternal)
71+
},
72+
6273
(Method::Get, "balloon", None) => parse_get_balloon(path_tokens.get(1)),
6374
(Method::Get, "machine-config", None) => parse_get_machine_config(),
6475
(Method::Get, "mmds", None) => parse_get_mmds(),

src/firecracker/src/api_server_adapter.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -251,16 +251,17 @@ pub(crate) fn run_with_api(
251251
&mut event_manager,
252252
);
253253

254+
// Note: In the release build, this is never reached...because exit() is called
255+
// abruptly (the OS does faster cleanup, and it reduces the risk of hanging).
256+
// Top level main() will complain if the bubbling process happens in release builds.
257+
254258
// We want to tell the API thread to shut down for a clean exit. But this is after
255259
// the Vmm.stop() has been called, so it's a moment of internal finalization (as
256260
// opposed to be something the client might call to shut the Vm down). Since it's
257261
// an internal signal implementing it with an HTTP request is probably not the ideal
258262
// way to do it...but having another way would involve waiting on the socket or some
259263
// other signal. This leverages the existing wait.
260264
//
261-
// !!! Since the code is only needed for a "clean" shutdown mode, a non-clean mode
262-
// could not respond to the request, making this effectively a debug-only feature.
263-
//
264265
let mut sock = UnixStream::connect(bind_path).unwrap();
265266
assert!(sock.write_all(b"GET /shutdown-internal HTTP/1.1\r\n\r\n").is_ok());
266267

src/firecracker/src/main.rs

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,25 @@ use seccomp::{BpfProgram, SeccompLevel};
1616
use utils::arg_parser::{ArgParser, Argument};
1717
use utils::terminal::Terminal;
1818
use utils::validators::validate_instance_id;
19+
1920
use vmm::default_syscalls::get_seccomp_filter;
2021
use vmm::resources::VmResources;
2122
use vmm::signal_handler::{mask_handled_signals, SignalManager};
2223
use vmm::version_map::FC_VERSION_TO_SNAP_VERSION;
2324
use vmm::vmm_config::instance_info::InstanceInfo;
2425
use vmm::vmm_config::logger::{init_logger, LoggerConfig, LoggerLevel};
2526

27+
#[cfg(debug_assertions)]
28+
use vmm::exit_firecracker_abruptly;
29+
2630
// The reason we place default API socket under /run is that API socket is a
2731
// runtime file.
2832
// see https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s15.html for more information.
2933
const DEFAULT_API_SOCK_PATH: &str = "/run/firecracker.socket";
3034
const DEFAULT_INSTANCE_ID: &str = "anonymous-instance";
3135
const FIRECRACKER_VERSION: &str = env!("FIRECRACKER_VERSION");
3236

33-
fn main_exitable() -> ExitCode {
37+
fn main_exitable() -> (SeccompLevel, ExitCode) {
3438
LOGGER
3539
.configure(Some(DEFAULT_INSTANCE_ID.to_string()))
3640
.expect("Failed to register logger");
@@ -206,13 +210,11 @@ fn main_exitable() -> ExitCode {
206210
}
207211

208212
// It's safe to unwrap here because the field's been provided with a default value.
209-
let seccomp_level = arguments.single_value("seccomp-level").unwrap();
210-
let seccomp_filter = get_seccomp_filter(
211-
SeccompLevel::from_string(&seccomp_level).unwrap_or_else(|err| {
212-
panic!("Invalid value for seccomp-level: {}", err);
213-
}),
214-
)
215-
.unwrap_or_else(|err| {
213+
let seccomp_level_arg = arguments.single_value("seccomp-level").unwrap();
214+
let seccomp_level = SeccompLevel::from_string(&seccomp_level_arg).unwrap_or_else(|err| {
215+
panic!("Invalid value for seccomp-level: {}", err);
216+
});
217+
let seccomp_filter = get_seccomp_filter(seccomp_level).unwrap_or_else(|err| {
216218
panic!("Could not create seccomp filter: {}", err);
217219
});
218220

@@ -224,7 +226,7 @@ fn main_exitable() -> ExitCode {
224226
let boot_timer_enabled = arguments.flag_present("boot-timer");
225227
let api_enabled = !arguments.flag_present("no-api");
226228

227-
if api_enabled {
229+
let exit_code = if api_enabled {
228230
let bind_path = arguments
229231
.single_value("api-sock")
230232
.map(PathBuf::from)
@@ -262,22 +264,40 @@ fn main_exitable() -> ExitCode {
262264
&instance_info,
263265
boot_timer_enabled,
264266
)
265-
}
267+
};
268+
269+
(seccomp_level, exit_code)
266270
}
267271

268-
fn main () {
269-
// This idiom is the prescribed way to get a clean shutdown of Rust (that will report
270-
// no leaks in Valgrind or sanitizers). Calling `unsafe { libc::exit() }` does no
271-
// cleanup, and std::process::exit() does more--but does not run destructors. So the
272-
// best thing to do is to is bubble up the exit code through the whole stack, and
273-
// only exit when everything potentially destructible has cleaned itself up.
272+
// This idiom of wrapping main is the prescribed way to get a clean shutdown of Rust (that
273+
// will report no leaks in Valgrind or sanitizers). It gives destructors a chance to run.
274+
//
275+
// See process_exitable() method of Subscriber trait for what triggers the exit_code.
276+
//
277+
// Variable named _seccomp_level instead of seccomp_level to avoid warning that the
278+
// release build doesn't use it.
279+
//
280+
fn main() {
281+
// Release builds exit as soon as possible; faster and reduces impact of deadlock bugs.
274282
//
275-
// https://doc.rust-lang.org/std/process/fn.exit.html
276-
//
277-
// See process_exitable() method of Subscriber trait for what triggers the exit_code.
283+
#[cfg(not(debug_assertions))]
284+
{
285+
main_exitable();
286+
panic!("Release build bubbled exit_code to main() vs. ending abruptly earlier");
287+
}
288+
289+
// Debug builds exit as cleanly as they are able to, for Valgrind and sanity checking.
278290
//
279-
let exit_code = main_exitable();
280-
std::process::exit(i32::from(exit_code));
291+
#[cfg(debug_assertions)]
292+
{
293+
let (seccomp_level, exit_code) = main_exitable();
294+
295+
if seccomp_level == SeccompLevel::None {
296+
std::process::exit(i32::from(exit_code)); // includes Rust library cleanup
297+
} else {
298+
exit_firecracker_abruptly(exit_code); // see notes on seccomp interaction
299+
}
300+
}
281301
}
282302

283303
// Print supported snapshot data format versions.

src/vmm/src/lib.rs

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,21 @@ pub(crate) fn mem_size_mib(guest_memory: &GuestMemoryMmap) -> u64 {
227227
guest_memory.map_and_fold(0, |(_, region)| region.len(), |a, b| a + b) >> 20
228228
}
229229

230+
/// The default/recommended filter for production workloads is a tight whitelist that
231+
/// doesn't include some syscalls done by Rust during thread exit (rt_sigprocmask is
232+
/// the first to hit the filter). So when default seccomp filter is installed,
233+
/// calling Rust's `std::process::exit()` will result in a seccomp violation panic.
234+
///
235+
/// Calling the libc::_exit() function is even more basic than libc::exit(), and so
236+
/// it must be used when seccomp filtering is enabled, unless something else changes.
237+
///
238+
/// But see main() for how debug builds use std::process::exit() when they are verifying
239+
/// shutdown can run cleanly, when seccomp filtering is off.
240+
///
241+
pub fn exit_firecracker_abruptly(exit_code: ExitCode) -> ! { // ! -> diverging function
242+
unsafe { libc::_exit(i32::from(exit_code)) }
243+
}
244+
230245
/// Contains the state and associated methods required for the Firecracker VMM.
231246
pub struct Vmm {
232247
events_observer: Option<Box<dyn VmmEventsObserver>>,
@@ -353,13 +368,14 @@ impl Vmm {
353368
.map_err(Error::I8042Error)
354369
}
355370

356-
/// Waits for all vCPUs to exit. Does not terminate the Firecracker process.
357-
/// (See notes in main() about why ExitCode is bubbled up for clean shutdown.)
358-
pub fn stop(&mut self) {
371+
/// This stops the VMM. If it's a release build, this will exit the Firecracker process
372+
/// entirely. But debug builds enforce a higher level of cleanliness, and return an
373+
/// exit code that the caller should bubble up to main().
374+
///
375+
pub fn stop(&mut self, exit_code: ExitCode) -> ExitCode {
359376
info!("Vmm is stopping.");
360377

361-
self.exit_vcpus().unwrap(); // exit all not-already-exited VCPUs, join their threads
362-
378+
// Teardown the VMM (produces metrics we need to write out)
363379
if let Some(observer) = self.events_observer.as_mut() {
364380
if let Err(e) = observer.on_vmm_stop() {
365381
warn!("{}", Error::VmmObserverTeardown(e));
@@ -370,6 +386,23 @@ impl Vmm {
370386
if let Err(e) = METRICS.write() {
371387
error!("Failed to write metrics while stopping: {}", e);
372388
}
389+
390+
// The release build exits here, to reduce the impact of shutdown deadlock bugs.
391+
// It's also faster to let the OS do memory and resource cleanup, once semantically
392+
// important shutdown (e.g. flushing and writing any open files) is done.
393+
//
394+
#[cfg(not(debug_assertions))]
395+
exit_firecracker_abruptly(exit_code);
396+
397+
// The debug build will shut down in an orderly fashion, bubbling up the exit code all
398+
// the way to main and joining all threads (including ending the API server gracefully).
399+
//
400+
#[cfg(debug_assertions)]
401+
{
402+
self.exit_vcpus().unwrap(); // exit all not-already-exited VCPUs, join their threads
403+
404+
exit_code
405+
}
373406
}
374407

375408
/// Saves the state of a paused Microvm.
@@ -777,6 +810,12 @@ impl Subscriber for Vmm {
777810
}
778811
}
779812

813+
// If the exit_code can't be found on any vcpu, it means that the exit signal
814+
// has been issued by the i8042 controller in which case we exit with
815+
// FC_EXIT_CODE_OK.
816+
//
817+
let exit_code = opt_exit_code.unwrap_or(FC_EXIT_CODE_OK);
818+
780819
// !!! The caller of this routine is receiving the exit code to bubble back up
781820
// to the main() function to return cleanly. However, it does not have clean
782821
// access to the Vmm to shut it down (here we have it, since it is `self`). It
@@ -787,13 +826,7 @@ impl Subscriber for Vmm {
787826
// that will actually work with an exit code (all other Subscriber trait
788827
// implementers must use process())
789828
//
790-
self.stop();
791-
792-
// If the exit_code can't be found on any vcpu, it means that the exit signal
793-
// has been issued by the i8042 controller in which case we exit with
794-
// FC_EXIT_CODE_OK.
795-
//
796-
Some(opt_exit_code.unwrap_or(FC_EXIT_CODE_OK))
829+
Some(self.stop(exit_code)) // exits abruptly if release build, else returns
797830
} else {
798831
error!("Spurious EventManager event for handler: Vmm");
799832
None

0 commit comments

Comments
 (0)