Exit cleanly in debug build, abruptly in release build

AE1020 · AE1020 · commit 314da6ce34ad · 2021-05-04T10:23:17.000-04:00
To mitigate the risk of deadlock during shutdown in production
builds, this restores the behavior of exiting the process
abruptly after writing metrics files.  It also makes shutdown
faster, because the OS is left to clean up memory and other
resources.

Unwinding all the way up to main() and joining the threads is
now done only in debug builds.  This policy could instead be
controlled by an independent flag (e.g. if someone thought it
important to run Valgrind on release builds, because it was too
slow when used with debug ones).
diff --git a/src/api_server/src/lib.rs b/src/api_server/src/lib.rs
@@ -280,7 +280,10 @@ impl ApiServer {
             Ok(ParsedRequest::GetMMDS) => Some(self.get_mmds()),
             Ok(ParsedRequest::PatchMMDS(value)) => Some(self.patch_mmds(value)),
             Ok(ParsedRequest::PutMMDS(value)) => Some(self.put_mmds(value)),
+
+            #[cfg(debug_assertions)]
             Ok(ParsedRequest::ShutdownInternal) => None,
+
             Err(e) => {
                 error!("{}", e);
                 Some(e.into())
diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs
@@ -31,6 +31,8 @@ pub(crate) enum ParsedRequest {
     PatchMMDS(Value),
     PutMMDS(Value),
     Sync(Box<VmmAction>),
+
+    #[cfg(debug_assertions)]
     ShutdownInternal,  // !!! not an API, used by shutdown to thread::join the API thread
 }
 
@@ -58,7 +60,16 @@ impl ParsedRequest {
 
         match (request.method(), path, request.body.as_ref()) {
             (Method::Get, "", None) => parse_get_instance_info(),
-            (Method::Get, "shutdown-internal", None) => Ok(ParsedRequest::ShutdownInternal),
+
+            #[cfg(debug_assertions)]
+            (Method::Get, "shutdown-internal", None) => {
+                //
+                // This isn't a user-facing API, and was added solely to facilitate clean shutdowns.
+                // Calling it manually will cause problems, so only enable it in debug builds.
+                //
+                Ok(ParsedRequest::ShutdownInternal)
+            },
+
             (Method::Get, "balloon", None) => parse_get_balloon(path_tokens.get(1)),
             (Method::Get, "machine-config", None) => parse_get_machine_config(),
             (Method::Get, "mmds", None) => parse_get_mmds(),
diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs
@@ -251,16 +251,17 @@ pub(crate) fn run_with_api(
         &mut event_manager,
     );
 
+    // Note: In the release build, this is never reached...because exit() is called
+    // abruptly (the OS does faster cleanup, and it reduces the risk of hanging).
+    // Top level main() will complain if the bubbling process happens in release builds.
+
     // We want to tell the API thread to shut down for a clean exit.  But this is after
     // the Vmm.stop() has been called, so it's a moment of internal finalization (as
     // opposed to be something the client might call to shut the Vm down).  Since it's
     // an internal signal implementing it with an HTTP request is probably not the ideal
     // way to do it...but having another way would involve waiting on the socket or some
     // other signal.  This leverages the existing wait.
     //
-    // !!! Since the code is only needed for a "clean" shutdown mode, a non-clean mode
-    // could not respond to the request, making this effectively a debug-only feature.
-    //
     let mut sock = UnixStream::connect(bind_path).unwrap();
     assert!(sock.write_all(b"GET /shutdown-internal HTTP/1.1\r\n\r\n").is_ok());
 
diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs
@@ -16,21 +16,25 @@ use seccomp::{BpfProgram, SeccompLevel};
 use utils::arg_parser::{ArgParser, Argument};
 use utils::terminal::Terminal;
 use utils::validators::validate_instance_id;
+
 use vmm::default_syscalls::get_seccomp_filter;
 use vmm::resources::VmResources;
 use vmm::signal_handler::{mask_handled_signals, SignalManager};
 use vmm::version_map::FC_VERSION_TO_SNAP_VERSION;
 use vmm::vmm_config::instance_info::InstanceInfo;
 use vmm::vmm_config::logger::{init_logger, LoggerConfig, LoggerLevel};
 
+#[cfg(debug_assertions)]
+use vmm::exit_firecracker_abruptly;
+
 // The reason we place default API socket under /run is that API socket is a
 // runtime file.
 // see https://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s15.html for more information.
 const DEFAULT_API_SOCK_PATH: &str = "/run/firecracker.socket";
 const DEFAULT_INSTANCE_ID: &str = "anonymous-instance";
 const FIRECRACKER_VERSION: &str = env!("FIRECRACKER_VERSION");
 
-fn main_exitable() -> ExitCode {
+fn main_exitable() -> (SeccompLevel, ExitCode) {
     LOGGER
         .configure(Some(DEFAULT_INSTANCE_ID.to_string()))
         .expect("Failed to register logger");
@@ -206,13 +210,11 @@ fn main_exitable() -> ExitCode {
     }
 
     // It's safe to unwrap here because the field's been provided with a default value.
-    let seccomp_level = arguments.single_value("seccomp-level").unwrap();
-    let seccomp_filter = get_seccomp_filter(
-        SeccompLevel::from_string(&seccomp_level).unwrap_or_else(|err| {
-            panic!("Invalid value for seccomp-level: {}", err);
-        }),
-    )
-    .unwrap_or_else(|err| {
+    let seccomp_level_arg = arguments.single_value("seccomp-level").unwrap();
+    let seccomp_level = SeccompLevel::from_string(&seccomp_level_arg).unwrap_or_else(|err| {
+        panic!("Invalid value for seccomp-level: {}", err);
+    });
+    let seccomp_filter = get_seccomp_filter(seccomp_level).unwrap_or_else(|err| {
         panic!("Could not create seccomp filter: {}", err);
     });
 
@@ -224,7 +226,7 @@ fn main_exitable() -> ExitCode {
     let boot_timer_enabled = arguments.flag_present("boot-timer");
     let api_enabled = !arguments.flag_present("no-api");
 
-    if api_enabled {
+    let exit_code = if api_enabled {
         let bind_path = arguments
             .single_value("api-sock")
             .map(PathBuf::from)
@@ -262,22 +264,40 @@ fn main_exitable() -> ExitCode {
             &instance_info,
             boot_timer_enabled,
         )
-    }
+    };
+
+    (seccomp_level, exit_code)
 }
 
-fn main () {
-    // This idiom is the prescribed way to get a clean shutdown of Rust (that will report
-    // no leaks in Valgrind or sanitizers).  Calling `unsafe { libc::exit() }` does no
-    // cleanup, and std::process::exit() does more--but does not run destructors.  So the
-    // best thing to do is to is bubble up the exit code through the whole stack, and
-    // only exit when everything potentially destructible has cleaned itself up.
+// This idiom of wrapping main is the prescribed way to get a clean shutdown of Rust (that
+// will report no leaks in Valgrind or sanitizers).  It gives destructors a chance to run.
+//
+// See process_exitable() method of Subscriber trait for what triggers the exit_code.
+//
+// main_exitable() also returns the seccomp level, which only the debug build inspects
+// (to pick how to exit); the release build discards the returned tuple.
+//
+fn main() {
+    // Release builds exit as soon as possible; faster and reduces impact of deadlock bugs.
     //
-    // https://doc.rust-lang.org/std/process/fn.exit.html
-    //
-    // See process_exitable() method of Subscriber trait for what triggers the exit_code.
+    #[cfg(not(debug_assertions))]
+    {
+        main_exitable();
+        panic!("Release build bubbled exit_code to main() vs. ending abruptly earlier");
+    }
+
+    // Debug builds exit as cleanly as they are able to, for Valgrind and sanity checking.
     //
-    let exit_code = main_exitable();
-    std::process::exit(i32::from(exit_code));
+    #[cfg(debug_assertions)]
+    {
+        let (seccomp_level, exit_code) = main_exitable();
+
+        if seccomp_level == SeccompLevel::None {
+            std::process::exit(i32::from(exit_code));  // includes Rust library cleanup
+        } else {
+            exit_firecracker_abruptly(exit_code);  // see notes on seccomp interaction
+        }
+    }
 }
 
 // Print supported snapshot data format versions.
diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs
@@ -227,6 +227,21 @@ pub(crate) fn mem_size_mib(guest_memory: &GuestMemoryMmap) -> u64 {
     guest_memory.map_and_fold(0, |(_, region)| region.len(), |a, b| a + b) >> 20
 }
 
+/// Ends the Firecracker process immediately with `exit_code`, without Rust library cleanup.
+///
+/// The default/recommended filter for production workloads is a tight whitelist that
+/// doesn't include some syscalls done by Rust during thread exit (rt_sigprocmask is
+/// the first to hit the filter).  So when the default seccomp filter is installed,
+/// calling Rust's `std::process::exit()` will result in a seccomp violation panic.
+///
+/// `libc::_exit()` is even more basic than `libc::exit()`, so it must be used when seccomp
+/// filtering is enabled, unless something else changes.  Debug builds (see main()) use
+/// `std::process::exit()` when seccomp filtering is off, to verify shutdown runs cleanly.
+pub fn exit_firecracker_abruptly(exit_code: ExitCode) -> ! {  // ! -> diverging function
+    // SAFETY: `_exit` is passed a plain integer status and does not return to this code.
+    unsafe { libc::_exit(i32::from(exit_code)) }
+}
+
 /// Contains the state and associated methods required for the Firecracker VMM.
 pub struct Vmm {
     events_observer: Option<Box<dyn VmmEventsObserver>>,
@@ -353,13 +368,14 @@ impl Vmm {
             .map_err(Error::I8042Error)
     }
 
-    /// Waits for all vCPUs to exit.  Does not terminate the Firecracker process.
-    /// (See notes in main() about why ExitCode is bubbled up for clean shutdown.)
-    pub fn stop(&mut self) {
+    /// This stops the VMM.  If it's a release build, this will exit the Firecracker process
+    /// entirely.  But debug builds enforce a higher level of cleanliness, and return an
+    /// exit code that the caller should bubble up to main().
+    ///
+    pub fn stop(&mut self, exit_code: ExitCode) -> ExitCode {
         info!("Vmm is stopping.");
 
-        self.exit_vcpus().unwrap();  // exit all not-already-exited VCPUs, join their threads
-
+        // Teardown the VMM (produces metrics we need to write out)
         if let Some(observer) = self.events_observer.as_mut() {
             if let Err(e) = observer.on_vmm_stop() {
                 warn!("{}", Error::VmmObserverTeardown(e));
@@ -370,6 +386,23 @@ impl Vmm {
         if let Err(e) = METRICS.write() {
             error!("Failed to write metrics while stopping: {}", e);
         }
+
+        // The release build exits here, to reduce the impact of shutdown deadlock bugs.
+        // It's also faster to let the OS do memory and resource cleanup, once semantically
+        // important shutdown (e.g. flushing and writing any open files) is done.
+        //
+        #[cfg(not(debug_assertions))]
+        exit_firecracker_abruptly(exit_code);
+
+        // The debug build will shut down in an orderly fashion, bubbling up the exit code all
+        // the way to main and joining all threads (including ending the API server gracefully).
+        //
+        #[cfg(debug_assertions)]
+        {
+            self.exit_vcpus().unwrap();  // exit all not-already-exited VCPUs, join their threads
+
+            exit_code
+        }
     }
 
     /// Saves the state of a paused Microvm.
@@ -777,6 +810,12 @@ impl Subscriber for Vmm {
                 }
             }
 
+            // If the exit_code can't be found on any vcpu, it means that the exit signal
+            // has been issued by the i8042 controller in which case we exit with
+            // FC_EXIT_CODE_OK.
+            //
+            let exit_code = opt_exit_code.unwrap_or(FC_EXIT_CODE_OK);
+
             // !!! The caller of this routine is receiving the exit code to bubble back up
             // to the main() function to return cleanly.  However, it does not have clean
             // access to the Vmm to shut it down (here we have it, since it is `self`).  It
@@ -787,13 +826,7 @@ impl Subscriber for Vmm {
             // that will actually work with an exit code (all other Subscriber trait
             // implementers must use process())
             //
-            self.stop();
-
-            // If the exit_code can't be found on any vcpu, it means that the exit signal
-            // has been issued by the i8042 controller in which case we exit with
-            // FC_EXIT_CODE_OK.
-            //
-            Some(opt_exit_code.unwrap_or(FC_EXIT_CODE_OK))
+            Some(self.stop(exit_code))  // exits abruptly if release build, else returns
         } else {
             error!("Spurious EventManager event for handler: Vmm");
             None