Skip to content

Commit 3f4b0d6

Browse files
committed
userspace reboot: stop post-data services and wait for them to be killed
* Refactored code around stopping services a little bit to reuse it between full reboot and userspace reboot.
* Add a scope_guard to fall back to full reboot in case userspace reboot fails.
* In case of userspace reboot, init will also wait for services to be terminated/killed and log the ones that didn't react to SIGTERM/SIGKILL in time.
* If some of the services didn't react to SIGKILL, fail userspace reboot.

Test: adb reboot userspace
Bug: 135984674
Change-Id: I820c7bc406169333b0f929f0eea028d8384eb2ac
1 parent d11c6f7 commit 3f4b0d6

File tree

5 files changed

+157
-63
lines changed

5 files changed

+157
-63
lines changed

init/reboot.cpp

+116-54
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <linux/loop.h>
2323
#include <mntent.h>
2424
#include <semaphore.h>
25+
#include <stdlib.h>
2526
#include <sys/cdefs.h>
2627
#include <sys/ioctl.h>
2728
#include <sys/mount.h>
@@ -31,6 +32,7 @@
3132
#include <sys/types.h>
3233
#include <sys/wait.h>
3334

35+
#include <chrono>
3436
#include <memory>
3537
#include <set>
3638
#include <thread>
@@ -41,6 +43,7 @@
4143
#include <android-base/logging.h>
4244
#include <android-base/macros.h>
4345
#include <android-base/properties.h>
46+
#include <android-base/scopeguard.h>
4447
#include <android-base/strings.h>
4548
#include <android-base/unique_fd.h>
4649
#include <bootloader_message/bootloader_message.h>
@@ -59,6 +62,7 @@
5962
#include "service.h"
6063
#include "service_list.h"
6164
#include "sigchld_handler.h"
65+
#include "util.h"
6266

6367
#define PROC_SYSRQ "/proc/sysrq-trigger"
6468

@@ -75,6 +79,19 @@ namespace init {
7579

7680
static bool shutting_down = false;
7781

82+
static const std::set<std::string> kDebuggingServices{"tombstoned", "logd", "adbd", "console"};
83+
// Returns the services from ServiceList whose name() is listed in kDebuggingServices.
// If |only_post_data| is true, a service is included only when its is_post_data() is also true.
// NOTE(review): the returned pointers come from s.get(); presumably ServiceList keeps
// ownership of the services — confirm before storing them long-term.
84+
static std::vector<Service*> GetDebuggingServices(bool only_post_data) {
85+
std::vector<Service*> ret;
86+
ret.reserve(kDebuggingServices.size());
87+
for (const auto& s : ServiceList::GetInstance()) {
88+
if (kDebuggingServices.count(s->name()) && (!only_post_data || s->is_post_data())) {
89+
ret.push_back(s.get());
90+
}
91+
}
92+
return ret;
93+
}
94+
7895
// represents umount status during reboot / shutdown.
7996
enum UmountStat {
8097
/* umount succeeded. */
@@ -446,6 +463,49 @@ static void KillZramBackingDevice() {
446463
LOG(INFO) << "zram_backing_dev: `" << backing_dev << "` is cleared successfully.";
447464
}
448465

466+
// Stops the given services, then waits up to |timeout| for their pids to be reaped.
467+
// If |terminate| is true, Terminate() is called on each service (logged as SIGTERM);
// otherwise Stop() is called (logged as SIGKILL).
// If |timeout| is 0ms or less there is no wait: only a single ReapAnyOutstandingChildren()
// pass is made.
468+
static void StopServices(const std::vector<Service*>& services, std::chrono::milliseconds timeout,
469+
bool terminate) {
470+
LOG(INFO) << "Stopping " << services.size() << " services by sending "
471+
<< (terminate ? "SIGTERM" : "SIGKILL");
472+
std::vector<pid_t> pids;
473+
pids.reserve(services.size());
474+
for (const auto& s : services) {
// Only services with a positive pid are waited on; the stop call below is still issued
// for every service in the list.
475+
if (s->pid() > 0) {
476+
pids.push_back(s->pid());
477+
}
478+
if (terminate) {
479+
s->Terminate();
480+
} else {
481+
s->Stop();
482+
}
483+
}
484+
if (timeout > 0ms) {
485+
WaitToBeReaped(pids, timeout);
486+
} else {
487+
// Even if we don't want to wait for services to stop, we still optimistically reap zombies.
488+
ReapAnyOutstandingChildren();
489+
}
490+
}
491+
492+
// Like StopServices, but afterwards logs every service for which IsRunning() is still true.
493+
// Returns the number of such services (the violators).
// The logged duration is the |timeout| value that was passed in, not a measured time.
494+
static int StopServicesAndLogViolations(const std::vector<Service*>& services,
495+
std::chrono::milliseconds timeout, bool terminate) {
496+
StopServices(services, timeout, terminate);
497+
int still_running = 0;
498+
for (const auto& s : services) {
499+
if (s->IsRunning()) {
500+
LOG(ERROR) << "[service-misbehaving] : service '" << s->name() << "' is still running "
501+
<< timeout.count() << "ms after receiving "
502+
<< (terminate ? "SIGTERM" : "SIGKILL");
503+
still_running++;
504+
}
505+
}
506+
return still_running;
507+
}
508+
449509
//* Reboot / shutdown the system.
450510
// cmd ANDROID_RB_* as defined in android_reboot.h
451511
// reason Reason string like "reboot", "shutdown,userrequested"
@@ -510,12 +570,13 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
510570
// Start reboot monitor thread
511571
sem_post(&reboot_semaphore);
512572

513-
// keep debugging tools until non critical ones are all gone.
514-
const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
515573
// watchdogd is a vendor specific component but should be alive to complete shutdown safely.
516574
const std::set<std::string> to_starts{"watchdogd"};
575+
std::vector<Service*> stop_first;
576+
stop_first.reserve(ServiceList::GetInstance().services().size());
517577
for (const auto& s : ServiceList::GetInstance()) {
518-
if (kill_after_apps.count(s->name())) {
578+
if (kDebuggingServices.count(s->name())) {
579+
// keep debugging tools until non critical ones are all gone.
519580
s->SetShutdownCritical();
520581
} else if (to_starts.count(s->name())) {
521582
if (auto result = s->Start(); !result) {
@@ -529,6 +590,8 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
529590
LOG(ERROR) << "Could not start shutdown critical service '" << s->name()
530591
<< "': " << result.error();
531592
}
593+
} else {
594+
stop_first.push_back(s.get());
532595
}
533596
}
534597

@@ -571,49 +634,12 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
571634
// optional shutdown step
572635
// 1. terminate all services except shutdown critical ones. wait for delay to finish
573636
if (shutdown_timeout > 0ms) {
574-
LOG(INFO) << "terminating init services";
575-
576-
// Ask all services to terminate except shutdown critical ones.
577-
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
578-
if (!s->IsShutdownCritical()) s->Terminate();
579-
}
580-
581-
int service_count = 0;
582-
// Only wait up to half of timeout here
583-
auto termination_wait_timeout = shutdown_timeout / 2;
584-
while (t.duration() < termination_wait_timeout) {
585-
ReapAnyOutstandingChildren();
586-
587-
service_count = 0;
588-
for (const auto& s : ServiceList::GetInstance()) {
589-
// Count the number of services running except shutdown critical.
590-
// Exclude the console as it will ignore the SIGTERM signal
591-
// and not exit.
592-
// Note: SVC_CONSOLE actually means "requires console" but
593-
// it is only used by the shell.
594-
if (!s->IsShutdownCritical() && s->pid() != 0 && (s->flags() & SVC_CONSOLE) == 0) {
595-
service_count++;
596-
}
597-
}
598-
599-
if (service_count == 0) {
600-
// All terminable services terminated. We can exit early.
601-
break;
602-
}
603-
604-
// Wait a bit before recounting the number or running services.
605-
std::this_thread::sleep_for(50ms);
606-
}
607-
LOG(INFO) << "Terminating running services took " << t
608-
<< " with remaining services:" << service_count;
609-
}
610-
611-
// minimum safety steps before restarting
612-
// 2. kill all services except ones that are necessary for the shutdown sequence.
613-
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
614-
if (!s->IsShutdownCritical()) s->Stop();
637+
StopServicesAndLogViolations(stop_first, shutdown_timeout / 2, true /* SIGTERM */);
615638
}
639+
// Send SIGKILL to ones that didn't terminate cleanly.
640+
StopServicesAndLogViolations(stop_first, 0ms, false /* SIGKILL */);
616641
SubcontextTerminate();
642+
// Reap subcontext pids.
617643
ReapAnyOutstandingChildren();
618644

619645
// 3. send volume shutdown to vold
@@ -625,9 +651,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
625651
LOG(INFO) << "vold not running, skipping vold shutdown";
626652
}
627653
// logcat stopped here
628-
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
629-
if (kill_after_apps.count(s->name())) s->Stop();
630-
}
654+
StopServices(GetDebuggingServices(false /* only_post_data */), 0ms, false /* SIGKILL */);
631655
// 4. sync, try umount, and optionally run fsck for user shutdown
632656
{
633657
Timer sync_timer;
@@ -660,6 +684,7 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
660684
}
661685

662686
static void EnterShutdown() {
687+
LOG(INFO) << "Entering shutdown mode";
663688
shutting_down = true;
664689
// Skip wait for prop if it is in progress
665690
ResetWaitForProp();
@@ -675,32 +700,69 @@ static void EnterShutdown() {
675700
}
676701

677702
static void LeaveShutdown() {
703+
LOG(INFO) << "Leaving shutdown mode";
678704
shutting_down = false;
679705
SendStartSendingMessagesMessage();
680706
}
681707

682-
static void DoUserspaceReboot() {
708+
static Result<void> DoUserspaceReboot() {
709+
LOG(INFO) << "Userspace reboot initiated";
710+
auto guard = android::base::make_scope_guard([] {
711+
// Leave shutdown so that we can handle a full reboot.
712+
LeaveShutdown();
713+
property_set("sys.powerctl", "reboot,abort-userspace-reboot");
714+
});
683715
// Triggering userspace-reboot-requested will result in a bunch of set_prop
684716
// actions. We should make sure, that all of them are propagated before
685717
// proceeding with userspace reboot.
686718
// TODO(b/135984674): implement proper synchronization logic.
687719
std::this_thread::sleep_for(500ms);
688720
EnterShutdown();
689-
// TODO(b/135984674): tear down post-data services
690-
LeaveShutdown();
721+
std::vector<Service*> stop_first;
722+
// Remember the services that were enabled. We will need to manually enable them again otherwise
723+
// triggers like class_start won't restart them.
724+
std::vector<Service*> were_enabled;
725+
stop_first.reserve(ServiceList::GetInstance().services().size());
726+
for (const auto& s : ServiceList::GetInstance().services_in_shutdown_order()) {
727+
if (s->is_post_data() && !kDebuggingServices.count(s->name())) {
728+
stop_first.push_back(s);
729+
}
730+
if (s->is_post_data() && s->IsEnabled()) {
731+
were_enabled.push_back(s);
732+
}
733+
}
734+
// TODO(b/135984674): do we need shutdown animation for userspace reboot?
735+
// TODO(b/135984674): control userspace timeout via read-only property?
736+
StopServicesAndLogViolations(stop_first, 10s, true /* SIGTERM */);
737+
if (int r = StopServicesAndLogViolations(stop_first, 20s, false /* SIGKILL */); r > 0) {
738+
// TODO(b/135984674): store information about offending services for debugging.
739+
return Error() << r << " post-data services are still running";
740+
}
691741
// TODO(b/135984674): remount userdata
742+
if (int r = StopServicesAndLogViolations(GetDebuggingServices(true /* only_post_data */), 5s,
743+
false /* SIGKILL */);
744+
r > 0) {
745+
// TODO(b/135984674): store information about offending services for debugging.
746+
return Error() << r << " debugging services are still running";
747+
}
748+
// TODO(b/135984674): deactivate APEX modules and switch back to bootstrap namespace.
749+
// Re-enable services
750+
for (const auto& s : were_enabled) {
751+
LOG(INFO) << "Re-enabling service '" << s->name() << "'";
752+
s->Enable();
753+
}
754+
LeaveShutdown();
692755
ActionManager::GetInstance().QueueEventTrigger("userspace-reboot-resume");
756+
guard.Disable(); // Go on with userspace reboot.
757+
return {};
693758
}
694759

695760
static void HandleUserspaceReboot() {
696761
LOG(INFO) << "Clearing queue and starting userspace-reboot-requested trigger";
697762
auto& am = ActionManager::GetInstance();
698763
am.ClearQueue();
699764
am.QueueEventTrigger("userspace-reboot-requested");
700-
auto handler = [](const BuiltinArguments&) {
701-
DoUserspaceReboot();
702-
return Result<void>{};
703-
};
765+
auto handler = [](const BuiltinArguments&) { return DoUserspaceReboot(); };
704766
am.QueueBuiltinAction(handler, "userspace-reboot");
705767
}
706768

init/service.h

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ class Service {
7575
const std::vector<std::string>& args);
7676

7777
bool IsRunning() { return (flags_ & SVC_RUNNING) != 0; }
78+
bool IsEnabled() { return (flags_ & SVC_DISABLED) == 0; }
7879
Result<void> ExecStart();
7980
Result<void> Start();
8081
Result<void> StartIfNotDisabled();

init/sigchld_handler.cpp

+30-7
Original file line numberDiff line numberDiff line change
@@ -28,28 +28,31 @@
2828
#include <android-base/scopeguard.h>
2929
#include <android-base/stringprintf.h>
3030

31+
#include <thread>
32+
3133
#include "init.h"
3234
#include "service.h"
3335
#include "service_list.h"
3436

35-
using android::base::StringPrintf;
3637
using android::base::boot_clock;
3738
using android::base::make_scope_guard;
39+
using android::base::StringPrintf;
40+
using android::base::Timer;
3841

3942
namespace android {
4043
namespace init {
4144

42-
static bool ReapOneProcess() {
45+
static pid_t ReapOneProcess() {
4346
siginfo_t siginfo = {};
4447
// This returns a zombie pid or informs us that there are no zombies left to be reaped.
4548
// It does NOT reap the pid; that is done below.
4649
if (TEMP_FAILURE_RETRY(waitid(P_ALL, 0, &siginfo, WEXITED | WNOHANG | WNOWAIT)) != 0) {
4750
PLOG(ERROR) << "waitid failed";
48-
return false;
51+
return 0;
4952
}
5053

5154
auto pid = siginfo.si_pid;
52-
if (pid == 0) return false;
55+
if (pid == 0) return 0;
5356

5457
// At this point we know we have a zombie pid, so we use this scopeguard to reap the pid
5558
// whenever the function returns from this point forward.
@@ -92,20 +95,40 @@ static bool ReapOneProcess() {
9295
LOG(INFO) << name << " received signal " << siginfo.si_status << wait_string;
9396
}
9497

95-
if (!service) return true;
98+
if (!service) return pid;
9699

97100
service->Reap(siginfo);
98101

99102
if (service->flags() & SVC_TEMPORARY) {
100103
ServiceList::GetInstance().RemoveService(*service);
101104
}
102105

103-
return true;
106+
return pid;
104107
}
105108

106109
void ReapAnyOutstandingChildren() {
107-
while (ReapOneProcess()) {
110+
while (ReapOneProcess() != 0) {
111+
}
112+
}
113+
// Reaps exited children via ReapOneProcess() until every pid in |pids| has been reaped or
// |timeout| has elapsed, then logs how long that took and how many of the pids were not reaped.
// Any child returned by ReapOneProcess() is reaped, but only pids present in |pids| affect the
// bookkeeping; other pids are ignored.
114+
void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout) {
115+
Timer t;
116+
std::vector<pid_t> alive_pids(pids.begin(), pids.end());
117+
while (!alive_pids.empty() && t.duration() < timeout) {
118+
pid_t pid;
// Drain everything that can be reaped right now; ReapOneProcess() returning 0 ends this pass.
119+
while ((pid = ReapOneProcess()) != 0) {
120+
auto it = std::find(alive_pids.begin(), alive_pids.end(), pid);
121+
if (it != alive_pids.end()) {
122+
alive_pids.erase(it);
123+
}
124+
}
125+
if (alive_pids.empty()) {
126+
break;
127+
}
// Poll again after a short pause rather than spinning.
128+
std::this_thread::sleep_for(50ms);
108129
}
130+
LOG(INFO) << "Waiting for " << pids.size() << " pids to be reaped took " << t << " with "
131+
<< alive_pids.size() << " of them still running";
109132
}
110133

111134
} // namespace init

init/sigchld_handler.h

+5
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,16 @@
1717
#ifndef _INIT_SIGCHLD_HANDLER_H_
1818
#define _INIT_SIGCHLD_HANDLER_H_
1919

20+
#include <chrono>
21+
#include <vector>
22+
2023
namespace android {
2124
namespace init {
2225

2326
void ReapAnyOutstandingChildren();
2427

28+
void WaitToBeReaped(const std::vector<pid_t>& pids, std::chrono::milliseconds timeout);
29+
2530
} // namespace init
2631
} // namespace android
2732

rootdir/init.rc

+5-2
Original file line numberDiff line numberDiff line change
@@ -918,11 +918,14 @@ on property:ro.debuggable=1
918918
on init && property:ro.debuggable=1
919919
start console
920920

921-
on userspace-reboot:
921+
on userspace-reboot
922922
# TODO(b/135984674): reset all necessary properties here.
923923
setprop sys.init.userspace_reboot_in_progress 1
924+
setprop sys.boot_completed 0
925+
setprop sys.init.updatable_crashing 0
926+
setprop apexd.status 0
924927

925-
on userspace-reboot-resume:
928+
on userspace-reboot-resume
926929
# TODO(b/135984674): remount userdata and reset checkpointing
927930
trigger nonencrypted
928931
trigger post-fs-data

0 commit comments

Comments
 (0)