
Commit 6adc072

EtiennePerot authored and gvisor-bot committed
runsc: When mounting a new procfs fails, fall back to recursive bind-mount.
As part of sandbox startup, `runsc` needs to set up a chroot environment with a minimal working `procfs` filesystem mounted within. However, doing so from within a container (as applications like Dangerzone do) may fail, because in the container runtime's default configuration, some paths of the procfs filesystem visible from within the container may be obstructed. This prevents mounting new unobstructed instances of `procfs`.

This change detects this case and falls back to the previous behavior of using a recursive bind mount of `/proc`. The obstructed subdirectories of procfs are preserved in this case, which is fine because only a very minimal subset of `procfs` actually needs to work.

Additionally, `runsc` only needs a few kernel parameter files and `/proc/self` in order to work. So this change sets up a `tmpfs` mount that contains just those files, with the kernel parameter files being plainly copied and `/proc/self` being a symlink to the one present in the mounted view of `procfs` (regardless of which mounting method was used).

The `runtime_in_docker` test will continuously verify that this fallback mechanism works, to avoid similar breakage in the future.

Credits to @avagin for figuring out this solution.

Fixes #10944.

PiperOrigin-RevId: 692310347
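
The fix boils down to: try to mount a fresh procfs instance, and if that fails, recursively bind-mount the existing (possibly obstructed) `/proc` view. The real implementation is `setupMinimalProcfs` in `runsc/cmd/chroot.go`, shown in full in the diff below; the snippet here is only a standalone sketch of that try-then-fall-back pattern using raw `unix.Mount` calls, with a hypothetical `mountProcWithFallback` helper and none of runsc's chroot, tmpfs, or logging machinery.

package procmount

import (
	"fmt"

	"golang.org/x/sys/unix"
)

// mountProcWithFallback is an illustrative sketch, not runsc code: it first
// tries to mount a fresh procfs instance at target, and if that fails (for
// example because /proc is obstructed by submounts inside a container), it
// falls back to a recursive bind mount of the existing /proc view.
func mountProcWithFallback(target string) error {
	flags := uintptr(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
	procErr := unix.Mount("proc", target, "proc", flags, "")
	if procErr == nil {
		return nil
	}
	// Fall back to a recursive bind mount. Note that enforcing read-only on
	// a bind mount generally needs a separate MS_REMOUNT step, omitted here.
	if bindErr := unix.Mount("/proc", target, "", unix.MS_BIND|unix.MS_REC|flags, ""); bindErr != nil {
		return fmt.Errorf("bind-mounting /proc failed (%v) after mounting a new procfs instance also failed (%v)", bindErr, procErr)
	}
	return nil
}
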
1 parent 22d58ca commit 6adc072

File tree

7 files changed: +89 -20 lines changed


pkg/coretag/coretag.go

Lines changed: 7 additions & 1 deletion
@@ -41,6 +41,7 @@ func Enable() error {
 }
 
 // GetAllCoreTags returns the core tag of all the threads in the thread group.
+// PID 0 means the current pid.
 func GetAllCoreTags(pid int) ([]uint64, error) {
 	// prctl(PR_SCHED_CORE_GET, PR_SCHED_CORE_SCOPE_THREAD_GROUP, ...) is not supported
 	// in linux. So instead we get all threads from /proc/<pid>/task and get all the
@@ -75,9 +76,14 @@
 }
 
 // getTids returns set of tids as reported by /proc/<pid>/task.
+// PID 0 means the current PID.
 func getTids(pid int) (map[int]struct{}, error) {
 	tids := make(map[int]struct{})
-	files, err := os.ReadDir("/proc/" + strconv.Itoa(pid) + "/task")
+	path := "/proc/self/task"
+	if pid != 0 {
+		path = fmt.Sprintf("/proc/%d/task", pid)
+	}
+	files, err := os.ReadDir(path)
 	if err != nil {
 		return nil, err
 	}
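
With the `getTids` change above, passing `0` makes `GetAllCoreTags` read `/proc/self/task` instead of `/proc/<pid>/task`. A small illustrative caller, assuming a kernel with core scheduling support and that core tagging has already been enabled:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/coretag"
)

func main() {
	// PID 0 means "the current process": the lookup goes through
	// /proc/self/task rather than /proc/<pid>/task.
	tags, err := coretag.GetAllCoreTags(0)
	if err != nil {
		fmt.Println("core tags unavailable:", err)
		return
	}
	fmt.Println("core tags of the current thread group:", tags)
}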

pkg/coretag/coretag_test.go

Lines changed: 10 additions & 1 deletion
@@ -16,6 +16,7 @@ package coretag
 
 import (
 	"os"
+	"reflect"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/hostos"
@@ -36,11 +37,19 @@ func TestEnable(t *testing.T) {
 		t.Fatalf("Enable() got error %v, wanted nil", err)
 	}
 
-	coreTags, err := GetAllCoreTags(os.Getpid())
+	pid := os.Getpid()
+	coreTags, err := GetAllCoreTags(pid)
 	if err != nil {
 		t.Fatalf("GetAllCoreTags() got error %v, wanted nil", err)
 	}
 	if len(coreTags) != 1 {
 		t.Fatalf("Got coreTags %v, wanted len(coreTags)=1", coreTags)
 	}
+	coreTagsSelf, err := GetAllCoreTags(0)
+	if err != nil {
+		t.Fatalf("GetAllCoreTags(0) got error %v, wanted nil", err)
+	}
+	if !reflect.DeepEqual(coreTags, coreTagsSelf) {
+		t.Fatalf("Got different coreTags for PID %d vs self: %v vs %v", pid, coreTags, coreTagsSelf)
+	}
 }

runsc/boot/loader_test.go

Lines changed: 1 addition & 1 deletion
@@ -245,7 +245,7 @@ func TestStartSignal(t *testing.T) {
 func TestHostnetWithRawSockets(t *testing.T) {
 	// Drop CAP_NET_RAW from effective capabilities, if we have it.
 	pid := os.Getpid()
-	caps, err := capability.NewPid2(os.Getpid())
+	caps, err := capability.NewPid2(0)
 	if err != nil {
 		t.Fatalf("error getting capabilities for pid %d: %v", pid, err)
 	}

runsc/cmd/boot.go

Lines changed: 1 addition & 1 deletion
@@ -471,7 +471,7 @@ func (b *Boot) Execute(_ context.Context, f *flag.FlagSet, args ...any) subcomma
 
 	// Verify that all sentry threads are properly core tagged, and log
 	// current core tag.
-	coreTags, err := coretag.GetAllCoreTags(os.Getpid())
+	coreTags, err := coretag.GetAllCoreTags(0)
 	if err != nil {
 		util.Fatalf("Failed read current core tags: %v", err)
 	}

runsc/cmd/chroot.go

Lines changed: 69 additions & 3 deletions
@@ -82,6 +82,73 @@ func copyFile(dst, src string) error {
 	return err
 }
 
+// setupMinimalProcfs creates a minimal procfs-like tree at `${chroot}/proc`.
+func setupMinimalProcfs(chroot string) error {
+	// We can't always directly mount procfs because it may be obstructed
+	// by submounts within it. See https://gvisor.dev/issue/10944.
+	// All we really need from procfs is /proc/self and a few kernel
+	// parameter files, which are typically not obstructed.
+	// So we create a tmpfs at /proc and manually copy the kernel parameter
+	// files into it. Then, to get /proc/self, we mount either a new
+	// instance of procfs (if possible), or a recursive bind mount of the
+	// procfs we do have access to (which still contains the obstructed
+	// submounts but /proc/self is not obstructed), and we symlink
+	// our /proc/self to the one in that mount.
+	//
+	// Why not try to mount the new procfs instance at /proc directly?
+	// Because that would cause the set of files at /proc to differ
+	// between the "new procfs instance" case and the "recursive bind
+	// mount" case. Thus, this could introduce a bug whereby gVisor starts
+	// to depend on a /proc file that is present in one case but not the
+	// other, without decent test coverage to catch it.
+	procRoot := filepath.Join(chroot, "/proc")
+	if err := os.Mkdir(procRoot, 0755); err != nil {
+		return fmt.Errorf("error creating /proc in chroot: %v", err)
+	}
+	if err := specutils.SafeMount("runsc-proc", procRoot, "tmpfs",
+		unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, "", "/proc"); err != nil {
+		return fmt.Errorf("error mounting tmpfs in /proc: %v", err)
+	}
+	flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY)
+	procSubmountDir := "sandbox-proc"
+	if newProcfsErr := mountInChroot(chroot, "proc", "/proc/"+procSubmountDir, "proc", flags); newProcfsErr != nil {
+		log.Debugf("Unable to mount a new instance of the procfs file system at %q (%v); trying a recursive bind mount instead.", filepath.Join(procRoot, procSubmountDir), newProcfsErr)
+		procSubmountDir = "host-proc"
+		if bindErr := mountInChroot(chroot, "/proc", "/proc/"+procSubmountDir, "bind",
+			unix.MS_BIND|unix.MS_REC|flags); bindErr != nil {
+			return fmt.Errorf("error recursively bind-mounting proc at %q (%w) after also failing to mount a new procfs instance there (%v)", filepath.Join(procRoot, procSubmountDir), bindErr, newProcfsErr)
+		}
+		log.Debugf("Successfully mounted a recursive bind mount of procfs at %q; continuing.", filepath.Join(procRoot, procSubmountDir))
+	}
+	// Create needed directories.
+	for _, d := range []string{
+		"/proc/sys",
+		"/proc/sys/kernel",
+		"/proc/sys/vm",
+	} {
+		if err := os.Mkdir(filepath.Join(chroot, d), 0755); err != nil {
+			return fmt.Errorf("error creating directory %q: %v", filepath.Join(chroot, d), err)
+		}
+	}
+	// Copy needed files.
+	for _, f := range []string{
+		"/proc/sys/vm/mmap_min_addr",
+		"/proc/sys/kernel/cap_last_cap",
+	} {
+		if err := copyFile(filepath.Join(chroot, f), f); err != nil {
+			return fmt.Errorf("failed to copy %q -> %q: %w", f, filepath.Join(chroot, f), err)
+		}
+	}
+	// Create symlink for /proc/self.
+	if err := os.Symlink(procSubmountDir+"/self", filepath.Join(procRoot, "self")); err != nil {
+		return fmt.Errorf("error creating symlink %q -> %q: %w", filepath.Join(procRoot, "self"), procSubmountDir+"/self", err)
+	}
+	if err := os.Chmod(procRoot, 0o111); err != nil {
+		return fmt.Errorf("error chmodding %q: %v", procRoot, err)
+	}
+	return nil
+}
+
 // setUpChroot creates an empty directory with runsc mounted at /runsc and proc
 // mounted at /proc.
 func setUpChroot(spec *specs.Spec, conf *config.Config) error {
@@ -109,9 +176,8 @@ func setUpChroot(spec *specs.Spec, conf *config.Config) error {
 		log.Warningf("Failed to copy /etc/localtime: %v. UTC timezone will be used.", err)
 	}
 
-	flags := uint32(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_RDONLY)
-	if err := mountInChroot(chroot, "proc", "/proc", "proc", flags); err != nil {
-		return fmt.Errorf("error mounting proc in chroot: %v", err)
+	if err := setupMinimalProcfs(chroot); err != nil {
+		return fmt.Errorf("error setting up minimal procfs in chroot %q: %v", chroot, err)
 	}
 
 	if err := tpuProxyUpdateChroot("/", chroot, spec, conf); err != nil {
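
The net effect is that `${chroot}/proc` ends up as a tmpfs containing either a `sandbox-proc/` (fresh procfs) or `host-proc/` (recursive bind mount) submount, a `self` symlink pointing into whichever of the two was mounted, and copies of the two kernel parameter files. The helper below is a hypothetical sanity check written for illustration only (it is not part of this commit) that spells out that layout:

package chrootcheck

import (
	"fmt"
	"os"
	"path/filepath"
)

// verifyMinimalProcfs is a hypothetical check, not runsc code: it asserts the
// layout that setupMinimalProcfs is expected to produce under ${chroot}/proc.
func verifyMinimalProcfs(chroot string) error {
	procRoot := filepath.Join(chroot, "proc")
	// /proc/self must be a symlink into the mounted procfs view.
	link, err := os.Readlink(filepath.Join(procRoot, "self"))
	if err != nil {
		return fmt.Errorf("missing /proc/self symlink: %w", err)
	}
	if link != "sandbox-proc/self" && link != "host-proc/self" {
		return fmt.Errorf("unexpected /proc/self target %q", link)
	}
	// The copied kernel parameter files must exist in the tmpfs.
	for _, f := range []string{"sys/vm/mmap_min_addr", "sys/kernel/cap_last_cap"} {
		if _, err := os.Stat(filepath.Join(procRoot, f)); err != nil {
			return fmt.Errorf("missing kernel parameter file %q: %w", f, err)
		}
	}
	return nil
}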

runsc/specutils/namespace.go

Lines changed: 1 addition & 1 deletion
@@ -214,7 +214,7 @@ func SetUIDGIDMappings(cmd *exec.Cmd, s *specs.Spec) {
 
 // HasCapabilities returns true if the user has all capabilities in 'cs'.
 func HasCapabilities(cs ...capability.Cap) bool {
-	caps, err := capability.NewPid2(os.Getpid())
+	caps, err := capability.NewPid2(0)
 	if err != nil {
 		return false
 	}
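
`capability` here refers to the `github.com/syndtr/gocapability/capability` package, and `NewPid2(0)` queries the calling process directly, so the extra `os.Getpid()` call is unnecessary. A small standalone sketch of the same pattern (a hypothetical program, not runsc code):

package main

import (
	"fmt"

	"github.com/syndtr/gocapability/capability"
)

func main() {
	// Pid 0 refers to the calling process, so no PID lookup is needed.
	caps, err := capability.NewPid2(0)
	if err != nil {
		fmt.Println("cannot query capabilities:", err)
		return
	}
	// NewPid2 does not load the capability sets; Load must be called first.
	if err := caps.Load(); err != nil {
		fmt.Println("cannot load capabilities:", err)
		return
	}
	fmt.Println("has CAP_NET_RAW:", caps.Get(capability.EFFECTIVE, capability.CAP_NET_RAW))
}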

test/e2e/runtime_in_docker_test.go

Lines changed: 0 additions & 12 deletions
@@ -74,18 +74,6 @@ func (test testVariant) run(ctx context.Context, logger testutil.Logger, runscPa
 			ReadOnly: false,
 		})
 	}
-	// Mount an unobstructed view of procfs at /proc2 so that the runtime
-	// can mount a fresh procfs.
-	// TODO(gvisor.dev/issue/10944): Remove this once issue is fixed.
-	opts.Mounts = append(opts.Mounts, mount.Mount{
-		Type:     mount.TypeBind,
-		Source:   "/proc",
-		Target:   "/proc2",
-		ReadOnly: false,
-		BindOptions: &mount.BindOptions{
-			NonRecursive: true,
-		},
-	})
 	const wantMessage = "It became a jumble of words, a litany, almost a kind of glossolalia."
 	args := []string{
 		"/runtime",
