forked from spkenv/spk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenv.rs
1226 lines (1111 loc) · 48.1 KB
/
env.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright (c) Contributors to the SPK project.
// SPDX-License-Identifier: Apache-2.0
// https://github.com/spkenv/spk
//! Functions related to the setup and teardown of the spfs runtime environment
//! and related system namespacing
use std::path::{Path, PathBuf};
use super::runtime;
use crate::{which, Error, Result};
/// Absolute path of the directory that the spfs filesystem is mounted onto.
pub const SPFS_DIR: &str = "/spfs";
/// [`SPFS_DIR`] followed by a trailing slash, used to test whether a
/// bind-mount destination already points inside the spfs directory.
pub const SPFS_DIR_PREFIX: &str = "/spfs/";
// A typed `None`, passed for the unused optional arguments of
// `nix::mount::mount` in this file.
const NONE: Option<&str> = None;
/// Manages the configuration of an spfs runtime environment.
///
/// Specifically, things like privilege escalation, mount namespaces,
/// filesystem mounts, etc.
///
/// The two type parameters track, at compile time, which user the process
/// is currently acting as (`User`) and whether a mount namespace has been
/// entered (`MountNamespace`). Operations are only available on the
/// combinations of states for which they are valid.
///
/// This type is explicitly not [`Send`] and not [`Sync`] because changes
/// being made often only affect the current thread (eg: unshare).
//
// NOTE(review): no marker field is visible on this struct itself; the
// `!Send`/`!Sync` property appears to come only from the `MountNamespace`
// parameter (e.g. `ThreadIsInMountNamespace`) - confirm.
pub struct RuntimeConfigurator<User = NoUserChanges, MountNamespace = NoMountNamespace> {
// State describing the user that the process is currently acting as.
user: User,
// State describing the mount namespace, if any, that has been entered.
ns: MountNamespace,
}
/// Signifies that the effective user has not been modified.
///
/// This is the default `User` state of a [`RuntimeConfigurator`].
pub struct NoUserChanges;
/// Signifies that the mount namespace has not been modified.
///
/// This is the default `MountNamespace` state of a [`RuntimeConfigurator`].
pub struct NoMountNamespace;
/// Signifies that the process has become root.
///
/// The ids recorded here are the ones that are restored when root is
/// dropped again.
pub struct IsRootUser {
/// The real user id that was read before the real uid was set to root.
pub original_uid: nix::unistd::Uid,
/// The effective user id that was read before the effective uid was set
/// to root.
pub original_euid: nix::unistd::Uid,
}
/// Signifies that the process became root and has since dropped it.
///
/// Once it drops root, a process can never regain it.
pub struct IsNonRootUser;
/// Structure representing that the current thread has been moved into a new
/// mount namespace, or was already in one and it has been validated.
///
/// Any function that expects an instance of this struct is intended to
/// perform all file I/O from inside the mount namespace. Not all threads of
/// the current process may be inside the mount namespace, so care must be
/// taken to avoid running tasks on threads that are not known to be in the
/// mount namespace. For example, using `tokio::task::spawn_blocking` or any
/// file IO functions from tokio may run on a thread from tokio's thread pool
/// that is not in the mount namespace.
///
/// This struct is `!Send` and `!Sync` to prevent it from being moved to or
/// referenced from a different thread where the mount namespace may be
/// different.
pub struct ThreadIsInMountNamespace {
/// The path to the mount namespace this struct represents.
pub mount_ns: std::path::PathBuf,
// Marker fields wrapping `PhantomData`; presumably these are what make
// this type `!Send` and `!Sync` (their definitions are outside this view).
_not_send: NotSendMarker,
_not_sync: NotSyncMarker,
}
impl ThreadIsInMountNamespace {
/// Create a new guard without moving into a new mount namespace.
///
/// # Safety
///
/// This reads the existing mount namespace of the calling thread and it
/// is assumed the caller is already in a new mount namespace.
pub unsafe fn existing() -> Result<Self> {
Ok(ThreadIsInMountNamespace {
mount_ns: std::fs::read_link(format!(
"/proc/{}/task/{}/ns/mnt",
std::process::id(),
nix::unistd::gettid()
))
.map_err(|err| Error::String(format!("Failed to read mount namespace: {err}")))?,
_not_send: NotSendMarker(std::marker::PhantomData),
_not_sync: NotSyncMarker(std::marker::PhantomData),
})
}
}
/// Structure representing that all threads of the process have been moved
/// into a new mount namespace, or were already in one and it has been
/// validated.
///
/// Unlike [`ThreadIsInMountNamespace`], this struct is `Send` and `Sync` and
/// it is safe to use `tokio::task::spawn_blocking` and tokio file IO.
pub struct ProcessIsInMountNamespace {
/// The path to the mount namespace this struct represents.
pub mount_ns: std::path::PathBuf,
}
impl ProcessIsInMountNamespace {
/// Create a new guard without moving into a new mount namespace.
///
/// # Safety
///
/// This reads the existing mount namespace of the calling thread and it
/// is assumed the caller is already in a new mount namespace.
pub unsafe fn existing() -> Result<Self> {
Ok(ProcessIsInMountNamespace {
mount_ns: std::fs::read_link("/proc/self/ns/mnt")
.map_err(|err| Error::String(format!("Failed to read mount namespace: {err}")))?,
})
}
}
mod __private {
    use super::{ProcessIsInMountNamespace, ThreadIsInMountNamespace};

    /// Implemented by guards which show that the current thread is in a
    /// mount namespace: both [`ThreadIsInMountNamespace`] and
    /// [`ProcessIsInMountNamespace`].
    pub trait CurrentThreadIsInMountNamespace {
        /// The path to the mount namespace of the current thread
        fn mount_ns(&self) -> &std::path::Path;
    }

    /// Marker trait for guards which show that the whole process is in a
    /// mount namespace: only [`ProcessIsInMountNamespace`].
    pub trait CurrentProcessIsInMountNamespace: CurrentThreadIsInMountNamespace {}

    impl CurrentThreadIsInMountNamespace for ThreadIsInMountNamespace {
        #[inline]
        fn mount_ns(&self) -> &std::path::Path {
            self.mount_ns.as_path()
        }
    }

    impl CurrentThreadIsInMountNamespace for ProcessIsInMountNamespace {
        #[inline]
        fn mount_ns(&self) -> &std::path::Path {
            self.mount_ns.as_path()
        }
    }

    impl CurrentProcessIsInMountNamespace for ProcessIsInMountNamespace {}
}
impl<User, MountNamespace> RuntimeConfigurator<User, MountNamespace> {
fn new(user: User, ns: MountNamespace) -> Self {
Self { user, ns }
}
}
impl Default for RuntimeConfigurator<NoUserChanges, NoMountNamespace> {
fn default() -> Self {
Self::new(NoUserChanges, NoMountNamespace)
}
}
impl<MountNamespace> RuntimeConfigurator<NoUserChanges, MountNamespace> {
/// Escalate the current process' privileges, becoming root
pub fn become_root(self) -> Result<RuntimeConfigurator<IsRootUser, MountNamespace>> {
tracing::debug!("becoming root...");
let original_euid = nix::unistd::geteuid();
if let Err(err) = nix::unistd::seteuid(nix::unistd::Uid::from_raw(0)) {
return Err(Error::wrap_nix(
err,
"Failed to become root user (effective)",
));
}
let original_uid = nix::unistd::getuid();
if let Err(err) = nix::unistd::setuid(nix::unistd::Uid::from_raw(0)) {
return Err(Error::wrap_nix(err, "Failed to become root user (actual)"));
}
Ok(RuntimeConfigurator::new(
IsRootUser {
original_euid,
original_uid,
},
self.ns,
))
}
}
impl<User> RuntimeConfigurator<User, NoMountNamespace> {
/// Enter a new mount namespace and return a guard that represents the thread
/// that is in the new namespace.
pub fn enter_mount_namespace(
self,
) -> Result<RuntimeConfigurator<User, ThreadIsInMountNamespace>> {
tracing::debug!("entering mount namespace...");
if let Err(err) = nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNS) {
return Err(Error::wrap_nix(err, "Failed to enter mount namespace"));
}
// Safety: we just moved the thread into a new mount namespace.
let ns = unsafe { ThreadIsInMountNamespace::existing() }?;
Ok(RuntimeConfigurator::new(self.user, ns))
}
/// Make this configurator for an existing runtime.
///
/// The calling thread must already be operating in the provided runtime.
/// This is validated by comparing the mount namespace recorded in the
/// runtime's config with the one found at `/proc/self/ns/mnt`. On success,
/// the `SPFS_RUNTIME` environment variable is set to the runtime's name.
///
/// # Errors
///
/// - [`Error::NoActiveRuntime`] if the runtime has no recorded mount namespace
/// - if the current mount namespace cannot be read
/// - if the current mount namespace is not the one recorded for the runtime
pub fn current_runtime(
    self,
    rt: &runtime::Runtime,
) -> Result<RuntimeConfigurator<User, ProcessIsInMountNamespace>> {
    let Some(runtime_ns) = &rt.config.mount_namespace else {
        return Err(Error::NoActiveRuntime);
    };
    // Safety: we are going to validate that this is the
    // expected namespace for the provided runtime and so
    // is considered to be a valid spfs mount namespace
    let current_ns = unsafe { ProcessIsInMountNamespace::existing() }?;
    if runtime_ns != &current_ns.mount_ns {
        return Err(Error::String(format!(
            "Current runtime does not match expected: {runtime_ns:?} != {:?}",
            current_ns.mount_ns
        )));
    }
    std::env::set_var("SPFS_RUNTIME", rt.name());
    Ok(RuntimeConfigurator::new(self.user, current_ns))
}
/// Move this process into the namespace of an existing runtime
///
/// This function will fail if called from a process with multiple threads.
pub fn join_runtime(
self,
rt: &runtime::Runtime,
) -> Result<RuntimeConfigurator<User, ThreadIsInMountNamespace>> {
check_can_join()?;
let pid = match rt.status.owner {
None => return Err(Error::RuntimeNotInitialized(rt.name().into())),
Some(pid) => pid,
};
let ns_path = std::path::Path::new("/proc")
.join(pid.to_string())
.join("ns/mnt");
tracing::debug!(?ns_path, "Getting process namespace");
let file = match std::fs::File::open(&ns_path) {
Ok(file) => file,
Err(err) => {
return match err.kind() {
std::io::ErrorKind::NotFound => Err(Error::UnknownRuntime {
runtime: rt.name().into(),
source: Box::new(err),
}),
_ => Err(Error::RuntimeReadError(ns_path, err)),
}
}
};
if let Err(err) = nix::sched::setns(file, nix::sched::CloneFlags::empty()) {
return Err(match err {
nix::errno::Errno::EPERM => Error::new_errno(
libc::EPERM,
"spfs binary was not installed with required capabilities",
),
_ => err.into(),
});
}
std::env::set_var("SPFS_RUNTIME", rt.name());
// Safety: we've just entered an existing mount namespace
let ns = unsafe { ThreadIsInMountNamespace::existing() }?;
Ok(RuntimeConfigurator::new(self.user, ns))
}
}
/// Operations that do not need root but require the current thread to be in a
/// mount namespace.
impl<User, MountNamespace> RuntimeConfigurator<User, MountNamespace>
where
MountNamespace: __private::CurrentThreadIsInMountNamespace,
{
/// Check that the spfs filesystem is already mounted.
///
/// # Errors
///
/// Fails if the mount state of [`SPFS_DIR`] cannot be determined, or if
/// nothing is mounted there.
pub async fn ensure_mounts_already_exist(&self) -> Result<()> {
    tracing::debug!("ensuring mounts already exist...");
    let mounted = self
        .is_mounted(SPFS_DIR)
        .await
        .map_err(|err| err.wrap("Failed to check for existing mount"))?;
    if mounted {
        Ok(())
    } else {
        Err(format!("'{SPFS_DIR}' is not mounted, will not remount").into())
    }
}
/// Check if the identified directory is an active mount point.
///
/// Returns false in the case where the path or its parent do not exist.
async fn is_mounted<P: Into<PathBuf>>(&self, target: P) -> Result<bool> {
let target = target.into();
let parent = match target.parent() {
None => return Ok(false),
Some(p) => p.to_owned(),
};
// A new thread created while holding _guard will be inside the same
// mount namespace...
let stat_parent_thread = std::thread::spawn(move || nix::sys::stat::stat(&parent));
let stat_target_thread = std::thread::spawn(move || nix::sys::stat::stat(&target));
let (st_parent, st_target) = tokio::task::spawn_blocking(move || {
let st_parent = stat_parent_thread.join();
let st_target = stat_target_thread.join();
(st_parent, st_target)
})
.await?;
let st_parent =
match st_parent.map_err(|_| Error::String("Failed to stat parent".to_owned()))? {
// the parent not existing means the child also doesn't exist
// and so cannot be considered as mounted
Err(nix::errno::Errno::ENOENT) => {
return Ok(false);
}
r => r?,
};
let st_target =
match st_target.map_err(|_| Error::String("Failed to stat target".to_owned()))? {
// a non-existent directory is considered not mounted
Err(nix::errno::Errno::ENOENT) => {
return Ok(false);
}
r => r?,
};
Ok(st_target.st_dev != st_parent.st_dev)
}
/// The path to the mount namespace of the current thread, as recorded by
/// this configurator's mount namespace guard.
#[inline]
pub fn mount_namespace(&self) -> &std::path::Path {
self.ns.mount_ns()
}
}
/// Operations that need root and require the current thread to be in a mount
/// namespace.
impl<MountNamespace> RuntimeConfigurator<IsRootUser, MountNamespace>
where
MountNamespace: __private::CurrentThreadIsInMountNamespace,
{
/// Remount key existing mount points so that new mounts and changes
/// to existing mounts don't propagate to the parent namespace.
///
/// We use MS_SLAVE for system mounts because we still want mount and
/// unmount events from the system to propagate into this new namespace.
/// We privatize any existing /spfs mount, though because we are likely
/// to replace it and don't want to affect any parent runtime.
pub async fn remove_mount_propagation(&self) -> Result<()> {
use nix::mount::{mount, MsFlags};
tracing::debug!("disable sharing of new mounts...");
let mut res = mount(NONE, "/", NONE, MsFlags::MS_SLAVE, NONE);
if let Err(err) = res {
return Err(Error::wrap_nix(
err,
"Failed to remove propagation from existing mount: /",
));
}
if self.is_mounted(SPFS_DIR).await? {
res = mount(NONE, SPFS_DIR, NONE, MsFlags::MS_PRIVATE, NONE);
if let Err(err) = res {
return Err(Error::wrap_nix(
err,
"Failed to privatize existing mount: /spfs",
));
}
}
if self.is_mounted("/tmp").await? {
res = mount(NONE, "/tmp", NONE, MsFlags::MS_SLAVE, NONE);
if let Err(err) = res {
return Err(Error::wrap_nix(
err,
"Failed to remove propagation from existing mount: /tmp",
));
}
}
Ok(())
}
/// Check or create the necessary directories for mounting the provided runtime
pub fn ensure_mount_targets_exist(&self, config: &runtime::Config) -> Result<()> {
tracing::debug!("ensuring mount targets exist...");
runtime::makedirs_with_perms(SPFS_DIR, 0o777)
.map_err(|source| Error::CouldNotCreateSpfsRoot { source })?;
if let Some(dir) = &config.runtime_dir {
runtime::makedirs_with_perms(dir, 0o777)
.map_err(|err| Error::RuntimeWriteError(dir.clone(), err))?
}
Ok(())
}
pub fn mount_runtime(&self, config: &runtime::Config) -> Result<()> {
use nix::mount::{mount, MsFlags};
let dir = match &config.runtime_dir {
Some(ref p) => p,
None => return Ok(()),
};
let tmpfs_opts = config
.tmpfs_size
.as_ref()
.map(|size| format!("size={size}"));
tracing::debug!("mounting runtime...");
let res = mount(
NONE,
dir,
Some("tmpfs"),
MsFlags::MS_NOEXEC,
tmpfs_opts.as_deref(),
);
if let Err(err) = res {
Err(Error::wrap_nix(err, format!("Failed to mount {dir:?}")))
} else {
Ok(())
}
}
pub fn unmount_runtime(&self, config: &runtime::Config) -> Result<()> {
let dir = match &config.runtime_dir {
Some(ref p) => p,
None => return Ok(()),
};
tracing::debug!("unmounting existing runtime...");
let result = nix::mount::umount2(dir, nix::mount::MntFlags::MNT_DETACH);
if let Err(err) = result {
return Err(Error::wrap_nix(err, format!("Failed to unmount {dir:?}")));
}
Ok(())
}
/// Prepare the provided runtime for use by making sure that the
/// directories it requires exist.
///
/// All of the work is delegated to the runtime itself.
pub async fn setup_runtime(&self, rt: &runtime::Runtime) -> Result<()> {
tracing::debug!("setting up runtime...");
rt.ensure_required_directories().await
}
async fn mount_live_layers(&self, rt: &runtime::Runtime) -> Result<()> {
// Mounts the bind mounts from the any live layers in the runtime the top of paths
// inside /spfs
//
// It requires the mount destinations to exist under
// /spfs/. If they do not, the mount commands will error. The
// mount destinations are either provided by one of the layers
// in the runtime, or by an earlier call to
// ensure_extra_bind_mount_locations_exist() made in
// initialize_runtime()
let live_layers = rt.live_layers();
if !live_layers.is_empty() {
tracing::debug!("mounting the extra bind mounts over the {SPFS_DIR} filesystem ...");
let mount = super::resolve::which("mount").unwrap_or_else(|| "/usr/bin/mount".into());
for layer in live_layers {
let injection_mounts = layer.bind_mounts();
for extra_mount in injection_mounts {
let dest = if extra_mount.dest.starts_with(SPFS_DIR_PREFIX) {
PathBuf::from(extra_mount.dest.clone())
} else {
PathBuf::from(SPFS_DIR).join(extra_mount.dest.clone())
};
let mut cmd = tokio::process::Command::new(mount.clone());
cmd.arg("--bind");
cmd.arg(extra_mount.src.to_string_lossy().into_owned());
cmd.arg(dest);
tracing::debug!("About to run: {cmd:?}");
match cmd.status().await {
Err(err) => {
return Err(Error::process_spawn_error("mount".to_owned(), err, None))
}
Ok(status) => match status.code() {
Some(0) => (),
_ => {
return Err(format!(
"Failed to inject bind mount into the {SPFS_DIR} filesystem using: {cmd:?}"
).into())
}
},
}
}
}
}
Ok(())
}
async fn unmount_live_layers(&self, rt: &runtime::Runtime) -> Result<()> {
// Unmount the bind mounted items from the live layers
let live_layers = rt.live_layers();
if !live_layers.is_empty() {
tracing::debug!("unmounting the extra bind mounts from the {SPFS_DIR} filesystem ...");
let umount =
super::resolve::which("umount").unwrap_or_else(|| "/usr/bin/umount".into());
for layer in live_layers {
let injection_mounts = layer.bind_mounts();
for extra_mount in injection_mounts {
let mut cmd = tokio::process::Command::new(umount.clone());
cmd.arg(PathBuf::from(SPFS_DIR).join(extra_mount.dest.clone()));
tracing::debug!("About to run: {cmd:?}");
match cmd.status().await {
Err(err) => {
return Err(Error::process_spawn_error("umount".to_owned(), err, None))
}
Ok(status) => match status.code() {
Some(0) => (),
_ => return Err(format!("Failed to unmount a bind mount injected into the {SPFS_DIR} filesystem using: {cmd:?}").into()),
},
}
}
}
}
Ok(())
}
/// Mounts an overlayfs built up from the given list of rendered
/// layered directories (layer_dirs).
///
/// This first entry in layer_dirs should be the one you expect to
/// be the bottom-most layer in the overlayfs stack. Each
/// following entry will be placed on top of the previous one,
/// with the last entry in layer_dirs becoming the top-most layer
/// in the overlayfs stack. In the event that multiple layer
/// directories contain the same file, the one that comes later in
/// the slice will provide the contents of that file.
pub(crate) async fn mount_env_overlayfs<P: AsRef<Path>>(
&self,
rt: &runtime::Runtime,
layer_dirs: &[P],
) -> Result<()> {
tracing::debug!("mounting the overlay filesystem...");
let overlay_args = get_overlay_args(rt, layer_dirs)?;
let mount = super::resolve::which("mount").unwrap_or_else(|| "/usr/bin/mount".into());
tracing::debug!("{mount:?} -t overlay -o {overlay_args} none {SPFS_DIR}",);
// for some reason, the overlay mount process creates a bad filesystem if the
// mount command is called directly from this process. It may be some default
// option or minor detail in how the standard mount command works - possibly related
// to this process eventually dropping privileges, but that is uncertain right now
let mut cmd = tokio::process::Command::new(mount);
cmd.args(["-t", "overlay"]);
cmd.arg("-o");
cmd.arg(overlay_args);
cmd.arg("none");
cmd.arg(SPFS_DIR);
match cmd.status().await {
Err(err) => Err(Error::process_spawn_error("mount", err, None)),
Ok(status) => match status.code() {
Some(0) => Ok(()),
_ => Err("Failed to mount overlayfs".into()),
},
}?;
self.mount_live_layers(rt).await
}
/// Mount the FUSE filesystem for the provided runtime onto the lower
/// directory from the runtime's config.
#[cfg(feature = "fuse-backend")]
pub(crate) async fn mount_fuse_lower_dir(&self, rt: &runtime::Runtime) -> Result<()> {
self.mount_fuse_onto(rt, &rt.config.lower_dir).await
}
/// Mount the FUSE filesystem for the provided runtime directly onto
/// [`SPFS_DIR`], and then mount any live layers of the runtime on top.
#[cfg(feature = "fuse-backend")]
pub(crate) async fn mount_env_fuse(&self, rt: &runtime::Runtime) -> Result<()> {
self.mount_fuse_onto(rt, SPFS_DIR).await?;
self.mount_live_layers(rt).await
}
/// Mount the FUSE filesystem for the provided runtime onto `path` and
/// wait for it to become available.
///
/// The mount is made by running the `spfs-fuse` binary with the digest
/// of the runtime's platform. The work is done on a dedicated thread, and
/// only the waiting for that thread is handed to tokio.
///
/// # Errors
///
/// - if the platform digest cannot be computed
/// - [`Error::MissingBinary`] if `spfs-fuse` cannot be found
/// - if the command cannot be run, or exits with a non-zero status
/// - if the mounting thread cannot be joined
///
/// The filesystem still not being visible after all of the waits is only
/// logged as a warning, and is not an error.
#[cfg(feature = "fuse-backend")]
async fn mount_fuse_onto<P>(&self, rt: &runtime::Runtime, path: P) -> Result<()>
where
    P: AsRef<std::ffi::OsStr>,
{
    use spfs_encoding::prelude::*;

    let path = path.as_ref().to_owned();
    let platform = rt.to_platform().digest()?.to_string();
    let opts = get_fuse_args(&rt.config, &self.user, true);

    // A new thread created in mount namespace will be inside the same
    // mount namespace...
    let mount_and_wait_thread = std::thread::spawn(move || {
        tracing::debug!("mounting the FUSE filesystem...");
        let spfs_fuse = match super::resolve::which_spfs("fuse") {
            None => return Err(Error::MissingBinary("spfs-fuse")),
            Some(exe) => exe,
        };
        let mut cmd = std::process::Command::new(spfs_fuse);
        cmd.arg("-o");
        cmd.arg(opts);
        // We are trusting that the runtime has been saved to the repository
        // and so the platform that the runtime relies on has also been tagged
        cmd.arg(platform);
        cmd.arg(&path);
        // The command logs all output to stderr, and should never hold onto
        // a handle to this process' stdout as it can cause hanging
        cmd.stdout(std::process::Stdio::null());
        // Allowing stderr to be inherited causes this process to hang
        // forever reading from that pipe, even after the child processes
        // has exited (cause unknown).
        // TODO: find a way to still see stderr output from the child
        // process without it hanging.
        cmd.stderr(std::process::Stdio::null());
        tracing::debug!("{cmd:?}");
        match cmd.status() {
            Err(err) => return Err(Error::process_spawn_error("mount", err, None)),
            Ok(status) if status.code() == Some(0) => {}
            Ok(status) => {
                return Err(Error::String(format!(
                    "Failed to mount fuse filesystem, mount command exited with non-zero status {:?}",
                    status.code()
                )))
            }
        };

        // the fuse filesystem may take some moments to be fully initialized, and we
        // don't want to return until this is true. Otherwise, subsequent operations may
        // see unexpected errors.
        //
        // The delays are consumed front-to-back so that the shortest waits
        // are tried first. Popping them from the end of a vec would start
        // with the longest wait instead.
        let mut sleep_time_ms = vec![2, 5, 10, 50, 100, 100, 100, 100].into_iter();
        while let Err(err) = std::fs::symlink_metadata(&path) {
            if let Some(ms) = sleep_time_ms.next() {
                std::thread::sleep(std::time::Duration::from_millis(ms));
            } else {
                tracing::warn!("FUSE did not appear to start after delay: {err}");
                break;
            }
        }
        Ok(())
    });

    tokio::task::spawn_blocking(move || mount_and_wait_thread.join())
        .await?
        .map_err(|_| Error::String("Failed to mount and wait for fuse".to_owned()))??;
    Ok(())
}
/// Create the file masks described by `manifest` under the configured
/// upper directory, then fix up the directories that contain them.
///
/// The work happens in two passes over the manifest entries, walked with
/// the upper directory as their prefix:
///
/// 1. For each mask entry, the parent directory is created if needed, any
///    existing file or directory at that path is removed, and a character
///    device node with an empty mode and device number 0 is created in its
///    place. Paths that are already a removed entry are left alone.
/// 2. In reverse order, for each tree entry that exists as a directory,
///    the permissions are set to those of the entry and ownership is given
///    to the original user.
///
/// # Errors
///
/// Fails if the upper directory is not valid unicode, if any of the
/// filesystem operations fail, or if the worker thread cannot be joined.
/// Entries that disappear while the second pass runs are skipped.
pub async fn mask_files(
&self,
config: &runtime::Config,
manifest: super::tracking::Manifest,
) -> Result<()> {
use std::os::unix::fs::{MetadataExt, PermissionsExt};
tracing::debug!("masking deleted files...");
// the user that was recorded before this process became root
let owner = self.user.original_uid;
let prefix = config
.upper_dir
.to_str()
.ok_or_else(|| {
crate::Error::String(format!(
"configured runtime upper_dir has invalid characters: {:?}",
config.upper_dir
))
})?
.to_owned();
// A new thread created in mount namespace will be inside the same
// mount namespace...
let mask_files_thread = std::thread::spawn(move || {
// collected up front because the entries are iterated twice
let nodes: Vec<_> = manifest.walk_abs(prefix).collect();
// first pass: create a mask node for every mask entry
for node in nodes.iter() {
if !node.entry.kind.is_mask() {
continue;
}
let fullpath = node.path.to_path("/");
if let Some(parent) = fullpath.parent() {
tracing::trace!(?parent, "build parent dir for mask");
runtime::makedirs_with_perms(parent, 0o777)
.map_err(|err| Error::RuntimeWriteError(parent.to_owned(), err))?;
}
tracing::trace!(?node.path, "Creating file mask");
// a failure to stat is treated the same as the path not existing
let existing = std::fs::symlink_metadata(&fullpath).ok();
if let Some(meta) = existing {
// already masked, nothing more to do for this entry
if runtime::is_removed_entry(&meta) {
continue;
}
// clear out whatever is in the way of the mask node
if meta.is_file() {
std::fs::remove_file(&fullpath)
.map_err(|err| Error::RuntimeWriteError(fullpath.clone(), err))?;
} else {
std::fs::remove_dir_all(&fullpath)
.map_err(|err| Error::RuntimeWriteError(fullpath.clone(), err))?;
}
}
// NOTE(review): a character device with device number 0 is presumably
// the overlayfs whiteout that marks a removed path - confirm.
nix::sys::stat::mknod(
&fullpath,
nix::sys::stat::SFlag::S_IFCHR,
nix::sys::stat::Mode::empty(),
0,
)
.map_err(move |err| {
Error::wrap_nix(err, format!("Failed to create file mask: {}", node.path))
})?;
}
// second pass, in reverse walk order: fix the permissions and the
// ownership of directories
for node in nodes.iter().rev() {
if !node.entry.kind.is_tree() {
continue;
}
let fullpath = node.path.to_path("/");
if !fullpath.is_dir() {
continue;
}
let existing = std::fs::symlink_metadata(&fullpath)
.map_err(|err| Error::RuntimeReadError(fullpath.clone(), err))?;
if existing.permissions().mode() != node.entry.mode {
if let Err(err) = std::fs::set_permissions(
&fullpath,
std::fs::Permissions::from_mode(node.entry.mode),
) {
match err.kind() {
// the directory went away in the meantime, move on
std::io::ErrorKind::NotFound => continue,
_ => {
return Err(Error::RuntimeSetPermissionsError(fullpath, err));
}
}
}
}
if existing.uid() != owner.as_raw() {
// only the owner is changed, the group is left as it is
let res = nix::unistd::chown(&fullpath, Some(owner), None);
match res {
Ok(_) | Err(nix::errno::Errno::ENOENT) => continue,
Err(err) => {
return Err(Error::wrap_nix(
err,
format!("Failed to set ownership on masked file [{}]", node.path),
));
}
}
}
}
Ok(())
});
// wait for the worker thread from a blocking task; the first `?` below
// is for the tokio join, the map_err is for a panicked worker thread
// and the final `?` is for the result of the worker itself
tokio::task::spawn_blocking(move || mask_files_thread.join())
.await?
.map_err(|_| Error::String("Failed to mask files".to_owned()))??;
Ok(())
}
}
/// Operations that need root and require the whole process to be in a mount
/// namespace.
impl<MountNamespace> RuntimeConfigurator<IsRootUser, MountNamespace>
where
MountNamespace: __private::CurrentProcessIsInMountNamespace,
{
/// Make a durable upper dir path for the runtime, copy the
/// contents of its previous upper dir to the new one.
pub async fn change_runtime_to_durable(&self, runtime: &mut runtime::Runtime) -> Result<i32> {
// Not all runtime backends support durable runtimes
match runtime.config.mount_backend {
runtime::MountBackend::FuseOnly | runtime::MountBackend::WinFsp => {
// a vfs-only runtime cannot be change to durable
return Err(Error::RuntimeChangeToDurableError(format!(
"{} backend does not support durable runtimes",
runtime.config.mount_backend
)));
}
runtime::MountBackend::OverlayFsWithFuse
| runtime::MountBackend::OverlayFsWithRenders => {}
}
tracing::info!("changing runtime to durable");
let old_upper_dir = runtime.data().config.upper_dir.clone();
tracing::debug!("old upper dir: {}", old_upper_dir.display());
let new_path = runtime.setup_durable_upper_dir().await?;
tracing::debug!("new upper path: {}", new_path.display());
runtime.ensure_upper_dirs().await?;
tracing::debug!("ensured upper dirs");
// this only syncs over the upper_dir contents, not the
// work_dir because the work_dir is updated and managed
// internally by overlayfs as changes are made. any edits or
// changes that have been completed, but not committed, will
// appear in the upper_dir.
let src_dir = match old_upper_dir.to_str() {
Some(path) => path,
None => {
return Err(Error::RuntimeChangeToDurableError(format!(
"current upper_dir '{}' has invalid characters",
old_upper_dir.display()
)))
}
};
let dest_dir = match new_path.to_str() {
Some(path) => path,
None => {
return Err(Error::RuntimeChangeToDurableError(format!(
"new upper_dir '{}' has invalid characters",
new_path.display()
)))
}
};
let args = vec!["-aD", src_dir, dest_dir];
let cmd_path = match which("rsync") {
Some(cmd) => cmd,
None => {
return Err(Error::RuntimeChangeToDurableError(
"rsync is not available on this host".to_string(),
))
}
};
let mut rsync = std::process::Command::new(cmd_path);
rsync.args(args);
tracing::debug!("the rsync command: {rsync:?}");
match rsync.status().map_err(|err| Error::String(err.to_string())) {
Ok(status) => match status.code() {
Some(0) => {
runtime.set_durable(true);
runtime.save_state_to_storage().await?;
tracing::info!("runtime saved as durable");
Ok(0)
}
Some(code) => Err(Error::RuntimeChangeToDurableError(format!(
"rsync failed with exit code: {code}"
))),
None => Err(Error::RuntimeChangeToDurableError(
"rsync was terminated by an unexpected signal".to_string(),
)),
},
Err(err) => Err(Error::RuntimeChangeToDurableError(format!(
"rsync failed to run: {err}"
))),
}
}
/// Unmount the environment of the provided runtime: first the fuse
/// portion and then the overlayfs portion, each only if applicable to the
/// runtime's mount backend.
///
/// `lazy` is passed through to both unmount steps.
pub async fn unmount_env(&self, rt: &runtime::Runtime, lazy: bool) -> Result<()> {
tracing::debug!("unmounting existing env...");
// unmount fuse portion first, because once /spfs is unmounted many safety checks will
// fail and the runtime will effectively not be re-configurable anymore.
self.unmount_env_fuse(rt, lazy).await?;
self.unmount_env_overlayfs(rt, lazy).await?;
Ok(())
}
/// Unmount the overlayfs portion of the provided runtime, if applicable
pub async fn unmount_env_overlayfs(&self, rt: &runtime::Runtime, lazy: bool) -> Result<()> {
match rt.config.mount_backend {
runtime::MountBackend::FuseOnly | runtime::MountBackend::WinFsp => {
// a vfs-only runtime cannot be unmounted this way
// and should already be handled by a previous call to
// unmount_env_fuse
return Ok(());
}
runtime::MountBackend::OverlayFsWithFuse
| runtime::MountBackend::OverlayFsWithRenders => {}
}
let mut flags = nix::mount::MntFlags::empty();
if lazy {
// Perform a lazy unmount in case there are still open handles to files.
// This way we can mount over the old one without worrying about busyness
flags |= nix::mount::MntFlags::MNT_DETACH;
}
let result = nix::mount::umount2(SPFS_DIR, flags);
if let Err(err) = result {
return Err(Error::wrap_nix(
err,
format!("Failed to unmount {SPFS_DIR}"),
));
}
Ok(())
}
/// Unmount the fuse portion of the provided runtime, if applicable.
///
/// For the overlayfs-with-fuse backend this is the runtime's lower
/// directory. For the fuse-only backend it is [`SPFS_DIR`], in which case
/// the live layers are unmounted first. Other backends have no fuse
/// portion and nothing is done for them.
///
/// `fusermount -u` (or `-uz` when `lazy` is set) is run for as long as
/// the path is still seen as mounted. A failure to determine the mount
/// state is treated as the path still being mounted.
///
/// # Errors
///
/// Fails if the live layers cannot be unmounted, if `fusermount` cannot
/// be run, or if it still exits with a non-zero status after all of the
/// retries have been used.
async fn unmount_env_fuse(&self, rt: &runtime::Runtime, lazy: bool) -> Result<()> {
    let mount_path = match rt.config.mount_backend {
        runtime::MountBackend::OverlayFsWithFuse => rt.config.lower_dir.as_path(),
        runtime::MountBackend::FuseOnly => {
            // Unmount any extra paths mounted in the depths of
            // the fuse-only backend before fuse itself is
            // unmounted to avoid issue with lazy unmounting.
            self.unmount_live_layers(rt).await?;
            std::path::Path::new(SPFS_DIR)
        }
        runtime::MountBackend::OverlayFsWithRenders | runtime::MountBackend::WinFsp => {
            return Ok(())
        }
    };
    tracing::debug!(%lazy, "unmounting existing fuse env @ {mount_path:?}...");

    // The FUSE filesystem can take some time to start up, and
    // if the runtime tries to exit too quickly, the fusermount
    // command can return with errors because the filesystem has
    // not yet initialized and the connection is not ready.
    //
    // A few retries in these cases gives time for the filesystem
    // to enter a ready and connected state.
    //
    // The delays are consumed front-to-back so that the first retry
    // happens after the shortest wait. Popping them from the end of a
    // vec would wait a full second before the very first retry.
    let mut retry_after_ms = vec![10, 50, 100, 200, 500, 1000].into_iter();
    while self.is_mounted(mount_path).await.unwrap_or(true) {
        let flags = if lazy { "-uz" } else { "-u" };
        let child = tokio::process::Command::new("fusermount")
            .arg(flags)
            .arg(mount_path)
            .stderr(std::process::Stdio::piped())
            .stdout(std::process::Stdio::piped())
            .stdin(std::process::Stdio::piped())
            .spawn()
            .map_err(|err| Error::ProcessSpawnError("fusermount".into(), err))?;
        match child.wait_with_output().await {
            Err(err) => {
                return Err(Error::String(format!(
                    "Failed to unmount FUSE filesystem: {err:?}"
                )))
            }
            // go around again to confirm that the path is no longer mounted
            Ok(out) if out.status.code() == Some(0) => continue,
            Ok(out) => {
                let stderr = String::from_utf8_lossy(&out.stderr);
                match retry_after_ms.next() {
                    Some(wait_ms) => {
                        tracing::trace!(
                            "Retrying FUSE unmount which failed with, {:?}: {}",
                            out.status.code(),
                            stderr.trim()
                        );
                        tokio::time::sleep(std::time::Duration::from_millis(wait_ms)).await;
                        continue;
                    }
                    None => {
                        return Err(Error::String(format!(
                            "FUSE unmount returned non-zero exit status, {:?}: {}",
                            out.status.code(),
                            stderr.trim()
                        )))
                    }
                }
            }
        };
    }
    Ok(())
}
}
/// Operations that need root but have no mount namespace requirements.
impl<MountNamespace> RuntimeConfigurator<IsRootUser, MountNamespace> {
/// Drop all capabilities and become the original user that
/// this thread was running as before becoming root
pub fn become_original_user(
self,
) -> Result<RuntimeConfigurator<IsNonRootUser, MountNamespace>> {
tracing::debug!("dropping root...");
let mut result = nix::unistd::setuid(self.user.original_uid);
if let Err(err) = result {
return Err(Error::wrap_nix(
err,
"Failed to become regular user (actual)",
));
}
result = nix::unistd::seteuid(self.user.original_euid);
if let Err(err) = result {
return Err(Error::wrap_nix(
err,
"Failed to become regular user (effective)",
));
}
self.drop_all_capabilities()?;
Ok(RuntimeConfigurator::new(IsNonRootUser, self.ns))
}
/// Drop all of the capabilities held by the current thread.
///
/// The effective, permitted and inheritable sets are cleared, in that
/// order, and then the dumpable attribute of the process is set to 1.
fn drop_all_capabilities(&self) -> Result<()> {
    tracing::debug!("drop all capabilities/privileges...");
    for set in [
        caps::CapSet::Effective,
        caps::CapSet::Permitted,
        caps::CapSet::Inheritable,
    ] {
        caps::clear(None, set)?;
    }

    // the dumpable attribute can become unset when changing pids or
    // calling a binary with capabilities (spfs). Resetting this to one
    // restores ownership of the proc filesystem to the calling user which
    // is important in being able to read and join an existing runtime's namespace
    //
    // SAFETY: only integer arguments are passed to prctl here, no pointers.
    let status = unsafe { libc::prctl(libc::PR_SET_DUMPABLE, 1) };
    if status == 0 {
        Ok(())
    } else {
        Err(nix::errno::Errno::last().into())
    }
}
}
// Checks if the current process will be able to join an existing runtime