Skip to content

Commit 2348484

Browse files
nixprimegvisor-bot
authored andcommitted
sentry/kernel: unshare atomically and without holding Task.mu unnecessarily
Both changes are consistent with Linux's kernel/fork.c:ksys_unshare(). PiperOrigin-RevId: 820375452
1 parent 923cb75 commit 2348484

File tree

3 files changed

+117
-124
lines changed

3 files changed

+117
-124
lines changed

pkg/sentry/kernel/auth/credentials.go

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,28 @@ func (c *Credentials) Fork() *Credentials {
152152
return nc
153153
}
154154

155+
// ForkIntoUserNamespace returns a copy of c after its caller has entered user
156+
// namespace ns.
157+
func (c *Credentials) ForkIntoUserNamespace(ns *UserNamespace) *Credentials {
158+
nc := c.Fork()
159+
nc.UserNamespace = ns
160+
// "The child process created by clone(2) with the CLONE_NEWUSER flag
161+
// starts out with a complete set of capabilities in the new user
162+
// namespace. Likewise, a process that creates a new user namespace using
163+
// unshare(2) or joins an existing user namespace using setns(2) gains a
164+
// full set of capabilities in that namespace."
165+
nc.PermittedCaps = AllCapabilities
166+
nc.InheritableCaps = 0
167+
nc.EffectiveCaps = AllCapabilities
168+
nc.BoundingCaps = AllCapabilities
169+
// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
170+
// flag sets the "securebits" flags (see capabilities(7)) to their default
171+
// values (all flags disabled) in the child (for clone(2)) or caller (for
172+
// unshare(2), or setns(2)." - user_namespaces(7)
173+
nc.KeepCaps = false
174+
return nc
175+
}
176+
155177
// InGroup returns true if c is in group kgid. Compare Linux's
156178
// kernel/groups.c:in_group_p().
157179
func (c *Credentials) InGroup(kgid KGID) bool {
@@ -242,34 +264,6 @@ func (c *Credentials) UseGID(gid GID) (KGID, error) {
242264
return NoID, linuxerr.EPERM
243265
}
244266

245-
// SetUID translates the provided uid to the root user namespace and updates c's
246-
// uids to it. This performs no permissions or capabilities checks, the caller
247-
// is responsible for ensuring the calling context is permitted to modify c.
248-
func (c *Credentials) SetUID(uid UID) error {
249-
kuid := c.UserNamespace.MapToKUID(uid)
250-
if !kuid.Ok() {
251-
return linuxerr.EINVAL
252-
}
253-
c.RealKUID = kuid
254-
c.EffectiveKUID = kuid
255-
c.SavedKUID = kuid
256-
return nil
257-
}
258-
259-
// SetGID translates the provided gid to the root user namespace and updates c's
260-
// gids to it. This performs no permissions or capabilities checks, the caller
261-
// is responsible for ensuring the calling context is permitted to modify c.
262-
func (c *Credentials) SetGID(gid GID) error {
263-
kgid := c.UserNamespace.MapToKGID(gid)
264-
if !kgid.Ok() {
265-
return linuxerr.EINVAL
266-
}
267-
c.RealKGID = kgid
268-
c.EffectiveKGID = kgid
269-
c.SavedKGID = kgid
270-
return nil
271-
}
272-
273267
// LoadSeccheckData sets credential data based on mask.
274268
func (c *Credentials) LoadSeccheckData(mask seccheck.FieldMask, info *pb.ContextData) {
275269
if mask.Contains(seccheck.FieldCtxtCredentials) {

pkg/sentry/kernel/task_clone.go

Lines changed: 95 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -298,11 +298,7 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
298298
}
299299

300300
if userns != creds.UserNamespace {
301-
if err := nt.SetUserNamespace(userns); err != nil {
302-
// This shouldn't be possible: userns was created from nt.creds, so
303-
// nt should have CAP_SYS_ADMIN in userns.
304-
panic("Task.Clone: SetUserNamespace failed: " + err.Error())
305-
}
301+
nt.creds.Store(creds.ForkIntoUserNamespace(userns))
306302
}
307303

308304
// This has to happen last, because e.g. ptraceClone may send a SIGSTOP to
@@ -618,7 +614,6 @@ func (t *Task) Unshare(flags int32) error {
618614
if flags&(linux.CLONE_VM|linux.CLONE_SIGHAND) != 0 {
619615
return linuxerr.EINVAL
620616
}
621-
creds := t.Credentials()
622617
if flags&linux.CLONE_THREAD != 0 {
623618
t.tg.signalHandlers.mu.Lock()
624619
if t.tg.tasksCount != 1 {
@@ -629,98 +624,137 @@ func (t *Task) Unshare(flags int32) error {
629624
// This isn't racy because we're the only living task, and therefore
630625
// the only task capable of creating new ones, in our thread group.
631626
}
627+
628+
// Prepare new execution context.
629+
creds := t.Credentials()
630+
var (
631+
newFSContext *FSContext
632+
newFDTable *FDTable
633+
newCreds bool
634+
newChildPIDNS *PIDNamespace
635+
newNetNS *inet.Namespace
636+
newUTSNS *UTSNamespace
637+
newIPCNS *IPCNamespace
638+
newMountNS *vfs.MountNamespace
639+
)
640+
defer func() {
641+
if newFSContext != nil {
642+
newFSContext.destroy(t)
643+
}
644+
if newFDTable != nil {
645+
newFDTable.DecRef(t)
646+
}
647+
if newNetNS != nil {
648+
newNetNS.DecRef(t)
649+
}
650+
if newUTSNS != nil {
651+
newUTSNS.DecRef(t)
652+
}
653+
if newIPCNS != nil {
654+
newIPCNS.DecRef(t)
655+
}
656+
if newMountNS != nil {
657+
newMountNS.DecRef(t)
658+
}
659+
}()
660+
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
661+
newFSContext = t.FSContext().Fork()
662+
}
663+
if flags&linux.CLONE_FILES != 0 {
664+
newFDTable = t.fdTable.Fork(t, MaxFdLimit)
665+
}
632666
if flags&linux.CLONE_NEWUSER != 0 {
633667
if t.IsChrooted() {
634668
return linuxerr.EPERM
635669
}
670+
var err error
636671
newUserNS, err := creds.NewChildUserNamespace()
637672
if err != nil {
638673
return err
639674
}
640-
err = t.SetUserNamespace(newUserNS)
641-
if err != nil {
642-
return err
643-
}
644-
// Need to reload creds, because t.SetUserNamespace() changed task credentials.
645-
creds = t.Credentials()
675+
creds = t.Credentials().ForkIntoUserNamespace(newUserNS)
676+
newCreds = true
646677
}
647-
haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
678+
haveCapSysAdmin := creds.HasCapability(linux.CAP_SYS_ADMIN)
648679
if flags&linux.CLONE_NEWPID != 0 {
649680
if !haveCapSysAdmin {
650681
return linuxerr.EPERM
651682
}
652-
t.childPIDNamespace = t.tg.pidns.NewChild(t, t.k, t.UserNamespace())
683+
newChildPIDNS = t.tg.pidns.NewChild(t, t.k, creds.UserNamespace)
653684
}
654685
if flags&linux.CLONE_NEWNET != 0 {
655686
if !haveCapSysAdmin {
656687
return linuxerr.EPERM
657688
}
658-
netns := t.NetworkNamespace()
659-
netns = inet.NewNamespace(netns, t.UserNamespace())
660-
netnsInode := nsfs.NewInode(t, t.k.nsfsMount, netns)
661-
netns.SetInode(netnsInode)
662-
t.mu.Lock()
663-
oldNetns := t.netns
664-
t.netns = netns
665-
t.mu.Unlock()
666-
oldNetns.DecRef(t)
689+
newNetNS = inet.NewNamespace(t.netns, creds.UserNamespace)
690+
newNetNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newNetNS))
667691
}
668-
669-
cu := cleanup.Cleanup{}
670-
// All cu actions has to be executed after releasing t.mu.
671-
defer cu.Clean()
672-
t.mu.Lock()
673-
defer t.mu.Unlock()
674692
if flags&linux.CLONE_NEWUTS != 0 {
675693
if !haveCapSysAdmin {
676694
return linuxerr.EPERM
677695
}
678-
// Note that this must happen after NewUserNamespace, so the
679-
// new user namespace is used if there is one.
680-
oldUTSNS := t.utsns
681-
t.utsns = t.utsns.Clone(creds.UserNamespace)
682-
t.utsns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.utsns))
683-
cu.Add(func() { oldUTSNS.DecRef(t) })
696+
newUTSNS = t.utsns.Clone(creds.UserNamespace)
697+
newUTSNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newUTSNS))
684698
}
685699
if flags&linux.CLONE_NEWIPC != 0 {
686700
if !haveCapSysAdmin {
687701
return linuxerr.EPERM
688702
}
689-
// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
690-
// namespace"
691-
oldIPCNS := t.ipcns
692-
t.ipcns = NewIPCNamespace(creds.UserNamespace)
693-
t.ipcns.InitPosixQueues(t, t.k.VFS(), creds)
694-
t.ipcns.SetInode(nsfs.NewInode(t, t.k.nsfsMount, t.ipcns))
695-
cu.Add(func() { oldIPCNS.DecRef(t) })
696-
}
697-
if flags&linux.CLONE_FILES != 0 {
698-
oldFDTable := t.fdTable
699-
t.fdTable = oldFDTable.Fork(t, MaxFdLimit)
700-
cu.Add(func() { oldFDTable.DecRef(t) })
701-
}
702-
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
703-
oldFSContext := t.FSContext()
704-
// unshareFromTask() lowers the old fs context's ref count, but its for us to
705-
// destroy it if there are no other references.
706-
if oldFSContext.unshareFromTask(t, oldFSContext.Fork()) {
707-
// destroy() requires t.mu to not be held, hence the deferral.
708-
cu.Add(func() { oldFSContext.destroy(t) })
709-
}
703+
newIPCNS = NewIPCNamespace(creds.UserNamespace)
704+
newIPCNS.InitPosixQueues(t, t.k.VFS(), creds)
705+
newIPCNS.SetInode(nsfs.NewInode(t, t.k.nsfsMount, newIPCNS))
710706
}
711707
if flags&linux.CLONE_NEWNS != 0 {
712708
if !haveCapSysAdmin {
713709
return linuxerr.EPERM
714710
}
715-
oldMountNS := t.mountNamespace
716-
fsContext := t.FSContext()
717-
mntns, err := t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, oldMountNS, &fsContext.root, &fsContext.cwd, t.k)
711+
fsContext := newFSContext
712+
if fsContext == nil {
713+
fsContext = t.FSContext()
714+
}
715+
var err error
716+
newMountNS, err = t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, t.mountNamespace, &fsContext.root, &fsContext.cwd, t.k)
718717
if err != nil {
719718
return err
720719
}
721-
t.mountNamespace = mntns
722-
cu.Add(func() { oldMountNS.DecRef(t) })
723720
}
721+
722+
// Switch to new execution context. Store replaced resources in new* so
723+
// that they're cleaned up by the deferred function.
724+
if newCreds {
725+
t.creds.Store(creds)
726+
}
727+
t.mu.Lock()
728+
defer t.mu.Unlock()
729+
if newFSContext != nil {
730+
oldFSContext := t.FSContext()
731+
// unshareFromTask() lowers the old fs context's ref count, but its for us to
732+
// destroy it if there are no other references.
733+
if oldFSContext.unshareFromTask(t, newFSContext) {
734+
newFSContext = oldFSContext
735+
} else {
736+
newFSContext = nil
737+
}
738+
}
739+
if newFDTable != nil {
740+
t.fdTable, newFDTable = newFDTable, t.fdTable
741+
}
742+
if newChildPIDNS != nil {
743+
t.childPIDNamespace = newChildPIDNS
744+
}
745+
if newNetNS != nil {
746+
t.netns, newNetNS = newNetNS, t.netns
747+
}
748+
if newUTSNS != nil {
749+
t.utsns, newUTSNS = newUTSNS, t.utsns
750+
}
751+
if newIPCNS != nil {
752+
t.ipcns, newIPCNS = newIPCNS, t.ipcns
753+
}
754+
if newMountNS != nil {
755+
t.mountNamespace, newMountNS = newMountNS, t.mountNamespace
756+
}
757+
724758
return nil
725759
}
726760

pkg/sentry/kernel/task_identity.go

Lines changed: 0 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -419,41 +419,6 @@ func (t *Task) DropBoundingCapability(cp linux.Capability) error {
419419
return nil
420420
}
421421

422-
// SetUserNamespace attempts to move c into ns.
423-
//
424-
// Preconditions: The caller must be running on the task goroutine.
425-
func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error {
426-
creds := t.Credentials()
427-
// "A process reassociating itself with a user namespace must have the
428-
// CAP_SYS_ADMIN capability in the target user namespace." - setns(2)
429-
//
430-
// If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN
431-
// in ns (by rule 3 in auth.Credentials.HasCapability).
432-
if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) {
433-
return linuxerr.EPERM
434-
}
435-
436-
creds = creds.Fork() // The credentials object is immutable. See doc for creds.
437-
creds.UserNamespace = ns
438-
// "The child process created by clone(2) with the CLONE_NEWUSER flag
439-
// starts out with a complete set of capabilities in the new user
440-
// namespace. Likewise, a process that creates a new user namespace using
441-
// unshare(2) or joins an existing user namespace using setns(2) gains a
442-
// full set of capabilities in that namespace."
443-
creds.PermittedCaps = auth.AllCapabilities
444-
creds.InheritableCaps = 0
445-
creds.EffectiveCaps = auth.AllCapabilities
446-
creds.BoundingCaps = auth.AllCapabilities
447-
// "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER
448-
// flag sets the "securebits" flags (see capabilities(7)) to their default
449-
// values (all flags disabled) in the child (for clone(2)) or caller (for
450-
// unshare(2), or setns(2)." - user_namespaces(7)
451-
creds.KeepCaps = false
452-
t.creds.Store(creds)
453-
454-
return nil
455-
}
456-
457422
// SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS.
458423
//
459424
// Preconditions: The caller must be running on the task goroutine.

0 commit comments

Comments
 (0)