Skip to content

Commit

Permalink
kvm: honor memmap.File.MemoryType()
Browse files Browse the repository at this point in the history
Updates #11436

PiperOrigin-RevId: 723723715
  • Loading branch information
nixprime authored and gvisor-bot committed Feb 6, 2025
1 parent 6194338 commit 578733b
Show file tree
Hide file tree
Showing 25 changed files with 327 additions and 88 deletions.
1 change: 1 addition & 0 deletions pkg/hostarch/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ go_library(
"hostarch.go",
"hostarch_arm64.go",
"hostarch_x86.go",
"memory_type.go",
"sizes_util.go",
],
visibility = ["//:sandbox"],
Expand Down
59 changes: 59 additions & 0 deletions pkg/hostarch/memory_type.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright 2025 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostarch

// MemoryType specifies CPU memory access behavior with respect to a given
// virtual memory mapping.
type MemoryType uint8

const (
	// MemoryTypeInvalid ensures that the zero value of MemoryType is invalid.
	MemoryTypeInvalid MemoryType = iota

	// MemoryTypeUncacheable is equivalent to the following architectural
	// memory types:
	//
	// x86: Uncacheable (UC)
	// ARM64: Device-nGnRnE
	//
	// TODO(gvisor.dev/issue/11436): nv-mmap.c:nv_encode_caching()
	// distinguishes between NV_PGPROT_UNCACHED => Normal-NC and
	// NV_PGPROT_UNCACHED_DEVICE => Device-nGnRnE; is this just for
	// performance, or required for correctness? cf. Armv8-M Architecture
	// Reference Manual B7.16 "Mismatched memory attributes" R_GBKH
	MemoryTypeUncacheable

	// MemoryTypeWriteCombine is equivalent to the following architectural
	// memory types:
	//
	// x86: Write-combining (WC)
	// ARM64: Normal non-cacheable
	//
	// TODO(gvisor.dev/issue/11436): This is consistent with Linux's
	// arch/arm64/include/asm/pgtable.h:pgprot_writecombine(). However, it also
	// corresponds to NV_PGPROT_WRITE_COMBINED in the Nvidia driver (rather
	// than NV_PGPROT_WRITE_COMBINED_DEVICE => Device-nGnRE), which on ARM64
	// uses normal cacheable rather than non-cacheable on chipsets for which
	// PDB_PROP_CL_IS_CHIPSET_IO_COHERENT is true, which seems to be the case
	// on most systems.
	MemoryTypeWriteCombine

	// MemoryTypeWriteBack is equivalent to the following architectural memory
	// types:
	//
	// x86: Write-back (WB)
	// ARM64: Normal write-back cacheable
	MemoryTypeWriteBack
)

// String implements fmt.Stringer.String, making MemoryType values readable in
// logs and error messages. It deliberately avoids importing fmt or strconv so
// that this file stays dependency-free.
func (mt MemoryType) String() string {
	switch mt {
	case MemoryTypeInvalid:
		return "Invalid"
	case MemoryTypeUncacheable:
		return "Uncacheable"
	case MemoryTypeWriteCombine:
		return "WriteCombine"
	case MemoryTypeWriteBack:
		return "WriteBack"
	default:
		// Out-of-range values have no architectural meaning.
		return "Unknown"
	}
}
84 changes: 56 additions & 28 deletions pkg/ring0/pagetables/pagetables_aarch64.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,29 +52,26 @@ func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {

// Bits in page table entries.
const (
typeTable = 0x3 << 0
typeSect = 0x1 << 0
typePage = 0x3 << 0
pteValid = 0x1 << 0
pteTableBit = 0x1 << 1
pteTypeMask = 0x3 << 0
present = pteValid | pteTableBit
user = 0x1 << 6 /* AP[1] */
readOnly = 0x1 << 7 /* AP[2] */
accessed = 0x1 << 10
dbm = 0x1 << 51
writable = dbm
cont = 0x1 << 52
pxn = 0x1 << 53
xn = 0x1 << 54
dirty = 0x1 << 55
nG = 0x1 << 11
shared = 0x3 << 8
)

const (
mtDevicenGnRE = 0x1 << 2
mtNormal = 0x4 << 2
typeTable = 0x3 << 0
typeSect = 0x1 << 0
typePage = 0x3 << 0
pteValid = 0x1 << 0
pteTableBit = 0x1 << 1
pteTypeMask = 0x3 << 0
present = pteValid | pteTableBit
attrIndxShift = 2
attrIndxMask = 0x7
user = 0x1 << 6 /* AP[1] */
readOnly = 0x1 << 7 /* AP[2] */
accessed = 0x1 << 10
dbm = 0x1 << 51
writable = dbm
cont = 0x1 << 52
pxn = 0x1 << 53
xn = 0x1 << 54
dirty = 0x1 << 55
nG = 0x1 << 11
shared = 0x3 << 8
)

const (
Expand All @@ -83,6 +80,16 @@ const (
protDefault = accessed | shared
)

// MAIR entries: indices into the Memory Attribute Indirection Register
// (MAIR_EL1), each of which selects an architectural memory type for pages
// whose PTE AttrIndx field holds that index. These are not intrinsic to ARM64
// but must be configured by writing to MAIR_EL1. These values match Linux's
// (arch/arm64/include/asm/memory.h:MT_*), which is not necessary but can't
// hurt.
const (
MAIREntryNormal = 0
MAIREntryNormalNC = 2
MAIREntryDevice_nGnRnE = 3
)

// MapOpts are aarch64 options.
type MapOpts struct {
// AccessType defines permissions.
Expand All @@ -93,6 +100,9 @@ type MapOpts struct {

// User indicates the page is a user page.
User bool

// MemoryType is the memory type.
MemoryType hostarch.MemoryType
}

// PTE is a page table entry.
Expand All @@ -119,8 +129,7 @@ func (p *PTE) Valid() bool {
//go:nosplit
func (p *PTE) Opts() MapOpts {
v := atomic.LoadUintptr((*uintptr)(p))

return MapOpts{
opts := MapOpts{
AccessType: hostarch.AccessType{
Read: true,
Write: v&readOnly == 0,
Expand All @@ -129,6 +138,15 @@ func (p *PTE) Opts() MapOpts {
Global: v&nG == 0,
User: v&user != 0,
}
switch (v >> attrIndxShift) & attrIndxMask {
case MAIREntryNormal:
opts.MemoryType = hostarch.MemoryTypeWriteBack
case MAIREntryNormalNC:
opts.MemoryType = hostarch.MemoryTypeWriteCombine
case MAIREntryDevice_nGnRnE:
opts.MemoryType = hostarch.MemoryTypeUncacheable
}
return opts
}

// SetSect sets this page as a sect page.
Expand Down Expand Up @@ -191,11 +209,21 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {

if opts.User {
v |= user
v |= mtNormal
} else {
v = v &^ user
v |= mtNormal
}

switch opts.MemoryType {
case hostarch.MemoryTypeUncacheable:
v |= MAIREntryDevice_nGnRnE << attrIndxShift
case hostarch.MemoryTypeWriteCombine:
v |= MAIREntryNormalNC << attrIndxShift
case hostarch.MemoryTypeWriteBack:
v |= MAIREntryNormal << attrIndxShift
default:
panic("invalid MemoryType")
}

atomic.StoreUintptr((*uintptr)(p), v)
}

Expand All @@ -209,7 +237,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
// This should never happen.
panic("unaligned physical address!")
}
v := addr | typeTable | protDefault | mtNormal
v := addr | typeTable | protDefault | (MAIREntryNormal << attrIndxShift)
atomic.StoreUintptr((*uintptr)(p), v)
}

Expand Down
59 changes: 48 additions & 11 deletions pkg/ring0/pagetables/pagetables_x86.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,28 @@ func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 {

// Bits in page table entries.
const (
present = 0x001
writable = 0x002
user = 0x004
writeThrough = 0x008
cacheDisable = 0x010
accessed = 0x020
dirty = 0x040
super = 0x080
global = 0x100
optionMask = executeDisable | 0xfff
present = 0x001
writable = 0x002
user = 0x004
accessed = 0x020
dirty = 0x040
super = 0x080
global = 0x100
optionMask = executeDisable | 0xfff

pwtShift = 3
patIndexMask = 0x3
)

// PAT entries. These are not intrinsic to x86 but must be configured by
// writing to the IA32_PAT MSR. These values match Linux's
// (arch/x86/mm/pat/memtype.c:pat_bp_init()), which is not necessary but can't
// hurt. Note that PTE.Set() currently depends on only using the first 4 PAT
// entries.
const (
PATEntryWB = 0
PATEntryWC = 1
PATEntryUC = 3
)

// MapOpts are x86 options.
Expand All @@ -71,6 +83,9 @@ type MapOpts struct {

// User indicates the page is a user page.
User bool

// MemoryType is the memory type.
MemoryType hostarch.MemoryType
}

// PTE is a page table entry.
Expand All @@ -97,7 +112,7 @@ func (p *PTE) Valid() bool {
//go:nosplit
func (p *PTE) Opts() MapOpts {
v := atomic.LoadUintptr((*uintptr)(p))
return MapOpts{
opts := MapOpts{
AccessType: hostarch.AccessType{
Read: v&present != 0,
Write: v&writable != 0,
Expand All @@ -106,6 +121,15 @@ func (p *PTE) Opts() MapOpts {
Global: v&global != 0,
User: v&user != 0,
}
switch (v >> pwtShift) & patIndexMask {
case PATEntryWB:
opts.MemoryType = hostarch.MemoryTypeWriteBack
case PATEntryWC:
opts.MemoryType = hostarch.MemoryTypeWriteCombine
case PATEntryUC:
opts.MemoryType = hostarch.MemoryTypeUncacheable
}
return opts
}

// SetSuper sets this page as a super page.
Expand Down Expand Up @@ -154,6 +178,19 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
if opts.AccessType.Write {
v |= writable | dirty
}
// This assumes that we only use PAT entries 0-3, since the location of the
// 3rd bit (the PAT bit) varies depending on page size (and is never bit
// 5 == pwtShift + 2).
switch opts.MemoryType {
case hostarch.MemoryTypeUncacheable:
v |= PATEntryUC << pwtShift
case hostarch.MemoryTypeWriteCombine:
v |= PATEntryWC << pwtShift
case hostarch.MemoryTypeWriteBack:
v |= PATEntryWB << pwtShift
default:
panic("invalid MemoryType")
}
if p.IsSuper() {
// Note that this is inherited from the previous instance. Set
// does not change the value of Super. See above.
Expand Down
54 changes: 51 additions & 3 deletions pkg/sentry/devices/nvproxy/frontend.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,12 @@ type frontendDevice struct {
minor uint32
}

// isCtlDevice returns true if dev is the Nvidia control device
// (/dev/nvidiactl), as opposed to a per-GPU device (/dev/nvidia#).
func (dev *frontendDevice) isCtlDevice() bool {
return dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR
}

func (dev *frontendDevice) basename() string {
if dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR {
if dev.isCtlDevice() {
return "nvidiactl"
}
return fmt.Sprintf("nvidia%d", dev.minor)
Expand Down Expand Up @@ -134,8 +138,9 @@ type frontendFD struct {
// These fields are marked nosave since we do not automatically reinvoke
// NV_ESC_RM_MAP_MEMORY after restore, so restored FDs have no
// mmap_context.
mmapLength uint64 `state:"nosave"`
mmapInternal uintptr `state:"nosave"`
mmapLength uint64 `state:"nosave"`
mmapInternal uintptr `state:"nosave"`
mmapMemType hostarch.MemoryType `state:"nosave"`

// clients are handles of clients owned by this frontendFD. clients is
// protected by dev.nvp.objsMu.
Expand Down Expand Up @@ -428,6 +433,12 @@ func rmAllocContextDMA2(fi *frontendIoctlState) (uintptr, error) {
}

func rmAllocMemory(fi *frontendIoctlState) (uintptr, error) {
// This is consistent with the NV_ACTUAL_DEVICE_ONLY() check in
// src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl().
if fi.fd.dev.isCtlDevice() {
return 0, linuxerr.EINVAL
}

var ioctlParams nvgpu.IoctlNVOS02ParametersWithFD
if fi.ioctlParamsSize != nvgpu.SizeofIoctlNVOS02ParametersWithFD {
return 0, linuxerr.EINVAL
Expand Down Expand Up @@ -493,6 +504,19 @@ func rmAllocMemorySystem(fi *frontendIoctlState, ioctlParams *nvgpu.IoctlNVOS02P
fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.Params.HRoot, ioctlParams.Params.HObjectNew, ioctlParams.Params.HClass, &miscObject{}, ioctlParams.Params.HObjectParent)
if createMmapCtx {
mapFile.mmapLength = ioctlParams.Params.Limit + 1
// Compare kernel-open/nvidia/nv-mmap.c:nvidia_mmap_helper() =>
// nv_encode_caching().
// - Note that rmAllocMemory() already ensured that fi.fd.dev is
// not nvidiactl, i.e. only the !NV_IS_CTL_DEVICE() branch is
// relevant here.
// - nvidia_mmap_helper() honors mmap_context->caching only if
// IS_FB_OFFSET() and !IS_UD_OFFSET(). We can get the information
// we need for IS_FB_OFFSET() from NV_ESC_CARD_INFO, but there
// doesn't seem to be any way for us to replicate IS_UD_OFFSET().
// So we must conservatively specify uncacheable. (This is
// unfortunate since it prevents us from using write-combining on
// framebuffer memory...)
mapFile.mmapMemType = hostarch.MemoryTypeUncacheable
}
}
fi.fd.dev.nvp.objsUnlock()
Expand Down Expand Up @@ -1311,6 +1335,12 @@ func rmVidHeapControl(fi *frontendIoctlState) (uintptr, error) {
}

func rmMapMemory(fi *frontendIoctlState) (uintptr, error) {
// This is consistent with the NV_CTL_DEVICE_ONLY() check in
// src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl().
if !fi.fd.dev.isCtlDevice() {
return 0, linuxerr.EINVAL
}

var ioctlParams nvgpu.IoctlNVOS33ParametersWithFD
if fi.ioctlParamsSize != nvgpu.SizeofIoctlNVOS33ParametersWithFD {
return 0, linuxerr.EINVAL
Expand Down Expand Up @@ -1343,6 +1373,24 @@ func rmMapMemory(fi *frontendIoctlState) (uintptr, error) {
}
if ioctlParams.Params.Status == nvgpu.NV_OK {
mapFile.mmapLength = ioctlParams.Params.Length
// src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl() forces
// NVOS33_FLAGS_CACHING_TYPE_DEFAULT. However, resMap implementations
// may override "caching type", so in general the memory type depends
// on the mapped object.
if _, memObj := fi.fd.dev.nvp.getObject(fi.ctx, ioctlParams.Params.HClient, ioctlParams.Params.HMemory); memObj == nil {
// getObject() already logged a warning; silently fall back to UC.
mapFile.mmapMemType = hostarch.MemoryTypeUncacheable
} else {
// TODO(gvisor.dev/issue/11436): Use memObj.impl.MemoryType() here once
// memory object implementations report accurate memory types; until then,
// conservatively fall back to uncacheable:
// memType := memObj.impl.MemoryType()
// if memType == hostarch.MemoryTypeInvalid {
// fi.ctx.Warningf("nvproxy: mapped object with handle %v:%v (class %v, type %T) has unknown memory type", ioctlParams.Params.HClient, ioctlParams.Params.HMemory, memObj.class, memObj.impl)
// memType = hostarch.MemoryTypeUncacheable
// }
// mapFile.mmapMemType = memType
fi.ctx.Warningf("nvproxy: mapping object with handle %v:%v (class %v, type %T)", ioctlParams.Params.HClient, ioctlParams.Params.HMemory, memObj.class, memObj.impl)
mapFile.mmapMemType = hostarch.MemoryTypeUncacheable
}
}

ioctlParams.FD = origFD
Expand Down
Loading

0 comments on commit 578733b

Please sign in to comment.