Commit 578733b

nixprime authored and gvisor-bot committed

kvm: honor memmap.File.MemoryType()

Updates #11436
PiperOrigin-RevId: 723723715
1 parent 6194338 commit 578733b

25 files changed: +327 additions, -88 deletions (5 of the changed files are shown below)

pkg/hostarch/BUILD

Lines changed: 1 addition & 0 deletions
@@ -38,6 +38,7 @@ go_library(
         "hostarch.go",
         "hostarch_arm64.go",
         "hostarch_x86.go",
+        "memory_type.go",
         "sizes_util.go",
     ],
     visibility = ["//:sandbox"],

pkg/hostarch/memory_type.go

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+// Copyright 2025 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package hostarch
+
+// MemoryType specifies CPU memory access behavior with respect to a given
+// virtual memory mapping.
+type MemoryType uint8
+
+const (
+    // MemoryTypeInvalid ensures that the zero value of MemoryType is invalid.
+    MemoryTypeInvalid MemoryType = iota
+
+    // MemoryTypeUncacheable is equivalent to the following architectural
+    // memory types:
+    //
+    // x86: Uncacheable (UC)
+    // ARM64: Device-nGnRnE
+    //
+    // TODO(gvisor.dev/issue/11436): nv-mmap.c:nv_encode_caching()
+    // distinguishes between NV_PGPROT_UNCACHED => Normal-NC and
+    // NV_PGPROT_UNCACHED_DEVICE => Device-nGnRnE; is this just for
+    // performance, or required for correctness? cf. Armv8-M Architecture
+    // Reference Manual B7.16 "Mismatched memory attributes" R_GBKH
+    MemoryTypeUncacheable
+
+    // MemoryTypeWriteCombine is equivalent to the following architectural
+    // memory types:
+    //
+    // x86: Write-combining (WC)
+    // ARM64: Normal non-cacheable
+    //
+    // TODO(gvisor.dev/issue/11436): This is consistent with Linux's
+    // arch/arm64/include/asm/pgtable.h:pgprot_writecombine(). However, it also
+    // corresponds to NV_PGPROT_WRITE_COMBINED in the Nvidia driver (rather
+    // than NV_PGPROT_WRITE_COMBINED_DEVICE => Device-nGnRE), which on ARM64
+    // uses normal cacheable rather than non-cacheable on chipsets for which
+    // PDB_PROP_CL_IS_CHIPSET_IO_COHERENT is true, which seems to be the case
+    // on most systems.
+    MemoryTypeWriteCombine
+
+    // MemoryTypeWriteBack is equivalent to the following architectural memory
+    // types:
+    //
+    // x86: Write-back (WB)
+    // ARM64: Normal write-back cacheable
+    MemoryTypeWriteBack
+)
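
Note: the commit title refers to a memmap.File.MemoryType() accessor that KVM page table mappings are now expected to honor; the memmap interface change itself is not among the files shown here. The standalone sketch below is illustrative only: it assumes an accessor with that name returning hostarch.MemoryType, and shows how the MemoryTypeInvalid zero value lets a caller distinguish "unspecified" from a deliberately chosen memory type.

// Hypothetical sketch, not part of the commit. It assumes a MemoryType()
// accessor on mappable files, as suggested by the commit title.
package main

import (
    "fmt"

    "gvisor.dev/gvisor/pkg/hostarch"
)

// memoryTyped is a hypothetical stand-in for the memmap.File method named in
// the commit title.
type memoryTyped interface {
    MemoryType() hostarch.MemoryType
}

// plainFile never specifies a memory type, so it reports the zero value.
type plainFile struct{}

func (plainFile) MemoryType() hostarch.MemoryType { return hostarch.MemoryTypeInvalid }

// deviceFile represents a mapping that must be uncacheable.
type deviceFile struct{}

func (deviceFile) MemoryType() hostarch.MemoryType { return hostarch.MemoryTypeUncacheable }

// memoryTypeOr returns f's memory type, or def if f left it unspecified.
func memoryTypeOr(f memoryTyped, def hostarch.MemoryType) hostarch.MemoryType {
    if mt := f.MemoryType(); mt != hostarch.MemoryTypeInvalid {
        return mt
    }
    return def
}

func main() {
    fmt.Println(memoryTypeOr(plainFile{}, hostarch.MemoryTypeWriteBack) == hostarch.MemoryTypeWriteBack)    // true
    fmt.Println(memoryTypeOr(deviceFile{}, hostarch.MemoryTypeWriteBack) == hostarch.MemoryTypeUncacheable) // true
}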

pkg/ring0/pagetables/pagetables_aarch64.go

Lines changed: 56 additions & 28 deletions
@@ -52,29 +52,26 @@ func (p *PageTables) TTBR1_EL1(noFlush bool, asid uint16) uint64 {
 
 // Bits in page table entries.
 const (
-    typeTable   = 0x3 << 0
-    typeSect    = 0x1 << 0
-    typePage    = 0x3 << 0
-    pteValid    = 0x1 << 0
-    pteTableBit = 0x1 << 1
-    pteTypeMask = 0x3 << 0
-    present     = pteValid | pteTableBit
-    user        = 0x1 << 6 /* AP[1] */
-    readOnly    = 0x1 << 7 /* AP[2] */
-    accessed    = 0x1 << 10
-    dbm         = 0x1 << 51
-    writable    = dbm
-    cont        = 0x1 << 52
-    pxn         = 0x1 << 53
-    xn          = 0x1 << 54
-    dirty       = 0x1 << 55
-    nG          = 0x1 << 11
-    shared      = 0x3 << 8
-)
-
-const (
-    mtDevicenGnRE = 0x1 << 2
-    mtNormal      = 0x4 << 2
+    typeTable     = 0x3 << 0
+    typeSect      = 0x1 << 0
+    typePage      = 0x3 << 0
+    pteValid      = 0x1 << 0
+    pteTableBit   = 0x1 << 1
+    pteTypeMask   = 0x3 << 0
+    present       = pteValid | pteTableBit
+    attrIndxShift = 2
+    attrIndxMask  = 0x7
+    user          = 0x1 << 6 /* AP[1] */
+    readOnly      = 0x1 << 7 /* AP[2] */
+    accessed      = 0x1 << 10
+    dbm           = 0x1 << 51
+    writable      = dbm
+    cont          = 0x1 << 52
+    pxn           = 0x1 << 53
+    xn            = 0x1 << 54
+    dirty         = 0x1 << 55
+    nG            = 0x1 << 11
+    shared        = 0x3 << 8
 )
 
 const (
@@ -83,6 +80,16 @@ const (
     protDefault = accessed | shared
 )
 
+// MAIR entries. These are not intrinsic to ARM64 but must be configured by
+// writing to MPU_MAIR0/1. These values match Linux's
+// (arch/arm64/include/asm/memory.h:MT_*), which is not necessary but can't
+// hurt.
+const (
+    MAIREntryNormal        = 0
+    MAIREntryNormalNC      = 2
+    MAIREntryDevice_nGnRnE = 3
+)
+
 // MapOpts are x86 options.
 type MapOpts struct {
     // AccessType defines permissions.
@@ -93,6 +100,9 @@ type MapOpts struct {
 
     // User indicates the page is a user page.
     User bool
+
+    // MemoryType is the memory type.
+    MemoryType hostarch.MemoryType
 }
 
 // PTE is a page table entry.
@@ -119,8 +129,7 @@ func (p *PTE) Valid() bool {
 //go:nosplit
 func (p *PTE) Opts() MapOpts {
     v := atomic.LoadUintptr((*uintptr)(p))
-
-    return MapOpts{
+    opts := MapOpts{
         AccessType: hostarch.AccessType{
             Read:  true,
             Write: v&readOnly == 0,
@@ -129,6 +138,15 @@ func (p *PTE) Opts() MapOpts {
         Global: v&nG == 0,
         User:   v&user != 0,
     }
+    switch (v >> attrIndxShift) & attrIndxMask {
+    case MAIREntryNormal:
+        opts.MemoryType = hostarch.MemoryTypeWriteBack
+    case MAIREntryNormalNC:
+        opts.MemoryType = hostarch.MemoryTypeWriteCombine
+    case MAIREntryDevice_nGnRnE:
+        opts.MemoryType = hostarch.MemoryTypeUncacheable
+    }
+    return opts
 }
 
 // SetSect sets this page as a sect page.
@@ -191,11 +209,21 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
 
     if opts.User {
         v |= user
-        v |= mtNormal
     } else {
         v = v &^ user
-        v |= mtNormal
     }
+
+    switch opts.MemoryType {
+    case hostarch.MemoryTypeUncacheable:
+        v |= MAIREntryDevice_nGnRnE << attrIndxShift
+    case hostarch.MemoryTypeWriteCombine:
+        v |= MAIREntryNormalNC << attrIndxShift
+    case hostarch.MemoryTypeWriteBack:
+        v |= MAIREntryNormal << attrIndxShift
+    default:
+        panic("invalid MemoryType")
+    }
+
     atomic.StoreUintptr((*uintptr)(p), v)
 }
 
@@ -209,7 +237,7 @@ func (p *PTE) setPageTable(pt *PageTables, ptes *PTEs) {
         // This should never happen.
         panic("unaligned physical address!")
     }
-    v := addr | typeTable | protDefault | mtNormal
+    v := addr | typeTable | protDefault | (MAIREntryNormal << attrIndxShift)
     atomic.StoreUintptr((*uintptr)(p), v)
 }
 
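
Background on the ARM64 encoding above: a stage 1 page table entry does not carry memory attributes directly. Its AttrIndx field, bits [4:2] of the descriptor (hence attrIndxShift = 2 and attrIndxMask = 0x7), selects one of eight attribute fields previously programmed into the MAIR register by the platform. The standalone sketch below copies the constants from the diff and round-trips the encoding to show which descriptor bits each MemoryType sets; it is not part of the commit. Note also that PTE.Set() panics on MemoryTypeInvalid, so every caller must choose a memory type explicitly, while page table pages themselves are always mapped Normal (write-back) via setPageTable().

// Standalone sketch reproducing the AttrIndx encode/decode performed by
// PTE.Set() and PTE.Opts() above. Constants are copied from the diff.
package main

import "fmt"

const (
    attrIndxShift = 2
    attrIndxMask  = 0x7

    MAIREntryNormal        = 0
    MAIREntryNormalNC      = 2
    MAIREntryDevice_nGnRnE = 3
)

func main() {
    entries := []struct {
        name string
        idx  uintptr
    }{
        {"WriteBack -> Normal", MAIREntryNormal},
        {"WriteCombine -> Normal-NC", MAIREntryNormalNC},
        {"Uncacheable -> Device-nGnRnE", MAIREntryDevice_nGnRnE},
    }
    for _, e := range entries {
        pteBits := e.idx << attrIndxShift                    // what Set() ORs into the PTE
        decoded := (pteBits >> attrIndxShift) & attrIndxMask // what Opts() reads back
        fmt.Printf("%-30s AttrIndx=%d pteBits=%#x decoded=%d\n", e.name, e.idx, pteBits, decoded)
    }
}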

pkg/ring0/pagetables/pagetables_x86.go

Lines changed: 48 additions & 11 deletions
@@ -49,16 +49,28 @@ func (p *PageTables) CR3(noFlush bool, pcid uint16) uint64 {
 
 // Bits in page table entries.
 const (
-    present      = 0x001
-    writable     = 0x002
-    user         = 0x004
-    writeThrough = 0x008
-    cacheDisable = 0x010
-    accessed     = 0x020
-    dirty        = 0x040
-    super        = 0x080
-    global       = 0x100
-    optionMask   = executeDisable | 0xfff
+    present    = 0x001
+    writable   = 0x002
+    user       = 0x004
+    accessed   = 0x020
+    dirty      = 0x040
+    super      = 0x080
+    global     = 0x100
+    optionMask = executeDisable | 0xfff
+
+    pwtShift     = 3
+    patIndexMask = 0x3
+)
+
+// PAT entries. These are not intrinsic to x86 but must be configured by
+// writing to the IA32_PAT MSR. These values match Linux's
+// (arch/x86/mm/pat/memtype.c:pat_bp_init()), which is not necessary but can't
+// hurt. Note that PTE.Set() currently depends on only using the first 4 PAT
+// entries.
+const (
+    PATEntryWB = 0
+    PATEntryWC = 1
+    PATEntryUC = 3
 )
 
 // MapOpts are x86 options.
@@ -71,6 +83,9 @@ type MapOpts struct {
 
     // User indicates the page is a user page.
    User bool
+
+    // MemoryType is the memory type.
+    MemoryType hostarch.MemoryType
 }
 
 // PTE is a page table entry.
@@ -97,7 +112,7 @@ func (p *PTE) Valid() bool {
 //go:nosplit
 func (p *PTE) Opts() MapOpts {
     v := atomic.LoadUintptr((*uintptr)(p))
-    return MapOpts{
+    opts := MapOpts{
         AccessType: hostarch.AccessType{
             Read:  v&present != 0,
             Write: v&writable != 0,
@@ -106,6 +121,15 @@ func (p *PTE) Opts() MapOpts {
         Global: v&global != 0,
         User:   v&user != 0,
     }
+    switch (v >> pwtShift) & patIndexMask {
+    case PATEntryWB:
+        opts.MemoryType = hostarch.MemoryTypeWriteBack
+    case PATEntryWC:
+        opts.MemoryType = hostarch.MemoryTypeWriteCombine
+    case PATEntryUC:
+        opts.MemoryType = hostarch.MemoryTypeUncacheable
+    }
+    return opts
 }
 
 // SetSuper sets this page as a super page.
@@ -154,6 +178,19 @@ func (p *PTE) Set(addr uintptr, opts MapOpts) {
     if opts.AccessType.Write {
         v |= writable | dirty
     }
+    // This assumes that we only use PAT entries 0-3, since the location of the
+    // 3rd bit (the PAT bit) varies depending on page size (and is never bit
+    // 5 == pwtShift + 2).
+    switch opts.MemoryType {
+    case hostarch.MemoryTypeUncacheable:
+        v |= PATEntryUC << pwtShift
+    case hostarch.MemoryTypeWriteCombine:
+        v |= PATEntryWC << pwtShift
+    case hostarch.MemoryTypeWriteBack:
+        v |= PATEntryWB << pwtShift
+    default:
+        panic("invalid MemoryType")
+    }
     if p.IsSuper() {
         // Note that this is inherited from the previous instance. Set
         // does not change the value of Super. See above.
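
Background on the x86 encoding above: a 4 KiB PTE selects a PAT entry through three bits, PWT (bit 3), PCD (bit 4), and PAT (bit 7 for 4 KiB pages, but bit 12 for larger pages). Because only PAT entries 0-3 are used here, the PAT bit is always left clear and the index fits entirely in PCD:PWT, which is why pwtShift = 3 and a 2-bit patIndexMask suffice regardless of page size, as the comment in PTE.Set() notes. (The removed writeThrough = 0x008 and cacheDisable = 0x010 constants named exactly these two bits.) The standalone sketch below copies the constants from the diff and shows the resulting PWT/PCD patterns; it is not part of the commit.

// Standalone sketch reproducing the PAT-index encode/decode performed by
// PTE.Set() and PTE.Opts() above. Constants are copied from the diff.
package main

import "fmt"

const (
    pwtShift     = 3 // PWT is PTE bit 3; PCD is PTE bit 4
    patIndexMask = 0x3

    PATEntryWB = 0
    PATEntryWC = 1
    PATEntryUC = 3
)

func main() {
    entries := []struct {
        name string
        idx  uintptr
    }{
        {"WriteBack -> PAT entry 0 (WB)", PATEntryWB},
        {"WriteCombine -> PAT entry 1 (WC)", PATEntryWC},
        {"Uncacheable -> PAT entry 3 (UC)", PATEntryUC},
    }
    for _, e := range entries {
        pteBits := e.idx << pwtShift                    // what Set() ORs into the PTE
        decoded := (pteBits >> pwtShift) & patIndexMask // what Opts() reads back
        pwt := (pteBits >> 3) & 1
        pcd := (pteBits >> 4) & 1
        fmt.Printf("%-34s PWT=%d PCD=%d decoded index=%d\n", e.name, pwt, pcd, decoded)
    }
}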

pkg/sentry/devices/nvproxy/frontend.go

Lines changed: 51 additions & 3 deletions
@@ -46,8 +46,12 @@ type frontendDevice struct {
     minor uint32
 }
 
+func (dev *frontendDevice) isCtlDevice() bool {
+    return dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR
+}
+
 func (dev *frontendDevice) basename() string {
-    if dev.minor == nvgpu.NV_CONTROL_DEVICE_MINOR {
+    if dev.isCtlDevice() {
         return "nvidiactl"
     }
     return fmt.Sprintf("nvidia%d", dev.minor)
@@ -134,8 +138,9 @@ type frontendFD struct {
     // These fields are marked nosave since we do not automatically reinvoke
     // NV_ESC_RM_MAP_MEMORY after restore, so restored FDs have no
     // mmap_context.
-    mmapLength   uint64  `state:"nosave"`
-    mmapInternal uintptr `state:"nosave"`
+    mmapLength   uint64              `state:"nosave"`
+    mmapInternal uintptr             `state:"nosave"`
+    mmapMemType  hostarch.MemoryType `state:"nosave"`
 
     // clients are handles of clients owned by this frontendFD. clients is
     // protected by dev.nvp.objsMu.
@@ -428,6 +433,12 @@ func rmAllocContextDMA2(fi *frontendIoctlState) (uintptr, error) {
 }
 
 func rmAllocMemory(fi *frontendIoctlState) (uintptr, error) {
+    // This is consistent with the NV_ACTUAL_DEVICE_ONLY() check in
+    // src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl().
+    if fi.fd.dev.isCtlDevice() {
+        return 0, linuxerr.EINVAL
+    }
+
     var ioctlParams nvgpu.IoctlNVOS02ParametersWithFD
     if fi.ioctlParamsSize != nvgpu.SizeofIoctlNVOS02ParametersWithFD {
         return 0, linuxerr.EINVAL
@@ -493,6 +504,19 @@ func rmAllocMemorySystem(fi *frontendIoctlState, ioctlParams *nvgpu.IoctlNVOS02P
         fi.fd.dev.nvp.objAdd(fi.ctx, ioctlParams.Params.HRoot, ioctlParams.Params.HObjectNew, ioctlParams.Params.HClass, &miscObject{}, ioctlParams.Params.HObjectParent)
         if createMmapCtx {
             mapFile.mmapLength = ioctlParams.Params.Limit + 1
+            // Compare kernel-open/nvidia/nv-mmap.c:nvidia_mmap_helper() =>
+            // nv_encode_caching().
+            // - Note that rmAllocMemory() already ensured that fi.fd.dev is
+            //   not nvidiactl, i.e. only the !NV_IS_CTL_DEVICE() branch is
+            //   relevant here.
+            // - nvidia_mmap_helper() honors mmap_context->caching only if
+            //   IS_FB_OFFSET() and !IS_UD_OFFSET(). We can get the information
+            //   we need for IS_FB_OFFSET() from NV_ESC_CARD_INFO, but there
+            //   doesn't seem to be any way for us to replicate IS_UD_OFFSET().
+            //   So we must conservatively specify uncacheable. (This is
+            //   unfortunate since it prevents us from using write-combining on
+            //   framebuffer memory...)
+            mapFile.mmapMemType = hostarch.MemoryTypeUncacheable
         }
     }
     fi.fd.dev.nvp.objsUnlock()
@@ -1311,6 +1335,12 @@ func rmVidHeapControl(fi *frontendIoctlState) (uintptr, error) {
 }
 
 func rmMapMemory(fi *frontendIoctlState) (uintptr, error) {
+    // This is consistent with the NV_CTL_DEVICE_ONLY() check in
+    // src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl().
+    if !fi.fd.dev.isCtlDevice() {
+        return 0, linuxerr.EINVAL
+    }
+
     var ioctlParams nvgpu.IoctlNVOS33ParametersWithFD
     if fi.ioctlParamsSize != nvgpu.SizeofIoctlNVOS33ParametersWithFD {
         return 0, linuxerr.EINVAL
@@ -1343,6 +1373,24 @@ func rmMapMemory(fi *frontendIoctlState) (uintptr, error) {
     }
     if ioctlParams.Params.Status == nvgpu.NV_OK {
         mapFile.mmapLength = ioctlParams.Params.Length
+        // src/nvidia/arch/nvalloc/unix/src/escape.c:RmIoctl() forces
+        // NVOS33_FLAGS_CACHING_TYPE_DEFAULT. However, resMap implementations
+        // may override "caching type", so in general the memory type depends
+        // on the mapped object.
+        if _, memObj := fi.fd.dev.nvp.getObject(fi.ctx, ioctlParams.Params.HClient, ioctlParams.Params.HMemory); memObj == nil {
+            // getObject() already logged a warning; silently fall back to UC.
+            mapFile.mmapMemType = hostarch.MemoryTypeUncacheable
+        } else {
+            // XXX
+            // memType := memObj.impl.MemoryType()
+            // if memType == hostarch.MemoryTypeInvalid {
+            //     fi.ctx.Warningf("nvproxy: mapped object with handle %v:%v (class %v, type %T) has unknown memory type", ioctlParams.Params.HClient, ioctlParams.Params.HMemory, memObj.class, memObj.impl)
+            //     memType = hostarch.MemoryTypeUncacheable
+            // }
+            // mapFile.mmapMemType = memType
+            fi.ctx.Warningf("nvproxy: mapping object with handle %v:%v (class %v, type %T)", ioctlParams.Params.HClient, ioctlParams.Params.HMemory, memObj.class, memObj.impl)
+            mapFile.mmapMemType = hostarch.MemoryTypeUncacheable
+        }
     }
 
     ioctlParams.FD = origFD
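
The hunks above only record a memory type in frontendFD.mmapMemType; the path by which nvproxy's memmap.File implementation reports that value back to the KVM platform is not shown in this section. The sketch below is a guess at what the producer side might look like, assuming (per the commit title) that memmap.File gains a MemoryType() hostarch.MemoryType method and that the type backing frontend mmaps can return the recorded value; the actual receiver, method set, and fallback behavior in gVisor may differ.

// Hypothetical sketch, not taken from the commit. Field names follow the
// diff; the type and method are assumptions.
package nvproxy

import "gvisor.dev/gvisor/pkg/hostarch"

// frontendMemoryFile stands in for whatever type implements memmap.File for
// frontend device mmaps; in the real code this may be frontendFD itself.
type frontendMemoryFile struct {
    mmapMemType hostarch.MemoryType // recorded by rmAllocMemorySystem()/rmMapMemory()
}

// MemoryType returns the memory type that host mappings of this file should
// use. If nothing was recorded (for example after restore, since mmapMemType
// is marked `state:"nosave"`), it conservatively reports uncacheable.
func (f *frontendMemoryFile) MemoryType() hostarch.MemoryType {
    if f.mmapMemType == hostarch.MemoryTypeInvalid {
        return hostarch.MemoryTypeUncacheable
    }
    return f.mmapMemType
}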
