Skip to content

Commit 3dd3785

Browse files
committed
openhcl/tdx: inject machine check on ept exits caused by bad guest addresses (microsoft#1340)
On TDX, we see some cases where the guest attempts to access an address with an incorrect shared bit. It's unclear if this is an issue in OpenHCL, the guest, or the host, but fix OpenHCL crashing with an emulation failure due to a `GuestMemory` access failure, and instead inject a machine check into the guest. For addresses outside of mmio and ram, continue to emulate but log that the guest did something strange. In the future, we may also inject a machine check on that path. Tested via a uefi app that attempts to access a shared page at a private gpa (see https://github.com/chris-oo/openvmm/blob/uefi-tmk-write-to-shared/tmk/tmk_launch/src/main.rs) with the following crash: ``` [kmsg]: [3.058450] virt_mshv_vtl::processor::tdx: WARN guest accessed inaccessible gpa, injecting MC gpa=0x666d9000 is_shared=false [kmsg]: [3.061137] virt_mshv_vtl::processor: WARN Guest has reported system crash crash=VtlCrash { vp_index: VpIndex(0), last_vtl: Vtl0, control: GuestCrashCtl { pre_os_id: 0, no_crash_dump: false, crash_message: true, crash_notify: true }, parameters: [12, 0, 0, 6790a820, 4d7] } [kmsg]: [3.061758] virt_mshv_vtl::processor: WARN Guest has reported a system crash message "!!!! X64 Exception Type - 12(#MC - Machine-Check) CPU Apic ID - 00000000 !!!!<5c>r<5c>nRIP - 00000000666DB030, CS - 0000000000000038, RFLAGS - 0000000000010202<5c>r<5c>nRAX - 00000000666D9000, RCX - 0000000000000042, RDX - 3333333333333333<5c>r<5c>nRBX - 0000000066E00018, RSP - 0000000033D99150, RBP - 0000000033D99190<5c>r<5c>nRSI - 0000000066E54718, RDI - 0000000033DB8160<5c>r<5c>nR8 - 0000000000000000, R9 - 00000000666ED508, R10 - 00000000666ED5EE<5c>r<5c>nR11 - 000000000000000A, R12 - 0000000000000000, R13 - 0000000000000000<5c>r<5c>nR14 - 0000000033D99AA0, R15 - 0000000033D99A98<5c>r<5c>nDS - 0000000000000030, ES - 0000000000000030, FS - 0000000000000030<5c>r<5c>nGS - 0000000000000030, SS - 0000000000000030<5c>r<5c>nCR0 - 0000000080010073, CR2 - 0000000000000000, CR3 - 0000000033801000<5c>r<5c>nCR4 - 0000000000000668, CR8 - 0000000000000000<5c>r<5c>nDR0 - 0000000000000000, DR1 - 0000000000000000, DR2 - 0000000000000000<5c>r<5c>nDR3 - ``` microsoft#1426 tracks implementing this for SNP.
1 parent bcdce69 commit 3dd3785

File tree

3 files changed

+211
-40
lines changed

3 files changed

+211
-40
lines changed

openhcl/virt_mshv_vtl/src/processor/snp/mod.rs

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1370,11 +1370,23 @@ impl UhProcessor<'_, SnpBacked> {
13701370
}
13711371

13721372
SevExitCode::NPF if has_intercept => {
1373-
// Determine whether an NPF needs to be handled. If not, assume this fault is spurious
1374-
// and that the instruction can be retried. The intercept itself may be presented by the
1375-
// hypervisor as either a GPA intercept or an exception intercept.
1376-
// The hypervisor configures the NPT to generate a #VC inside the guest for accesses to
1377-
// unmapped memory. This means that accesses to unmapped memory for lower VTLs will be
1373+
// TODO SNP: This code needs to be fixed to not rely on the
1374+
// hypervisor message to check the validity of the NPF, rather
1375+
// we should look at the SNP hardware exit info only like we do
1376+
// with TDX.
1377+
//
1378+
// TODO SNP: This code should be fixed so we do not attempt to
1379+
// emulate a NPF with an address that has the wrong shared bit,
1380+
// as this will cause the emulator to raise an internal error,
1381+
// and instead inject a machine check like TDX.
1382+
//
1383+
// Determine whether an NPF needs to be handled. If not, assume
1384+
// this fault is spurious and that the instruction can be
1385+
// retried. The intercept itself may be presented by the
1386+
// hypervisor as either a GPA intercept or an exception
1387+
// intercept. The hypervisor configures the NPT to generate a
1388+
// #VC inside the guest for accesses to unmapped memory. This
1389+
// means that accesses to unmapped memory for lower VTLs will be
13781390
// forwarded to underhill as a #VC exception.
13791391
let exit_info2 = vmsa.exit_info2();
13801392
let interruption_pending = vmsa.event_inject().valid()

openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs

Lines changed: 125 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ use virt_support_x86emu::emulate::emulate_insn_memory_op;
7474
use virt_support_x86emu::emulate::emulate_io;
7575
use virt_support_x86emu::emulate::emulate_translate_gva;
7676
use virt_support_x86emu::translate::TranslationRegisters;
77+
use vm_topology::memory::AddressType;
7778
use vmcore::vmtime::VmTimeAccess;
7879
use x86defs::RFlags;
7980
use x86defs::X64_CR0_ET;
@@ -165,11 +166,21 @@ impl TdxExit<'_> {
165166
fn qualification(&self) -> u64 {
166167
self.0.rcx
167168
}
168-
fn gla(&self) -> u64 {
169-
self.0.rdx
169+
fn gla(&self) -> Option<u64> {
170+
// Only valid for EPT exits.
171+
if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
172+
Some(self.0.rdx)
173+
} else {
174+
None
175+
}
170176
}
171-
fn gpa(&self) -> u64 {
172-
self.0.r8
177+
fn gpa(&self) -> Option<u64> {
178+
// Only valid for EPT exits.
179+
if self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION {
180+
Some(self.0.r8)
181+
} else {
182+
None
183+
}
173184
}
174185
fn _exit_interruption_info(&self) -> InterruptionInformation {
175186
(self.0.r9 as u32).into()
@@ -1999,7 +2010,7 @@ impl UhProcessor<'_, TdxBacked> {
19992010
&mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
20002011
}
20012012
VmxExitBasic::EPT_VIOLATION => {
2002-
let gpa = exit_info.gpa();
2013+
let gpa = exit_info.gpa().expect("is EPT exit");
20032014
let ept_info = VmxEptExitQualification::from(exit_info.qualification());
20042015
// If this was an EPT violation while handling an iret, and
20052016
// that iret cleared the NMI blocking state, restore it.
@@ -2016,34 +2027,8 @@ impl UhProcessor<'_, TdxBacked> {
20162027
)
20172028
.into();
20182029
assert!(!old_interruptibility.blocked_by_nmi());
2019-
}
2020-
// TODO TDX: If this is an access to a shared gpa, we need to
2021-
// check the intercept page to see if this is a real exit or
2022-
// spurious. This exit is only real if the hypervisor has
2023-
// delivered an intercept message for this GPA.
2024-
//
2025-
// However, at this point the kernel has cleared that
2026-
// information so some kind of redesign is required to figure
2027-
// this out.
2028-
//
2029-
// For now, we instead treat EPTs on readable RAM as spurious
2030-
// and log appropriately. This check is also not entirely
2031-
// sufficient, as it may be a write access where the page is
2032-
// protected, but we don't yet support MNF/guest VSM so this is
2033-
// okay enough.
2034-
else if self.partition.gm[intercepted_vtl].check_gpa_readable(gpa) {
2035-
tracelimit::warn_ratelimited!(gpa, "possible spurious EPT violation, ignoring");
20362030
} else {
2037-
// Emulate the access.
2038-
self.emulate(
2039-
dev,
2040-
self.backing.vtls[intercepted_vtl]
2041-
.interruption_information
2042-
.valid(),
2043-
intercepted_vtl,
2044-
TdxEmulationCache::default(),
2045-
)
2046-
.await?;
2031+
self.handle_ept(intercepted_vtl, dev, gpa, ept_info).await?;
20472032
}
20482033

20492034
&mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
@@ -2388,6 +2373,111 @@ impl UhProcessor<'_, TdxBacked> {
23882373
self.backing.vtls[vtl].exception_error_code = 0;
23892374
}
23902375

2376+
fn inject_mc(&mut self, vtl: GuestVtl) {
2377+
self.backing.vtls[vtl].interruption_information = InterruptionInformation::new()
2378+
.with_valid(true)
2379+
.with_vector(x86defs::Exception::MACHINE_CHECK.0)
2380+
.with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
2381+
}
2382+
2383+
async fn handle_ept(
2384+
&mut self,
2385+
intercepted_vtl: GuestVtl,
2386+
dev: &impl CpuIo,
2387+
gpa: u64,
2388+
ept_info: VmxEptExitQualification,
2389+
) -> Result<(), VpHaltReason<UhRunVpError>> {
2390+
let vtom = self.partition.caps.vtom.unwrap_or(0);
2391+
let is_shared = (gpa & vtom) == vtom && vtom != 0;
2392+
let canonical_gpa = gpa & !vtom;
2393+
2394+
// Only emulate the access if the gpa is mmio or outside of ram.
2395+
let address_type = self
2396+
.partition
2397+
.lower_vtl_memory_layout
2398+
.probe_address(canonical_gpa);
2399+
2400+
match address_type {
2401+
Some(AddressType::Mmio) => {
2402+
// Emulate the access.
2403+
self.emulate(
2404+
dev,
2405+
self.backing.vtls[intercepted_vtl]
2406+
.interruption_information
2407+
.valid(),
2408+
intercepted_vtl,
2409+
TdxEmulationCache::default(),
2410+
)
2411+
.await?;
2412+
}
2413+
Some(AddressType::Ram) => {
2414+
// TODO TDX: This path changes when we support VTL page
2415+
// protections and MNF. That will require injecting events to
2416+
// VTL1 or other handling.
2417+
//
2418+
// For now, we just check if the exit was suprious or if we
2419+
// should inject a machine check. An exit is considered spurious
2420+
// if the gpa is accessible.
2421+
if self.partition.gm[intercepted_vtl].check_gpa_readable(gpa) {
2422+
tracelimit::warn_ratelimited!(gpa, "possible spurious EPT violation, ignoring");
2423+
} else {
2424+
// TODO: It would be better to show what exact bitmap check
2425+
// failed, but that requires some refactoring of how the
2426+
// different bitmaps are stored. Do this when we support VTL
2427+
// protections or MNF.
2428+
//
2429+
// If we entered this path, it means the bitmap check on
2430+
// `check_gpa_readable` failed, so we can assume that if the
2431+
// address is shared, the actual state of the page is
2432+
// private, and vice versa. This is because the address
2433+
// should have already been checked to be valid memory
2434+
// described to the guest or not.
2435+
tracelimit::warn_ratelimited!(
2436+
gpa,
2437+
is_shared,
2438+
?ept_info,
2439+
"guest accessed inaccessible gpa, injecting MC"
2440+
);
2441+
2442+
// TODO: Implement IA32_MCG_STATUS MSR for more reporting
2443+
self.inject_mc(intercepted_vtl);
2444+
}
2445+
}
2446+
None => {
2447+
if !self.cvm_partition().hide_isolation {
2448+
// TODO: Addresses outside of ram and mmio probably should
2449+
// not be accessed by the guest, if it has been told about
2450+
// isolation. While it's okay as we will return FFs or
2451+
// discard writes for addresses that are not mmio, we should
2452+
// consider if instead we should also inject a machine check
2453+
// for such accesses. The guest should not access any
2454+
// addresses not described to it.
2455+
//
2456+
// For now, log that the guest did this.
2457+
tracelimit::warn_ratelimited!(
2458+
gpa,
2459+
is_shared,
2460+
?ept_info,
2461+
"guest accessed gpa not described in memory layout, emulating anyways"
2462+
);
2463+
}
2464+
2465+
// Emulate the access.
2466+
self.emulate(
2467+
dev,
2468+
self.backing.vtls[intercepted_vtl]
2469+
.interruption_information
2470+
.valid(),
2471+
intercepted_vtl,
2472+
TdxEmulationCache::default(),
2473+
)
2474+
.await?;
2475+
}
2476+
}
2477+
2478+
Ok(())
2479+
}
2480+
23912481
fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
23922482
let regs = self.runner.tdx_enter_guest_gps();
23932483
if regs[TdxGp::R10] == 0 {
@@ -2791,7 +2881,7 @@ impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
27912881
}
27922882

27932883
fn physical_address(&self) -> Option<u64> {
2794-
Some(TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa())
2884+
TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
27952885
}
27962886

27972887
fn initial_gva_translation(&self) -> Option<virt_support_x86emu::emulate::InitialTranslation> {
@@ -2802,8 +2892,8 @@ impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
28022892
&& ept_info.gva_valid()
28032893
{
28042894
Some(virt_support_x86emu::emulate::InitialTranslation {
2805-
gva: exit_info.gla(),
2806-
gpa: exit_info.gpa(),
2895+
gva: exit_info.gla().expect("already validated EPT exit"),
2896+
gpa: exit_info.gpa().expect("already validated EPT exit"),
28072897
translate_mode: match ept_info.access_mask() {
28082898
0x1 => TranslateMode::Read,
28092899
// As defined in "Table 28-7. Exit Qualification for EPT

vm/vmcore/vm_topology/src/memory.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,15 @@ fn validate_ranges_core<T>(ranges: &[T], getter: impl Fn(&T) -> &MemoryRange) ->
100100
Ok(())
101101
}
102102

103+
/// The type backing an address.
104+
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
105+
pub enum AddressType {
106+
/// The address describes ram.
107+
Ram,
108+
/// The address describes mmio.
109+
Mmio,
110+
}
111+
103112
impl MemoryLayout {
104113
/// Makes a new memory layout for a guest with `ram_size` bytes of memory
105114
/// and MMIO gaps at the locations specified by `gaps`.
@@ -282,6 +291,27 @@ impl MemoryLayout {
282291
pub fn end_of_ram_or_mmio(&self) -> u64 {
283292
std::cmp::max(self.mmio.last().expect("mmio set").end(), self.end_of_ram())
284293
}
294+
295+
/// Probe a given address to see if it is in the memory layout described by
296+
/// `self`. Returns the [`AddressType`] of the address if it is in the
297+
/// layout.
298+
///
299+
/// This does not check the vtl2_range.
300+
pub fn probe_address(&self, address: u64) -> Option<AddressType> {
301+
let ranges = self
302+
.ram
303+
.iter()
304+
.map(|r| (&r.range, AddressType::Ram))
305+
.chain(self.mmio.iter().map(|r| (r, AddressType::Mmio)));
306+
307+
for (range, address_type) in ranges {
308+
if range.contains_addr(address) {
309+
return Some(address_type);
310+
}
311+
}
312+
313+
None
314+
}
285315
}
286316

287317
#[cfg(test)]
@@ -386,4 +416,43 @@ mod tests {
386416
];
387417
MemoryLayout::new_from_ranges(ram, mmio).unwrap_err();
388418
}
419+
420+
#[test]
421+
fn probe_address() {
422+
let mmio = &[
423+
MemoryRange::new(GB..2 * GB),
424+
MemoryRange::new(3 * GB..4 * GB),
425+
];
426+
let ram = &[
427+
MemoryRangeWithNode {
428+
range: MemoryRange::new(0..GB),
429+
vnode: 0,
430+
},
431+
MemoryRangeWithNode {
432+
range: MemoryRange::new(2 * GB..3 * GB),
433+
vnode: 0,
434+
},
435+
MemoryRangeWithNode {
436+
range: MemoryRange::new(4 * GB..TB + 2 * GB),
437+
vnode: 0,
438+
},
439+
];
440+
441+
let layout = MemoryLayout::new_from_ranges(ram, mmio).unwrap();
442+
443+
assert_eq!(layout.probe_address(0), Some(AddressType::Ram));
444+
assert_eq!(layout.probe_address(256), Some(AddressType::Ram));
445+
assert_eq!(layout.probe_address(2 * GB), Some(AddressType::Ram));
446+
assert_eq!(layout.probe_address(4 * GB), Some(AddressType::Ram));
447+
assert_eq!(layout.probe_address(TB), Some(AddressType::Ram));
448+
assert_eq!(layout.probe_address(TB + 1), Some(AddressType::Ram));
449+
450+
assert_eq!(layout.probe_address(GB), Some(AddressType::Mmio));
451+
assert_eq!(layout.probe_address(GB + 123), Some(AddressType::Mmio));
452+
assert_eq!(layout.probe_address(3 * GB), Some(AddressType::Mmio));
453+
454+
assert_eq!(layout.probe_address(TB + 2 * GB), None);
455+
assert_eq!(layout.probe_address(TB + 3 * GB), None);
456+
assert_eq!(layout.probe_address(4 * TB), None);
457+
}
389458
}

0 commit comments

Comments
 (0)