Skip to content

openhcl/tdx: inject machine check on ept exits caused by bad guest addresses #1340

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions openhcl/virt_mshv_vtl/src/processor/snp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1370,11 +1370,23 @@ impl UhProcessor<'_, SnpBacked> {
}

SevExitCode::NPF if has_intercept => {
// Determine whether an NPF needs to be handled. If not, assume this fault is spurious
// and that the instruction can be retried. The intercept itself may be presented by the
// hypervisor as either a GPA intercept or an exception intercept.
// The hypervisor configures the NPT to generate a #VC inside the guest for accesses to
// unmapped memory. This means that accesses to unmapped memory for lower VTLs will be
// TODO SNP: This code needs to be fixed to not rely on the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure? I thought Alex said this was necessary. Doesn't the legacy HCL do this, too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I talked with Jon offline and he said we shouldn't be relying on the hypervisor for anything - the hardware exit info on both SNP and TDX should have what we need. I haven't looked at the legacy HCL in detail but he said that's what he believes to be the case after some refactoring.

// hypervisor message to check the validity of the NPF, rather
// we should look at the SNP hardware exit info only like we do
// with TDX.
//
// TODO SNP: This code should be fixed so we do not attempt to
// emulate a NPF with an address that has the wrong shared bit,
// as this will cause the emulator to raise an internal error,
// and instead inject a machine check like TDX.
//
// Determine whether an NPF needs to be handled. If not, assume
// this fault is spurious and that the instruction can be
// retried. The intercept itself may be presented by the
// hypervisor as either a GPA intercept or an exception
// intercept. The hypervisor configures the NPT to generate a
// #VC inside the guest for accesses to unmapped memory. This
// means that accesses to unmapped memory for lower VTLs will be
// forwarded to underhill as a #VC exception.
let exit_info2 = vmsa.exit_info2();
let interruption_pending = vmsa.event_inject().valid()
Expand Down
160 changes: 125 additions & 35 deletions openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ use virt_support_x86emu::emulate::emulate_insn_memory_op;
use virt_support_x86emu::emulate::emulate_io;
use virt_support_x86emu::emulate::emulate_translate_gva;
use virt_support_x86emu::translate::TranslationRegisters;
use vm_topology::memory::AddressType;
use vmcore::vmtime::VmTimeAccess;
use x86defs::RFlags;
use x86defs::X64_CR0_ET;
Expand Down Expand Up @@ -165,11 +166,21 @@ impl TdxExit<'_> {
// Exit qualification, read from the rcx field of the VP enter exit info.
fn qualification(&self) -> u64 {
    self.0.rcx
}
fn gla(&self) -> u64 {
self.0.rdx
/// The guest linear address for this exit, if the exit reason provides
/// one. The hardware only populates this field for EPT violation exits;
/// any other exit reason reports `None`.
fn gla(&self) -> Option<u64> {
    match self.code().vmx_exit().basic_reason() {
        VmxExitBasic::EPT_VIOLATION => Some(self.0.rdx),
        _ => None,
    }
}
fn gpa(&self) -> u64 {
self.0.r8
/// The guest physical address for this exit, if the exit reason provides
/// one. The hardware only populates this field for EPT violation exits;
/// any other exit reason reports `None`.
fn gpa(&self) -> Option<u64> {
    let is_ept_violation =
        self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION;
    is_ept_violation.then_some(self.0.r8)
}
fn _exit_interruption_info(&self) -> InterruptionInformation {
(self.0.r9 as u32).into()
Expand Down Expand Up @@ -1997,7 +2008,7 @@ impl UhProcessor<'_, TdxBacked> {
&mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
}
VmxExitBasic::EPT_VIOLATION => {
let gpa = exit_info.gpa();
let gpa = exit_info.gpa().expect("is EPT exit");
let ept_info = VmxEptExitQualification::from(exit_info.qualification());
// If this was an EPT violation while handling an iret, and
// that iret cleared the NMI blocking state, restore it.
Expand All @@ -2014,34 +2025,8 @@ impl UhProcessor<'_, TdxBacked> {
)
.into();
assert!(!old_interruptibility.blocked_by_nmi());
}
// TODO TDX: If this is an access to a shared gpa, we need to
// check the intercept page to see if this is a real exit or
// spurious. This exit is only real if the hypervisor has
// delivered an intercept message for this GPA.
//
// However, at this point the kernel has cleared that
// information so some kind of redesign is required to figure
// this out.
//
// For now, we instead treat EPTs on readable RAM as spurious
// and log appropriately. This check is also not entirely
// sufficient, as it may be a write access where the page is
// protected, but we don't yet support MNF/guest VSM so this is
// okay enough.
else if self.partition.gm[intercepted_vtl].check_gpa_readable(gpa) {
tracelimit::warn_ratelimited!(gpa, "possible spurious EPT violation, ignoring");
} else {
// Emulate the access.
self.emulate(
dev,
self.backing.vtls[intercepted_vtl]
.interruption_information
.valid(),
intercepted_vtl,
TdxEmulationCache::default(),
)
.await?;
self.handle_ept(intercepted_vtl, dev, gpa, ept_info).await?;
}

&mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
Expand Down Expand Up @@ -2386,6 +2371,111 @@ impl UhProcessor<'_, TdxBacked> {
self.backing.vtls[vtl].exception_error_code = 0;
}

/// Queues a #MC (machine check) hardware exception for injection into
/// `vtl` on the next entry to that VTL.
fn inject_mc(&mut self, vtl: GuestVtl) {
    let machine_check = InterruptionInformation::new()
        .with_valid(true)
        .with_vector(x86defs::Exception::MACHINE_CHECK.0)
        .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
    self.backing.vtls[vtl].interruption_information = machine_check;
}

/// Handles an EPT violation exit for `gpa` in `intercepted_vtl`: emulates
/// the access when the address is MMIO or outside the described memory
/// layout, ignores it when it looks spurious, and injects a machine check
/// when the guest touched RAM it cannot access.
async fn handle_ept(
    &mut self,
    intercepted_vtl: GuestVtl,
    dev: &impl CpuIo,
    gpa: u64,
    ept_info: VmxEptExitQualification,
) -> Result<(), VpHaltReason<UhRunVpError>> {
    // Strip the vTOM (shared) bit, if the partition has one, so the
    // address can be matched against the memory layout, which describes
    // canonical addresses. `is_shared` remembers whether the guest used
    // the shared alias of the page.
    let vtom = self.partition.caps.vtom.unwrap_or(0);
    let is_shared = (gpa & vtom) == vtom && vtom != 0;
    let canonical_gpa = gpa & !vtom;

    // Only emulate the access if the gpa is mmio or outside of ram.
    let address_type = self
        .partition
        .lower_vtl_memory_layout
        .probe_address(canonical_gpa);

    match address_type {
        Some(AddressType::Mmio) => {
            // Emulate the access.
            self.emulate(
                dev,
                self.backing.vtls[intercepted_vtl]
                    .interruption_information
                    .valid(),
                intercepted_vtl,
                TdxEmulationCache::default(),
            )
            .await?;
        }
        Some(AddressType::Ram) => {
            // TODO TDX: This path changes when we support VTL page
            // protections and MNF. That will require injecting events to
            // VTL1 or other handling.
            //
            // For now, we just check if the exit was spurious or if we
            // should inject a machine check. An exit is considered spurious
            // if the gpa is accessible.
            if self.partition.gm[intercepted_vtl].check_gpa_readable(gpa) {
                tracelimit::warn_ratelimited!(gpa, "possible spurious EPT violation, ignoring");
            } else {
                // TODO: It would be better to show what exact bitmap check
                // failed, but that requires some refactoring of how the
                // different bitmaps are stored. Do this when we support VTL
                // protections or MNF.
                //
                // If we entered this path, it means the bitmap check on
                // `check_gpa_readable` failed, so we can assume that if the
                // address is shared, the actual state of the page is
                // private, and vice versa. This is because the address
                // should have already been checked to be valid memory
                // described to the guest or not.
                tracelimit::warn_ratelimited!(
                    gpa,
                    is_shared,
                    ?ept_info,
                    "guest accessed inaccessible gpa, injecting MC"
                );

                // TODO: Implement IA32_MCG_STATUS MSR for more reporting
                self.inject_mc(intercepted_vtl);
            }
        }
        None => {
            if !self.cvm_partition().hide_isolation {
                // TODO: Addresses outside of ram and mmio probably should
                // not be accessed by the guest, if it has been told about
                // isolation. While it's okay as we will return FFs or
                // discard writes for addresses that are not mmio, we should
                // consider if instead we should also inject a machine check
                // for such accesses. The guest should not access any
                // addresses not described to it.
                //
                // For now, log that the guest did this.
                tracelimit::warn_ratelimited!(
                    gpa,
                    is_shared,
                    ?ept_info,
                    "guest accessed gpa not described in memory layout, emulating anyways"
                );
            }

            // Emulate the access.
            self.emulate(
                dev,
                self.backing.vtls[intercepted_vtl]
                    .interruption_information
                    .valid(),
                intercepted_vtl,
                TdxEmulationCache::default(),
            )
            .await?;
        }
    }

    Ok(())
}

fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
let regs = self.runner.tdx_enter_guest_gps();
if regs[TdxGp::R10] == 0 {
Expand Down Expand Up @@ -2789,7 +2879,7 @@ impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
}

fn physical_address(&self) -> Option<u64> {
Some(TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa())
TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
}

fn initial_gva_translation(
Expand All @@ -2802,8 +2892,8 @@ impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
&& ept_info.gva_valid()
{
Some(virt_support_x86emu::emulate::InitialTranslation {
gva: exit_info.gla(),
gpa: exit_info.gpa(),
gva: exit_info.gla().expect("already validated EPT exit"),
gpa: exit_info.gpa().expect("already validated EPT exit"),
translate_mode: match ept_info.access_mask() {
0x1 => TranslateMode::Read,
// As defined in "Table 28-7. Exit Qualification for EPT
Expand Down
69 changes: 69 additions & 0 deletions vm/vmcore/vm_topology/src/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ fn validate_ranges_core<T>(ranges: &[T], getter: impl Fn(&T) -> &MemoryRange) ->
Ok(())
}

/// The type of backing for an address in the memory layout.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum AddressType {
    /// The address describes RAM.
    Ram,
    /// The address describes MMIO.
    Mmio,
}

impl MemoryLayout {
/// Makes a new memory layout for a guest with `ram_size` bytes of memory
/// and MMIO gaps at the locations specified by `gaps`.
Expand Down Expand Up @@ -282,6 +291,27 @@ impl MemoryLayout {
pub fn end_of_ram_or_mmio(&self) -> u64 {
std::cmp::max(self.mmio.last().expect("mmio set").end(), self.end_of_ram())
}

/// Probe a given address to see if it is in the memory layout described by
/// `self`. Returns the [`AddressType`] of the address if it is in the
/// layout.
///
/// This does not check the vtl2_range.
/// Probe a given address to see if it is in the memory layout described by
/// `self`. Returns the [`AddressType`] of the address if it is in the
/// layout.
///
/// This does not check the vtl2_range.
pub fn probe_address(&self, address: u64) -> Option<AddressType> {
    // Walk RAM ranges first, then the MMIO gaps, returning the type of
    // the first range containing the address.
    self.ram
        .iter()
        .map(|r| (&r.range, AddressType::Ram))
        .chain(self.mmio.iter().map(|r| (r, AddressType::Mmio)))
        .find_map(|(range, address_type)| {
            range.contains_addr(address).then_some(address_type)
        })
}
}

#[cfg(test)]
Expand Down Expand Up @@ -386,4 +416,43 @@ mod tests {
];
MemoryLayout::new_from_ranges(ram, mmio).unwrap_err();
}

#[test]
fn probe_address() {
    // Layout: RAM [0, 1G), MMIO [1G, 2G), RAM [2G, 3G), MMIO [3G, 4G),
    // RAM [4G, 1T+2G); everything past 1T+2G is unmapped.
    let mmio = &[
        MemoryRange::new(GB..2 * GB),
        MemoryRange::new(3 * GB..4 * GB),
    ];
    let ram = &[
        MemoryRangeWithNode {
            range: MemoryRange::new(0..GB),
            vnode: 0,
        },
        MemoryRangeWithNode {
            range: MemoryRange::new(2 * GB..3 * GB),
            vnode: 0,
        },
        MemoryRangeWithNode {
            range: MemoryRange::new(4 * GB..TB + 2 * GB),
            vnode: 0,
        },
    ];

    let layout = MemoryLayout::new_from_ranges(ram, mmio).unwrap();

    // Addresses that fall inside a RAM range.
    for addr in [0, 256, 2 * GB, 4 * GB, TB, TB + 1] {
        assert_eq!(layout.probe_address(addr), Some(AddressType::Ram));
    }

    // Addresses that fall inside an MMIO gap.
    for addr in [GB, GB + 123, 3 * GB] {
        assert_eq!(layout.probe_address(addr), Some(AddressType::Mmio));
    }

    // Addresses outside the layout entirely.
    for addr in [TB + 2 * GB, TB + 3 * GB, 4 * TB] {
        assert_eq!(layout.probe_address(addr), None);
    }
}
}