Skip to content

openhcl/tdx: inject machine check on ept exits caused by bad guest addresses #1340

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions openhcl/virt_mshv_vtl/src/processor/snp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1370,11 +1370,23 @@ impl UhProcessor<'_, SnpBacked> {
}

SevExitCode::NPF if has_intercept => {
// Determine whether an NPF needs to be handled. If not, assume this fault is spurious
// and that the instruction can be retried. The intercept itself may be presented by the
// hypervisor as either a GPA intercept or an exception intercept.
// The hypervisor configures the NPT to generate a #VC inside the guest for accesses to
// unmapped memory. This means that accesses to unmapped memory for lower VTLs will be
// TODO SNP: This code needs to be fixed to not rely on the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure? I thought Alex said this was necessary. Doesn't the legacy HCL do this, too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I talked with Jon offline and he said we shouldn't be relying on the hypervisor for anything - the hardware exit info on both SNP and TDX should have what we need. I haven't looked at the legacy HCL in detail but he said that's what he believes to be the case after some refactoring.

// hypervisor message to check the validity of the NPF, rather
// we should look at the SNP hardware exit info only like we do
// with TDX.
//
// TODO SNP: This code should be fixed so we do not attempt to
// emulate a NPF with an address that has the wrong shared bit,
// as this will cause the emulator to raise an internal error,
// and instead inject a machine check like TDX.
//
// Determine whether an NPF needs to be handled. If not, assume
// this fault is spurious and that the instruction can be
// retried. The intercept itself may be presented by the
// hypervisor as either a GPA intercept or an exception
// intercept. The hypervisor configures the NPT to generate a
// #VC inside the guest for accesses to unmapped memory. This
// means that accesses to unmapped memory for lower VTLs will be
// forwarded to underhill as a #VC exception.
let exit_info2 = vmsa.exit_info2();
let interruption_pending = vmsa.event_inject().valid()
Expand Down
160 changes: 125 additions & 35 deletions openhcl/virt_mshv_vtl/src/processor/tdx/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ use virt_support_x86emu::emulate::emulate_insn_memory_op;
use virt_support_x86emu::emulate::emulate_io;
use virt_support_x86emu::emulate::emulate_translate_gva;
use virt_support_x86emu::translate::TranslationRegisters;
use vm_topology::memory::AddressType;
use vmcore::vmtime::VmTimeAccess;
use x86defs::RFlags;
use x86defs::X64_CR0_ET;
Expand Down Expand Up @@ -165,11 +166,21 @@ impl TdxExit<'_> {
// Exit qualification, read from the rcx field of the VP enter exit info.
fn qualification(&self) -> u64 {
    self.0.rcx
}
fn gla(&self) -> u64 {
self.0.rdx
/// The guest linear address for this exit, if the exit reason provides
/// one. The hardware only populates this field for EPT violation exits;
/// any other exit reason reports `None`.
fn gla(&self) -> Option<u64> {
    match self.code().vmx_exit().basic_reason() {
        VmxExitBasic::EPT_VIOLATION => Some(self.0.rdx),
        _ => None,
    }
}
fn gpa(&self) -> u64 {
self.0.r8
/// The guest physical address for this exit, if the exit reason provides
/// one. The hardware only populates this field for EPT violation exits;
/// any other exit reason reports `None`.
fn gpa(&self) -> Option<u64> {
    let is_ept_violation =
        self.code().vmx_exit().basic_reason() == VmxExitBasic::EPT_VIOLATION;
    is_ept_violation.then_some(self.0.r8)
}
fn _exit_interruption_info(&self) -> InterruptionInformation {
(self.0.r9 as u32).into()
Expand Down Expand Up @@ -1997,7 +2008,7 @@ impl UhProcessor<'_, TdxBacked> {
&mut self.backing.vtls[intercepted_vtl].exit_stats.wbinvd
}
VmxExitBasic::EPT_VIOLATION => {
let gpa = exit_info.gpa();
let gpa = exit_info.gpa().expect("is EPT exit");
let ept_info = VmxEptExitQualification::from(exit_info.qualification());
// If this was an EPT violation while handling an iret, and
// that iret cleared the NMI blocking state, restore it.
Expand All @@ -2014,34 +2025,8 @@ impl UhProcessor<'_, TdxBacked> {
)
.into();
assert!(!old_interruptibility.blocked_by_nmi());
}
// TODO TDX: If this is an access to a shared gpa, we need to
// check the intercept page to see if this is a real exit or
// spurious. This exit is only real if the hypervisor has
// delivered an intercept message for this GPA.
//
// However, at this point the kernel has cleared that
// information so some kind of redesign is required to figure
// this out.
//
// For now, we instead treat EPTs on readable RAM as spurious
// and log appropriately. This check is also not entirely
// sufficient, as it may be a write access where the page is
// protected, but we don't yet support MNF/guest VSM so this is
// okay enough.
else if self.partition.gm[intercepted_vtl].check_gpa_readable(gpa) {
tracelimit::warn_ratelimited!(gpa, "possible spurious EPT violation, ignoring");
} else {
// Emulate the access.
self.emulate(
dev,
self.backing.vtls[intercepted_vtl]
.interruption_information
.valid(),
intercepted_vtl,
TdxEmulationCache::default(),
)
.await?;
self.handle_ept(intercepted_vtl, dev, gpa, ept_info).await?;
}

&mut self.backing.vtls[intercepted_vtl].exit_stats.ept_violation
Expand Down Expand Up @@ -2386,6 +2371,111 @@ impl UhProcessor<'_, TdxBacked> {
self.backing.vtls[vtl].exception_error_code = 0;
}

/// Queues a #MC (machine check) hardware exception for injection into
/// `vtl` on the next entry to that VTL.
fn inject_mc(&mut self, vtl: GuestVtl) {
    let machine_check = InterruptionInformation::new()
        .with_valid(true)
        .with_vector(x86defs::Exception::MACHINE_CHECK.0)
        .with_interruption_type(INTERRUPT_TYPE_HARDWARE_EXCEPTION);
    self.backing.vtls[vtl].interruption_information = machine_check;
}

/// Handles an EPT violation exit for `gpa` in `intercepted_vtl`: emulates
/// the access when the address is MMIO or outside the described memory
/// layout, ignores it when it looks spurious, and injects a machine check
/// when the guest touched RAM it cannot access.
async fn handle_ept(
    &mut self,
    intercepted_vtl: GuestVtl,
    dev: &impl CpuIo,
    gpa: u64,
    ept_info: VmxEptExitQualification,
) -> Result<(), VpHaltReason<UhRunVpError>> {
    // Strip the vTOM (shared) bit, if the partition has one, so the
    // address can be matched against the memory layout, which describes
    // canonical addresses. `is_shared` remembers whether the guest used
    // the shared alias of the page.
    let vtom = self.partition.caps.vtom.unwrap_or(0);
    let is_shared = (gpa & vtom) == vtom && vtom != 0;
    let canonical_gpa = gpa & !vtom;

    // Only emulate the access if the gpa is mmio or outside of ram.
    let address_type = self
        .partition
        .lower_vtl_memory_layout
        .probe_address(canonical_gpa);

    match address_type {
        Some(AddressType::Mmio) => {
            // Emulate the access.
            self.emulate(
                dev,
                self.backing.vtls[intercepted_vtl]
                    .interruption_information
                    .valid(),
                intercepted_vtl,
                TdxEmulationCache::default(),
            )
            .await?;
        }
        Some(AddressType::Ram) => {
            // TODO TDX: This path changes when we support VTL page
            // protections and MNF. That will require injecting events to
            // VTL1 or other handling.
            //
            // For now, we just check if the exit was spurious or if we
            // should inject a machine check. An exit is considered spurious
            // if the gpa is accessible.
            if self.partition.gm[intercepted_vtl].check_gpa_readable(gpa) {
                tracelimit::warn_ratelimited!(gpa, "possible spurious EPT violation, ignoring");
            } else {
                // TODO: It would be better to show what exact bitmap check
                // failed, but that requires some refactoring of how the
                // different bitmaps are stored. Do this when we support VTL
                // protections or MNF.
                //
                // If we entered this path, it means the bitmap check on
                // `check_gpa_readable` failed, so we can assume that if the
                // address is shared, the actual state of the page is
                // private, and vice versa. This is because the address
                // should have already been checked to be valid memory
                // described to the guest or not.
                tracelimit::warn_ratelimited!(
                    gpa,
                    is_shared,
                    ?ept_info,
                    "guest accessed inaccessible gpa, injecting MC"
                );

                // TODO: Implement IA32_MCG_STATUS MSR for more reporting
                self.inject_mc(intercepted_vtl);
            }
        }
        None => {
            if !self.cvm_partition().hide_isolation {
                // TODO: Addresses outside of ram and mmio probably should
                // not be accessed by the guest, if it has been told about
                // isolation. While it's okay as we will return FFs or
                // discard writes for addresses that are not mmio, we should
                // consider if instead we should also inject a machine check
                // for such accesses. The guest should not access any
                // addresses not described to it.
                //
                // For now, log that the guest did this.
                tracelimit::warn_ratelimited!(
                    gpa,
                    is_shared,
                    ?ept_info,
                    "guest accessed gpa not described in memory layout, emulating anyways"
                );
            }

            // Emulate the access.
            self.emulate(
                dev,
                self.backing.vtls[intercepted_vtl]
                    .interruption_information
                    .valid(),
                intercepted_vtl,
                TdxEmulationCache::default(),
            )
            .await?;
        }
    }

    Ok(())
}

fn handle_tdvmcall(&mut self, dev: &impl CpuIo, intercepted_vtl: GuestVtl) {
let regs = self.runner.tdx_enter_guest_gps();
if regs[TdxGp::R10] == 0 {
Expand Down Expand Up @@ -2789,7 +2879,7 @@ impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
}

fn physical_address(&self) -> Option<u64> {
Some(TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa())
TdxExit(self.vp.runner.tdx_vp_enter_exit_info()).gpa()
}

fn initial_gva_translation(
Expand All @@ -2802,8 +2892,8 @@ impl<T: CpuIo> X86EmulatorSupport for UhEmulationState<'_, '_, T, TdxBacked> {
&& ept_info.gva_valid()
{
Some(virt_support_x86emu::emulate::InitialTranslation {
gva: exit_info.gla(),
gpa: exit_info.gpa(),
gva: exit_info.gla().expect("already validated EPT exit"),
gpa: exit_info.gpa().expect("already validated EPT exit"),
translate_mode: match ept_info.access_mask() {
0x1 => TranslateMode::Read,
// As defined in "Table 28-7. Exit Qualification for EPT
Expand Down
69 changes: 69 additions & 0 deletions vm/vmcore/vm_topology/src/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@ fn validate_ranges_core<T>(ranges: &[T], getter: impl Fn(&T) -> &MemoryRange) ->
Ok(())
}

/// The type of backing for an address in the memory layout.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum AddressType {
    /// The address describes RAM.
    Ram,
    /// The address describes MMIO.
    Mmio,
}

impl MemoryLayout {
/// Makes a new memory layout for a guest with `ram_size` bytes of memory
/// and MMIO gaps at the locations specified by `gaps`.
Expand Down Expand Up @@ -282,6 +291,27 @@ impl MemoryLayout {
pub fn end_of_ram_or_mmio(&self) -> u64 {
std::cmp::max(self.mmio.last().expect("mmio set").end(), self.end_of_ram())
}

/// Probe a given address to see if it is in the memory layout described by
/// `self`. Returns the [`AddressType`] of the address if it is in the
/// layout.
///
/// This does not check the vtl2_range.
/// Probe a given address to see if it is in the memory layout described by
/// `self`. Returns the [`AddressType`] of the address if it is in the
/// layout.
///
/// This does not check the vtl2_range.
pub fn probe_address(&self, address: u64) -> Option<AddressType> {
    // Walk RAM ranges first, then the MMIO gaps, returning the type of
    // the first range containing the address.
    self.ram
        .iter()
        .map(|r| (&r.range, AddressType::Ram))
        .chain(self.mmio.iter().map(|r| (r, AddressType::Mmio)))
        .find_map(|(range, address_type)| {
            range.contains_addr(address).then_some(address_type)
        })
}
}

#[cfg(test)]
Expand Down Expand Up @@ -386,4 +416,43 @@ mod tests {
];
MemoryLayout::new_from_ranges(ram, mmio).unwrap_err();
}

#[test]
fn probe_address() {
    // Layout: RAM [0, 1G), MMIO [1G, 2G), RAM [2G, 3G), MMIO [3G, 4G),
    // RAM [4G, 1T+2G); everything past 1T+2G is unmapped.
    let mmio = &[
        MemoryRange::new(GB..2 * GB),
        MemoryRange::new(3 * GB..4 * GB),
    ];
    let ram = &[
        MemoryRangeWithNode {
            range: MemoryRange::new(0..GB),
            vnode: 0,
        },
        MemoryRangeWithNode {
            range: MemoryRange::new(2 * GB..3 * GB),
            vnode: 0,
        },
        MemoryRangeWithNode {
            range: MemoryRange::new(4 * GB..TB + 2 * GB),
            vnode: 0,
        },
    ];

    let layout = MemoryLayout::new_from_ranges(ram, mmio).unwrap();

    // Addresses that fall inside a RAM range.
    for addr in [0, 256, 2 * GB, 4 * GB, TB, TB + 1] {
        assert_eq!(layout.probe_address(addr), Some(AddressType::Ram));
    }

    // Addresses that fall inside an MMIO gap.
    for addr in [GB, GB + 123, 3 * GB] {
        assert_eq!(layout.probe_address(addr), Some(AddressType::Mmio));
    }

    // Addresses outside the layout entirely.
    for addr in [TB + 2 * GB, TB + 3 * GB, 4 * TB] {
        assert_eq!(layout.probe_address(addr), None);
    }
}
}