Skip to content

Commit

Permalink
Merge commit '525e68639d18aa59820380179774a5489495f60a' into main-liv…
Browse files Browse the repository at this point in the history
…e-migration-pvm
  • Loading branch information
pojntfx committed Feb 4, 2025
2 parents cb42196 + 525e686 commit c72c6a2
Show file tree
Hide file tree
Showing 9 changed files with 165 additions and 22 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,13 @@ and this project adheres to

### Added

- [#4987](https://github.com/firecracker-microvm/firecracker/pull/4987): Reset
physical counter register (`CNTPCT_EL0`) on VM startup. This prevents the VM
from reading the host physical counter value. This is only possible on 6.4 and
newer kernels. On older kernels the physical counter will still be passed to
the guest unmodified. See more info
[here](https://github.com/firecracker-microvm/firecracker/blob/main/docs/prod-host-setup.md#arm-only-vm-physical-counter-behaviour)

### Changed

- [#4913](https://github.com/firecracker-microvm/firecracker/pull/4913): Removed
Expand Down
18 changes: 9 additions & 9 deletions docs/prod-host-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -328,13 +328,16 @@ For vendor-specific recommendations, please consult the resources below:
- ARM:
[Speculative Processor Vulnerability](https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability)

##### [ARM only] Physical counter directly passed through to the guest
##### [ARM only] VM Physical counter behaviour

On ARM, the physical counter (i.e `CNTPCT`) it is returning the
[actual EL1 physical counter value of the host][1]. From the discussions before
merging this change [upstream][2], this seems like a conscious design decision
of the ARM code contributors, giving precedence to performance over the ability
to trap and control this in the hypervisor.
On ARM, Firecracker tries to reset the `CNTPCT` physical counter on VM boot.
This is done in order to prevent the VM from reading the host physical counter
value. Firecracker will only try to reset the counter if the host KVM supports
the `KVM_CAP_COUNTER_OFFSET` capability. This capability is only present in
kernels containing
[this](https://lore.kernel.org/all/[email protected]/)
patch series (kernel 6.4 and newer). On older kernels the counter value
will be passed through from the host.

##### Verification

Expand Down Expand Up @@ -428,6 +431,3 @@ To validate that the change took effect, the file
[^1]: Look for `GRUB_CMDLINE_LINUX` in file `/etc/default/grub` in RPM-based
systems, and
[this doc for Ubuntu](https://wiki.ubuntu.com/Kernel/KernelBootParameters).

[1]: https://elixir.free-electrons.com/linux/v4.14.203/source/virt/kvm/arm/hyp/timer-sr.c#L63
[2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2017-January/023323.html
6 changes: 6 additions & 0 deletions src/vmm/src/arch/aarch64/regs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ arm64_sys_reg!(SYS_CNTV_CVAL_EL0, 3, 3, 14, 3, 2);
// https://elixir.bootlin.com/linux/v6.8/source/arch/arm64/include/asm/sysreg.h#L459
arm64_sys_reg!(SYS_CNTPCT_EL0, 3, 3, 14, 0, 1);

// Physical Timer EL0 count Register
// The id of this register is the same as SYS_CNTPCT_EL0 (identical encoding
// arguments), but KVM defines it separately, so we do as well.
// https://elixir.bootlin.com/linux/v6.12.6/source/arch/arm64/include/uapi/asm/kvm.h#L259
arm64_sys_reg!(KVM_REG_ARM_PTIMER_CNT, 3, 3, 14, 0, 1);

// Translation Table Base Register
// https://developer.arm.com/documentation/ddi0595/2021-03/AArch64-Registers/TTBR1-EL1--Translation-Table-Base-Register-1--EL1-
arm64_sys_reg!(TTBR1_EL1, 3, 0, 2, 0, 1);
Expand Down
42 changes: 40 additions & 2 deletions src/vmm/src/arch/aarch64/vcpu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use kvm_ioctls::VcpuFd;

use super::get_fdt_addr;
use super::regs::*;
use crate::vstate::kvm::OptionalCapabilities;
use crate::vstate::memory::GuestMemoryMmap;

/// Errors thrown while setting aarch64 registers.
Expand Down Expand Up @@ -78,6 +79,7 @@ pub fn setup_boot_regs(
cpu_id: u8,
boot_ip: u64,
mem: &GuestMemoryMmap,
optional_capabilities: &OptionalCapabilities,
) -> Result<(), VcpuError> {
let kreg_off = offset_of!(kvm_regs, regs);

Expand Down Expand Up @@ -106,6 +108,23 @@ pub fn setup_boot_regs(
vcpufd
.set_one_reg(id, &get_fdt_addr(mem).to_le_bytes())
.map_err(|err| VcpuError::SetOneReg(id, err))?;

// Reset the physical counter for the guest. This way we avoid the guest reading
// the host physical counter.
// Resetting KVM_REG_ARM_PTIMER_CNT for a single vcpu is enough because there is only
// one timer struct with offsets per VM.
// Because access to KVM_REG_ARM_PTIMER_CNT is only present starting with the 6.4 kernel,
// we only do the reset if KVM_CAP_COUNTER_OFFSET is present, as it was added
// in the same patch series as the ability to set the KVM_REG_ARM_PTIMER_CNT register.
// Patch series which introduced the needed changes:
// https://lore.kernel.org/all/[email protected]/
// Note: the value observed by the guest will still be above 0, because there is a delta
// time between this reset and the first call to KVM_RUN.
if optional_capabilities.counter_offset {
    vcpufd
        .set_one_reg(KVM_REG_ARM_PTIMER_CNT, &[0; 8])
        // Report the register that actually failed; `id` still refers to the
        // register written just before this block.
        .map_err(|err| VcpuError::SetOneReg(KVM_REG_ARM_PTIMER_CNT, err))?;
}
}
Ok(())
}
Expand Down Expand Up @@ -226,8 +245,9 @@ mod tests {
let vm = kvm.fd.create_vm().unwrap();
let vcpu = vm.create_vcpu(0).unwrap();
let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000);
let optional_capabilities = kvm.optional_capabilities();

let res = setup_boot_regs(&vcpu, 0, 0x0, &mem);
let res = setup_boot_regs(&vcpu, 0, 0x0, &mem, &optional_capabilities);
assert!(matches!(
res.unwrap_err(),
VcpuError::SetOneReg(0x6030000000100042, _)
Expand All @@ -237,7 +257,25 @@ mod tests {
vm.get_preferred_target(&mut kvi).unwrap();
vcpu.vcpu_init(&kvi).unwrap();

setup_boot_regs(&vcpu, 0, 0x0, &mem).unwrap();
setup_boot_regs(&vcpu, 0, 0x0, &mem, &optional_capabilities).unwrap();

// Check that the register is reset on compatible kernels.
// Because some time passes between resetting the register and reading it
// back, we cannot compare with 0. Instead we compare it with a meaningfully
// small value.
if optional_capabilities.counter_offset {
let mut reg_bytes = [0_u8; 8];
vcpu.get_one_reg(SYS_CNTPCT_EL0, &mut reg_bytes).unwrap();
let counter_value = u64::from_le_bytes(reg_bytes);

// We are reading SYS_CNTPCT_EL0 right after resetting it.
// If the reset happened successfully, the value should be quite small when we read it.
// If the reset did not happen, the value will be the same as on the host and it surely
// will be more than `max_value`.
let max_value = 1000;

assert!(counter_value < max_value);
}
}

#[test]
Expand Down
29 changes: 21 additions & 8 deletions src/vmm/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -814,16 +814,16 @@ pub fn configure_system_for_boot(
cpu_config,
};

// Configure vCPUs with normalizing and setting the generated CPU configuration.
for vcpu in vcpus.iter_mut() {
vcpu.kvm_vcpu
.configure(vmm.guest_memory(), entry_addr, &vcpu_config)
.map_err(VmmError::VcpuConfigure)
.map_err(Internal)?;
}

#[cfg(target_arch = "x86_64")]
{
// Configure vCPUs with normalizing and setting the generated CPU configuration.
for vcpu in vcpus.iter_mut() {
vcpu.kvm_vcpu
.configure(vmm.guest_memory(), entry_addr, &vcpu_config)
.map_err(VmmError::VcpuConfigure)
.map_err(Internal)?;
}

// Write the kernel command line to guest memory. This is x86_64 specific, since on
// aarch64 the command line will be specified through the FDT.
let cmdline_size = boot_cmdline
Expand Down Expand Up @@ -858,6 +858,19 @@ pub fn configure_system_for_boot(
}
#[cfg(target_arch = "aarch64")]
{
let optional_capabilities = vmm.kvm.optional_capabilities();
// Configure vCPUs with normalizing and setting the generated CPU configuration.
for vcpu in vcpus.iter_mut() {
vcpu.kvm_vcpu
.configure(
vmm.guest_memory(),
entry_addr,
&vcpu_config,
&optional_capabilities,
)
.map_err(VmmError::VcpuConfigure)
.map_err(Internal)?;
}
let vcpu_mpidr = vcpus
.iter_mut()
.map(|cpu| cpu.kvm_vcpu.get_mpidr())
Expand Down
18 changes: 17 additions & 1 deletion src/vmm/src/vstate/kvm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,13 @@ impl Kvm {
}
}
}

/// Optional KVM capabilities and whether each one is available.
#[cfg(target_arch = "aarch64")]
#[derive(Debug, Default)]
pub struct OptionalCapabilities {
    /// Whether `KVM_CAP_COUNTER_OFFSET` is present.
    pub counter_offset: bool,
}
#[cfg(target_arch = "aarch64")]
impl Kvm {
const DEFAULT_CAPABILITIES: [u32; 7] = [
Expand All @@ -152,6 +158,16 @@ impl Kvm {
kvm_bindings::KVM_CAP_MP_STATE,
kvm_bindings::KVM_CAP_ONE_REG,
];

/// Queries KVM for each optional capability and reports the results.
///
/// A capability is treated as present when the raw extension check
/// returns any non-zero value.
pub fn optional_capabilities(&self) -> OptionalCapabilities {
    let raw_status = self
        .fd
        .check_extension_raw(kvm_bindings::KVM_CAP_COUNTER_OFFSET.into());
    let counter_offset = raw_status != 0;

    OptionalCapabilities { counter_offset }
}
}

#[cfg(target_arch = "x86_64")]
Expand Down
9 changes: 7 additions & 2 deletions src/vmm/src/vstate/vcpu/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::cpu_config::aarch64::custom_cpu_template::VcpuFeatures;
use crate::cpu_config::templates::CpuConfiguration;
use crate::logger::{error, IncMetric, METRICS};
use crate::vcpu::{VcpuConfig, VcpuError};
use crate::vstate::kvm::Kvm;
use crate::vstate::kvm::{Kvm, OptionalCapabilities};
use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap};
use crate::vstate::vcpu::VcpuEmulation;
use crate::vstate::vm::Vm;
Expand Down Expand Up @@ -116,6 +116,7 @@ impl KvmVcpu {
guest_mem: &GuestMemoryMmap,
kernel_load_addr: GuestAddress,
vcpu_config: &VcpuConfig,
optional_capabilities: &OptionalCapabilities,
) -> Result<(), KvmVcpuError> {
for reg in vcpu_config.cpu_config.regs.iter() {
self.fd
Expand All @@ -128,6 +129,7 @@ impl KvmVcpu {
self.index,
kernel_load_addr.raw_value(),
guest_mem,
optional_capabilities,
)
.map_err(KvmVcpuError::ConfigureRegisters)?;

Expand Down Expand Up @@ -338,7 +340,8 @@ mod tests {

#[test]
fn test_configure_vcpu() {
let (_, _, mut vcpu, vm_mem) = setup_vcpu(0x10000);
let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000);
let optional_capabilities = kvm.optional_capabilities();

let vcpu_config = VcpuConfig {
vcpu_count: 1,
Expand All @@ -349,6 +352,7 @@ mod tests {
&vm_mem,
GuestAddress(crate::arch::get_kernel_start()),
&vcpu_config,
&optional_capabilities,
)
.unwrap();

Expand All @@ -358,6 +362,7 @@ mod tests {
&vm_mem,
GuestAddress(crate::arch::get_kernel_start()),
&vcpu_config,
&optional_capabilities,
);
assert_eq!(
err.unwrap_err(),
Expand Down
1 change: 1 addition & 0 deletions src/vmm/src/vstate/vcpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1008,6 +1008,7 @@ pub(crate) mod tests {
smt: false,
cpu_config: crate::cpu_config::aarch64::CpuConfiguration::default(),
},
&kvm.optional_capabilities(),
)
.expect("failed to configure vcpu");

Expand Down
57 changes: 57 additions & 0 deletions tests/integration_tests/functional/test_snapshot_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
import filecmp
import logging
import os
import platform
import re
import shutil
import time
from pathlib import Path

import pytest

import host_tools.cargo_build as host
import host_tools.drive as drive_tools
from framework import utils
from framework.microvm import SnapshotType
from framework.properties import global_props
from framework.utils import check_filesystem, check_output
from framework.utils_vsock import (
ECHO_SERVER_PORT,
Expand Down Expand Up @@ -540,3 +544,56 @@ def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type)

# Update the base for next iteration
base_snapshot = snapshot


# TODO add `global_props.host_os == "amzn2"` condition
# once amazon linux kernels have patches.
@pytest.mark.skipif(
    platform.machine() != "aarch64" or global_props.host_linux_version_tpl < (6, 4),
    reason="This is aarch64 specific test and should only be run on 6.4 and later kernels",
)
def test_physical_couter_reset_aarch64(uvm_nano):
    """
    Test that the CNTPCT_EL0 register is reset on VM boot.

    We assume the smallest VM will not consume more than some `max_value`
    cycles to be created and snapshotted. The `max_value` is selected by doing
    a manual run of this test and seeing what the actual counter value is.
    The assumption here is that if the reset does not occur, the guest counter
    value will be huge, as it will be a copy of the host value. The host value
    in its turn will be huge because it will include host OS boot + CI prep +
    other CI tests ...
    """
    vm = uvm_nano
    vm.add_net_iface()
    vm.start()

    snapshot = vm.snapshot_full()
    vm.kill()
    snap_editor = host.get_binary("snapshot-editor")

    # Register id as it is matched against the snapshot-editor output.
    # hex() yields lowercase digits, like the sample output below.
    cntpct_el0 = hex(0x603000000013DF01)
    # If a CPU runs at 3GHz, it will have a counter value of 1_000_000_000
    # in 1/3 of a second. The host surely will run for more than 1/3 second before
    # executing this test.
    max_value = 800_000_000

    cmd = [
        str(snap_editor),
        "info-vmstate",
        "vcpu-states",
        "--vmstate-path",
        str(snapshot.vmstate),
    ]
    _, stdout, _ = utils.check_output(cmd)

    # The output will look like this:
    # kvm_mp_state: 0x0
    # mpidr: 0x80000000
    # 0x6030000000100000 0x0000000e0
    # 0x6030000000100002 0xffff00fe33c0
    found = False
    for line in stdout.splitlines():
        parts = line.split()
        if len(parts) == 2:
            reg_id, reg_value = parts
            if reg_id == cntpct_el0:
                found = True
                assert int(reg_value, 16) < max_value

    # Without this check the test would pass vacuously if the register id
    # never showed up in the output (e.g. after an output format change).
    assert found, f"Did not find CNTPCT_EL0 register ({cntpct_el0}) in snapshot"

0 comments on commit c72c6a2

Please sign in to comment.