Commit 0e6886c

configurable bounce buffer for dma

1 parent 87b6429, commit 0e6886c

18 files changed: +327 -93 lines changed

Cargo.lock (+5)

@@ -1065,6 +1065,7 @@ dependencies = [
  "async-trait",
  "futures",
  "guestmem",
+ "hvdef",
  "inspect",
  "scsi_buffers",
  "stackfuture",
@@ -2351,6 +2352,7 @@ name = "guestmem"
 version = "0.0.0"
 dependencies = [
  "inspect",
+ "memory_range",
  "pal_event",
  "sparse_mmap",
  "thiserror",
@@ -4191,6 +4193,7 @@ dependencies = [
  "futures",
  "guestmem",
  "guid",
+ "hvdef",
  "inspect",
  "inspect_counters",
  "mesh",
@@ -4205,8 +4208,10 @@ dependencies = [
  "task_control",
  "test_with_tracing",
  "thiserror",
+ "tracelimit",
  "tracing",
  "user_driver",
+ "virt_mshv_vtl",
  "vmcore",
  "zerocopy",
 ]

openhcl/underhill_core/src/nvme_manager.rs (+24 -8)

@@ -24,6 +24,7 @@ use thiserror::Error;
 use tracing::Instrument;
 use user_driver::vfio::VfioDevice;
 use user_driver::vfio::VfioDmaBuffer;
+use virt_mshv_vtl::UhPartition;
 use vm_resource::kind::DiskHandleKind;
 use vm_resource::AsyncResolveResource;
 use vm_resource::ResourceId;
@@ -83,6 +84,9 @@ impl NvmeManager {
         driver_source: &VmTaskDriverSource,
         vp_count: u32,
         dma_buffer: Arc<dyn VfioDmaBuffer>,
+        dma_bounce_buffer_pages_per_queue: u64,
+        dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
+        partition: Option<Arc<UhPartition>>,
     ) -> Self {
         let (send, recv) = mesh::channel();
         let driver = driver_source.simple();
@@ -91,6 +95,9 @@ impl NvmeManager {
             devices: HashMap::new(),
             vp_count,
             dma_buffer,
+            dma_bounce_buffer_pages_per_queue,
+            dma_bounce_buffer_pages_per_io_threshold,
+            partition,
         };
         let task = driver.spawn("nvme-manager", async move { worker.run(recv).await });
         Self {
@@ -167,6 +174,9 @@ struct NvmeManagerWorker {
     #[inspect(skip)]
     dma_buffer: Arc<dyn VfioDmaBuffer>,
     vp_count: u32,
+    dma_bounce_buffer_pages_per_queue: u64,
+    dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
+    partition: Option<Arc<UhPartition>>,
 }

 impl NvmeManagerWorker {
@@ -238,14 +248,20 @@ impl NvmeManagerWorker {
             .await
             .map_err(InnerError::Vfio)?;

-        let driver =
-            nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device)
-                .instrument(tracing::info_span!(
-                    "nvme_driver_init",
-                    pci_id = entry.key()
-                ))
-                .await
-                .map_err(InnerError::DeviceInitFailed)?;
+        let driver = nvme_driver::NvmeDriver::new(
+            &self.driver_source,
+            self.vp_count,
+            device,
+            self.dma_bounce_buffer_pages_per_queue,
+            self.dma_bounce_buffer_pages_per_io_threshold,
+            self.partition.clone(),
+        )
+        .instrument(tracing::info_span!(
+            "nvme_driver_init",
+            pci_id = entry.key()
+        ))
+        .await
+        .map_err(InnerError::DeviceInitFailed)?;

         entry.insert(driver)
     }

openhcl/underhill_core/src/worker.rs (+82 -41)

@@ -1246,12 +1246,89 @@ async fn new_underhill_vm(

     let boot_info = runtime_params.parsed_openhcl_boot();

+    // Determine if x2apic is supported so that the topology matches
+    // reality.
+    //
+    // We don't know if x2apic is forced on, but currently it doesn't really
+    // matter because the topology's initial x2apic state is not currently
+    // used in Underhill.
+    //
+    // FUTURE: consider having Underhill decide whether x2apic is enabled at
+    // boot rather than allowing the host to make that decision. This would
+    // just require Underhill setting the apicbase register on the VPs
+    // before start.
+    //
+    // TODO: centralize cpuid querying logic.
+    #[cfg(guest_arch = "x86_64")]
+    let x2apic = if isolation.is_hardware_isolated() {
+        // For hardware CVMs, always enable x2apic support at boot.
+        vm_topology::processor::x86::X2ApicState::Enabled
+    } else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
+        & (1 << 21)
+        != 0
+    {
+        vm_topology::processor::x86::X2ApicState::Supported
+    } else {
+        vm_topology::processor::x86::X2ApicState::Unsupported
+    };
+
+    #[cfg(guest_arch = "x86_64")]
+    let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
+        .context("failed to construct the processor topology")?;
+
+    #[cfg(guest_arch = "aarch64")]
+    let processor_topology = new_aarch64_topology(
+        boot_info
+            .gic
+            .context("did not get gic state from bootloader")?,
+        &boot_info.cpus,
+    )
+    .context("failed to construct the processor topology")?;
+
     // The amount of memory required by the GET igvm_attest request
     let attestation = get_protocol::IGVM_ATTEST_MSG_SHARED_GPA as u64 * hvdef::HV_PAGE_SIZE;

-    // TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
-    let device_dma = 10 * 1024 * 1024;
+    const MIN_PER_QUEUE_PAGES: u64 = (128 * 1024 + hvdef::HV_PAGE_SIZE) / hvdef::HV_PAGE_SIZE;
+    const DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE: u64 = 128;
+    #[allow(clippy::assertions_on_constants)]
+    const _: () = assert!(
+        DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE >= MIN_PER_QUEUE_PAGES,
+        "not enough room for an ATAPI IO plus a PRP list"
+    );
+
+    const DEFAULT_NVME_DRIVERS: u32 = 8;
+    let (max_nvme_drivers, dma_bounce_buffer_pages_per_queue, dma_bounce_buffer_pages_per_io_threshold) = dps.general.vtl2_settings.as_ref().map_or(
+        (DEFAULT_NVME_DRIVERS, DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE, None),
+        |vtl2_settings| {
+            let original_dma_bounce_buffer_pages_per_queue = vtl2_settings
+                .fixed
+                .dma_bounce_buffer_pages_per_queue
+                .unwrap_or(DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE);
+
+            let dma_bounce_buffer_pages_per_queue = if original_dma_bounce_buffer_pages_per_queue < MIN_PER_QUEUE_PAGES {
+                tracing::warn!(
+                    "the value of dma_bounce_buffer_pages_per_queue ({}) is less than MIN_PER_QUEUE_PAGES ({})",
+                    original_dma_bounce_buffer_pages_per_queue, MIN_PER_QUEUE_PAGES
+                );
+                MIN_PER_QUEUE_PAGES
+            } else {
+                original_dma_bounce_buffer_pages_per_queue
+            };
+
+            (
+                vtl2_settings.fixed.max_nvme_drivers.unwrap_or(DEFAULT_NVME_DRIVERS),
+                dma_bounce_buffer_pages_per_queue,
+                vtl2_settings.fixed.dma_bounce_buffer_pages_per_io_threshold,
+            )
+        },
+    );

+    // TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
+    let device_dma = 10 * 1024 * 1024
+        + max_nvme_drivers as u64
+            * processor_topology.vp_count() as u64
+            * dma_bounce_buffer_pages_per_queue
+            * hvdef::HV_PAGE_SIZE;
     // Determine the amount of shared memory to reserve from VTL0.
     let shared_pool_size = match isolation {
         #[cfg(guest_arch = "x86_64")]
@@ -1314,45 +1391,6 @@ async fn new_underhill_vm(
         physical_address_size,
     )?;

-    // Determine if x2apic is supported so that the topology matches
-    // reality.
-    //
-    // We don't know if x2apic is forced on, but currently it doesn't really
-    // matter because the topology's initial x2apic state is not currently
-    // used in Underhill.
-    //
-    // FUTURE: consider having Underhill decide whether x2apic is enabled at
-    // boot rather than allowing the host to make that decision. This would
-    // just require Underhill setting the apicbase register on the VPs
-    // before start.
-    //
-    // TODO: centralize cpuid querying logic.
-    #[cfg(guest_arch = "x86_64")]
-    let x2apic = if isolation.is_hardware_isolated() {
-        // For hardware CVMs, always enable x2apic support at boot.
-        vm_topology::processor::x86::X2ApicState::Enabled
-    } else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
-        & (1 << 21)
-        != 0
-    {
-        vm_topology::processor::x86::X2ApicState::Supported
-    } else {
-        vm_topology::processor::x86::X2ApicState::Unsupported
-    };
-
-    #[cfg(guest_arch = "x86_64")]
-    let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
-        .context("failed to construct the processor topology")?;
-
-    #[cfg(guest_arch = "aarch64")]
-    let processor_topology = new_aarch64_topology(
-        boot_info
-            .gic
-            .context("did not get gic state from bootloader")?,
-        &boot_info.cpus,
-    )
-    .context("failed to construct the processor topology")?;
-
     let mut with_vmbus: bool = false;
     let mut with_vmbus_relay = false;
     if dps.general.vmbus_redirection_enabled {
@@ -1770,6 +1808,9 @@ async fn new_underhill_vm(
         &driver_source,
         processor_topology.vp_count(),
         vfio_dma_buffer(&shared_vis_pages_pool),
+        dma_bounce_buffer_pages_per_queue,
+        dma_bounce_buffer_pages_per_io_threshold,
+        Some(partition.clone()),
     );

     resolver.add_async_resolver::<DiskHandleKind, _, NvmeDiskConfig, _>(NvmeDiskResolver::new(
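
To make the new sizing concrete, here is a small standalone Rust sketch that reproduces the arithmetic above. The 64-VP guest is a hypothetical example; the 8-driver and 128-page defaults and the 4 KiB page size mirror the constants in the diff.

// Standalone sizing sketch; the 64-VP guest is an assumption for illustration.
fn main() {
    const HV_PAGE_SIZE: u64 = 4096;
    // Smallest usable per-queue buffer: one 128 KiB ATAPI IO plus a page for the PRP list.
    const MIN_PER_QUEUE_PAGES: u64 = (128 * 1024 + HV_PAGE_SIZE) / HV_PAGE_SIZE;
    assert_eq!(MIN_PER_QUEUE_PAGES, 33);

    let max_nvme_drivers: u64 = 8; // DEFAULT_NVME_DRIVERS
    let vp_count: u64 = 64; // hypothetical guest size
    let pages_per_queue: u64 = 128; // DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE

    // Mirrors the device_dma expression in worker.rs: a 10 MiB baseline plus one
    // bounce buffer per (driver, queue) pair, with one queue per VP.
    let device_dma = 10 * 1024 * 1024
        + max_nvme_drivers * vp_count * pages_per_queue * HV_PAGE_SIZE;
    assert_eq!(device_dma, (10 + 256) * 1024 * 1024); // 10 MiB baseline + 256 MiB of bounce buffers
}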

openhcl/virt_mshv_vtl/src/lib.rs (+21)

@@ -1774,6 +1774,27 @@ impl UhPartition {
             );
         }
     }
+
+    /// Pins the specified guest physical address ranges in the hypervisor.
+    /// The memory ranges passed to this function must be VA backed memory.
+    /// If a partial failure occurs (i.e., some but not all the ranges were successfully pinned),
+    /// the function will automatically attempt to unpin any successfully pinned ranges.
+    /// This "rollback" behavior ensures that no partially pinned state remains, which
+    /// could otherwise lead to inconsistencies.
+    ///
+    pub fn pin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
+        self.inner.hcl.pin_gpa_ranges(ranges)
+    }
+
+    /// Unpins the specified guest physical address ranges in the hypervisor.
+    /// The memory ranges passed to this function must be VA backed memory.
+    /// If a partial failure occurs (i.e., some but not all the ranges were successfully unpinned),
+    /// the function will automatically attempt to pin any successfully unpinned ranges. This "rollback"
+    /// behavior ensures that no partially unpinned state remains, which could otherwise lead to inconsistencies.
+    ///
+    pub fn unpin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
+        self.inner.hcl.unpin_gpa_ranges(ranges)
+    }
 }

 #[cfg(guest_arch = "x86_64")]
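
A minimal caller-side sketch of how the new pair might be used around a DMA transfer, relying on the all-or-nothing pin semantics documented above. The helper name and the `do_dma` closure are hypothetical; only `UhPartition::pin_gpa_ranges`/`unpin_gpa_ranges`, `MemoryRange`, and `HvError` come from this commit.

// Hypothetical caller-side sketch; not part of this commit.
use hvdef::HvError;
use memory_range::MemoryRange;
use virt_mshv_vtl::UhPartition;

fn dma_with_pinned_ranges(
    partition: &UhPartition,
    ranges: &[MemoryRange],
    do_dma: impl FnOnce() -> Result<(), HvError>,
) -> Result<(), HvError> {
    // Either every range gets pinned, or the call rolls back and returns Err.
    partition.pin_gpa_ranges(ranges)?;
    let result = do_dma();
    // Same all-or-nothing contract in the other direction.
    partition.unpin_gpa_ranges(ranges)?;
    result
}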

vm/devices/get/underhill_config/src/lib.rs (+7 -1)

@@ -178,8 +178,14 @@ pub struct Vtl2SettingsFixed {
     pub scsi_sub_channels: u16,
     /// size of the io-uring submission queues
     pub io_ring_size: u32,
-    /// Max bounce buffer pages active per cpu
+    /// Max bounce buffer pages active per cpu for unaligned IOs
     pub max_bounce_buffer_pages: Option<u32>,
+    // DMA bounce buffer pages per queue
+    pub dma_bounce_buffer_pages_per_queue: Option<u64>,
+    // Threshold of io size in pages to use bounce buffer
+    pub dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
+    // Max nvme drivers
+    pub max_nvme_drivers: Option<u32>,
 }

 #[derive(Debug, Clone, MeshPayload, Inspect)]

vm/devices/get/underhill_config/src/schema/v1.rs (+3)

@@ -528,6 +528,9 @@ impl ParseSchema<crate::Vtl2SettingsFixed> for Vtl2SettingsFixed {
             scsi_sub_channels: self.scsi_sub_channels.map_or(0, |x| x as u16),
             io_ring_size: self.io_ring_size.unwrap_or(256),
             max_bounce_buffer_pages: self.max_bounce_buffer_pages,
+            dma_bounce_buffer_pages_per_queue: self.dma_bounce_buffer_pages_per_queue,
+            dma_bounce_buffer_pages_per_io_threshold: self.dma_bounce_buffer_pages_per_io_threshold,
+            max_nvme_drivers: self.max_nvme_drivers,
         })
     }
 }

vm/devices/get/vtl2_settings_proto/src/vtl2_settings.namespaces.proto (+3)

@@ -27,6 +27,9 @@ message Vtl2SettingsFixed {
     optional uint32 io_ring_size = 2;
     // Specify the maximum number of bounce buffer pages allowed per cpu
    optional uint32 max_bounce_buffer_pages = 3;
+    optional uint64 dma_bounce_buffer_pages_per_queue = 4;
+    optional uint32 dma_bounce_buffer_pages_per_io_threshold = 5;
+    optional uint32 max_nvme_drivers = 6;
 }

 message Vtl2SettingsDynamic {

vm/devices/storage/disk_backend/Cargo.toml (+1)

@@ -8,6 +8,7 @@ rust-version.workspace = true

 [dependencies]
 scsi_buffers.workspace = true
+hvdef.workspace = true

 guestmem.workspace = true
 vm_resource.workspace = true

vm/devices/storage/disk_backend/src/lib.rs (+3)

@@ -19,6 +19,7 @@ pub mod sync_wrapper;
 pub mod zerodisk;

 use guestmem::AccessError;
+use hvdef::HvError;
 use inspect::Inspect;
 use scsi_buffers::RequestBuffers;
 use stackfuture::StackFuture;
@@ -50,6 +51,8 @@ pub enum DiskError {
     ReservationConflict,
     #[error("unsupported eject")]
     UnsupportedEject,
+    #[error("failed to pin/unpin guest memory {0}")]
+    Hv(HvError),
 }

 /// Io error details
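
For illustration, a hedged sketch of how a backend that pins guest memory before issuing DMA might surface a pin failure through the new DiskError::Hv variant; `pin_for_io` and the `pin_ranges` closure are hypothetical stand-ins, not part of this commit.

// Hypothetical mapping sketch; not part of this commit.
use disk_backend::DiskError;
use hvdef::HvError;
use memory_range::MemoryRange;

fn pin_for_io(
    pin_ranges: impl Fn(&[MemoryRange]) -> Result<(), HvError>,
    ranges: &[MemoryRange],
) -> Result<(), DiskError> {
    // A pin/unpin failure becomes a DiskError like any other backend I/O error.
    pin_ranges(ranges).map_err(DiskError::Hv)
}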

vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml (+3)

@@ -16,6 +16,8 @@ task_control.workspace = true
 user_driver.workspace = true
 guestmem.workspace = true
 vmcore.workspace = true
+virt_mshv_vtl.workspace = true
+hvdef.workspace = true

 anyhow.workspace = true
 event-listener.workspace = true
@@ -25,6 +27,7 @@ safeatomic.workspace = true
 slab.workspace = true
 thiserror.workspace = true
 tracing.workspace = true
+tracelimit.workspace = true
 zerocopy.workspace = true

 [dev-dependencies]
