Skip to content

Commit a432354

Browse files
committed
Support shared mmap for running VMs
This allows us to run VMs while streaming memory changes to disk. Squashed changes: support mmap shared; update log; add configuration for memory backing file; progress; tweaks.
1 parent efebfca commit a432354

File tree

15 files changed

+303
-48
lines changed

15 files changed

+303
-48
lines changed

resources/seccomp/aarch64-unknown-linux-musl.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,19 @@
596596
}
597597
]
598598
},
599+
{
600+
"syscall": "msync",
601+
"comment": "Used to sync memory from mmap to disk",
602+
"args": [
603+
{
604+
"index": 2,
605+
"type": "dword",
606+
"op": "eq",
607+
"val": 4,
608+
"comment": "MS_SYNC"
609+
}
610+
]
611+
},
599612
{
600613
"syscall": "rt_sigaction",
601614
"comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT",

resources/seccomp/x86_64-unknown-linux-musl.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,19 @@
248248
}
249249
]
250250
},
251+
{
252+
"syscall": "msync",
253+
"comment": "Used to sync memory from mmap to disk",
254+
"args": [
255+
{
256+
"index": 2,
257+
"type": "dword",
258+
"op": "eq",
259+
"val": 4,
260+
"comment": "MS_SYNC"
261+
}
262+
]
263+
},
251264
{
252265
"syscall": "rt_sigaction",
253266
"comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT",

src/api_server/src/parsed_request.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use crate::request::logger::parse_put_logger;
1717
use crate::request::machine_configuration::{
1818
parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config,
1919
};
20+
use crate::request::memory_backing_file::parse_put_memory_backing_file;
2021
use crate::request::metrics::parse_put_metrics;
2122
use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds};
2223
use crate::request::net::{parse_patch_net, parse_put_net};
@@ -112,6 +113,7 @@ impl ParsedRequest {
112113
(Method::Put, "network-interfaces", Some(body)) => {
113114
parse_put_net(body, path_tokens.get(1))
114115
}
116+
(Method::Put, "memory-backing-file", Some(body)) => parse_put_memory_backing_file(body),
115117
(Method::Put, "shutdown-internal", None) => {
116118
Ok(ParsedRequest::new(RequestAction::ShutdownInternal))
117119
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
use super::super::VmmAction;
5+
use crate::parsed_request::{Error, ParsedRequest};
6+
use crate::request::Body;
7+
use logger::{IncMetric, METRICS};
8+
use vmm::vmm_config::memory_backing_file::MemoryBackingFileConfig;
9+
10+
pub(crate) fn parse_put_memory_backing_file(body: &Body) -> Result<ParsedRequest, Error> {
11+
METRICS.put_api_requests.memory_backing_file_cfg_count.inc();
12+
Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile(
13+
serde_json::from_slice::<MemoryBackingFileConfig>(body.raw()).map_err(|e| {
14+
METRICS.put_api_requests.memory_backing_file_cfg_fails.inc();
15+
Error::SerdeJson(e)
16+
})?,
17+
)))
18+
}
19+
20+
#[cfg(test)]
21+
mod tests {
22+
use std::path::PathBuf;
23+
24+
use super::*;
25+
26+
#[test]
27+
fn test_parse_memory_backing_file() {
28+
assert!(parse_put_memory_backing_file(&Body::new("invalid_payload")).is_err());
29+
30+
let body = r#"{
31+
"path": "./memory.snap"
32+
}"#;
33+
let same_body = MemoryBackingFileConfig {
34+
path: PathBuf::from("./memory.snap"),
35+
};
36+
let result = parse_put_memory_backing_file(&Body::new(body));
37+
assert!(result.is_ok());
38+
let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test."));
39+
40+
assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile(same_body)));
41+
}
42+
}

src/api_server/src/request/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ pub mod drive;
88
pub mod instance_info;
99
pub mod logger;
1010
pub mod machine_configuration;
11+
pub mod memory_backing_file;
1112
pub mod metrics;
1213
pub mod mmds;
1314
pub mod net;

src/api_server/swagger/firecracker.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,29 @@ paths:
350350
description: Internal server error
351351
schema:
352352
$ref: "#/definitions/Error"
353+
354+
/memory-backing-file:
355+
put:
356+
summary: Configures a memory backing file to sync the memory changes to during the runtime of the vm
357+
operationId: putMemoryBackingFile
358+
parameters:
359+
- name: body
360+
in: body
361+
description: Path to memory backing file
362+
required: true
363+
schema:
364+
$ref: "#/definitions/MemoryBackingFile"
365+
responses:
366+
204:
367+
description: Memory backing file configured
368+
400:
369+
description: Memory backing file failed
370+
schema:
371+
$ref: "#/definitions/Error"
372+
default:
373+
description: Internal server error.
374+
schema:
375+
$ref: "#/definitions/Error"
353376

354377
/metrics:
355378
put:
@@ -1047,6 +1070,14 @@ definitions:
10471070
tx_rate_limiter:
10481071
$ref: "#/definitions/RateLimiter"
10491072

1073+
MemoryBackingFile:
1074+
type: object
1075+
required:
1076+
- path
1077+
properties:
1078+
path:
1079+
type: string
1080+
10501081
PartialDrive:
10511082
type: object
10521083
required:

src/logger/src/metrics.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,10 @@ pub struct PutRequestsMetrics {
403403
pub machine_cfg_count: SharedIncMetric,
404404
/// Number of failures in configuring the machine.
405405
pub machine_cfg_fails: SharedIncMetric,
406+
/// Number of PUTs for setting memory backing file.
407+
pub memory_backing_file_cfg_count: SharedIncMetric,
408+
/// Number of failures in setting the memory backing file.
409+
pub memory_backing_file_cfg_fails: SharedIncMetric,
406410
/// Number of PUTs for initializing the metrics system.
407411
pub metrics_count: SharedIncMetric,
408412
/// Number of failures in initializing the metrics system.

src/vm-memory/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ pub fn create_guest_memory(
117117
for region in regions {
118118
let flags = match region.0 {
119119
None => libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
120-
Some(_) => libc::MAP_NORESERVE | libc::MAP_PRIVATE,
120+
Some(_) => libc::MAP_NORESERVE | libc::MAP_SHARED,
121121
};
122122

123123
let mmap_region =

src/vmm/src/builder.rs

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
66
use std::convert::TryFrom;
77
use std::fmt::{Display, Formatter};
8+
use std::fs::File;
89
use std::io::{self, Read, Seek, SeekFrom};
910
use std::os::unix::io::{AsRawFd, RawFd};
1011
use std::sync::{Arc, Mutex};
12+
use vm_memory::FileOffset;
1113

1214
use arch::InitrdConfig;
1315
#[cfg(target_arch = "x86_64")]
@@ -28,7 +30,6 @@ use linux_loader::loader::KernelLoader;
2830
use logger::{error, warn, METRICS};
2931
use seccompiler::BpfThreadMap;
3032
use snapshot::Persist;
31-
use userfaultfd::Uffd;
3233
use utils::eventfd::EventFd;
3334
use utils::terminal::Terminal;
3435
use utils::time::TimestampUs;
@@ -43,7 +44,7 @@ use crate::construct_kvm_mpidrs;
4344
use crate::device_manager::legacy::PortIODeviceManager;
4445
use crate::device_manager::mmio::MMIODeviceManager;
4546
use crate::device_manager::persist::MMIODevManagerConstructorArgs;
46-
use crate::persist::{MicrovmState, MicrovmStateError};
47+
use crate::persist::{MemoryDescriptor, MicrovmState, MicrovmStateError};
4748
use crate::resources::VmResources;
4849
use crate::vmm_config::boot_source::BootConfig;
4950
use crate::vmm_config::instance_info::InstanceInfo;
@@ -58,6 +59,8 @@ use crate::{device_manager, mem_size_mib, Error, EventManager, Vmm, VmmEventsObs
5859
pub enum StartMicrovmError {
5960
/// Unable to attach block device to Vmm.
6061
AttachBlockDevice(io::Error),
62+
/// Unable to create the memory backing file.
63+
BackingMemoryFile(io::Error),
6164
/// This error is thrown by the minimal boot loader implementation.
6265
ConfigureSystem(arch::Error),
6366
/// Internal errors are due to resource exhaustion.
@@ -112,6 +115,9 @@ impl Display for StartMicrovmError {
112115
write!(f, "Unable to attach block device to Vmm: {}", err)
113116
}
114117
ConfigureSystem(err) => write!(f, "System configuration error: {:?}", err),
118+
BackingMemoryFile(err) => {
119+
write!(f, "Unable to create the memory backing file: {}", err)
120+
}
115121
CreateRateLimiter(err) => write!(f, "Cannot create RateLimiter: {}", err),
116122
CreateNetDevice(err) => {
117123
let mut err_msg = format!("{:?}", err);
@@ -231,7 +237,7 @@ fn create_vmm_and_vcpus(
231237
instance_info: &InstanceInfo,
232238
event_manager: &mut EventManager,
233239
guest_memory: GuestMemoryMmap,
234-
uffd: Option<Uffd>,
240+
memory_descriptor: Option<MemoryDescriptor>,
235241
track_dirty_pages: bool,
236242
vcpu_count: u8,
237243
) -> std::result::Result<(Vmm, Vec<Vcpu>), StartMicrovmError> {
@@ -297,7 +303,7 @@ fn create_vmm_and_vcpus(
297303
shutdown_exit_code: None,
298304
vm,
299305
guest_memory,
300-
uffd,
306+
memory_descriptor,
301307
vcpus_handles: Vec::new(),
302308
vcpus_exit_evt,
303309
mmio_device_manager,
@@ -329,8 +335,23 @@ pub fn build_microvm_for_boot(
329335
let boot_config = vm_resources.boot_source().ok_or(MissingKernelConfig)?;
330336

331337
let track_dirty_pages = vm_resources.track_dirty_pages();
332-
let guest_memory =
333-
create_guest_memory(vm_resources.vm_config().mem_size_mib, track_dirty_pages)?;
338+
339+
let backing_memory_file = if let Some(ref file) = vm_resources.backing_memory_file {
340+
file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64)
341+
.map_err(|e| {
342+
error!("Failed to set backing memory file size: {}", e);
343+
StartMicrovmError::BackingMemoryFile(e)
344+
})?;
345+
346+
Some(file.clone())
347+
} else {
348+
None
349+
};
350+
let guest_memory = create_guest_memory(
351+
vm_resources.vm_config().mem_size_mib,
352+
backing_memory_file.clone(),
353+
track_dirty_pages,
354+
)?;
334355
let vcpu_config = vm_resources.vcpu_config();
335356
let entry_addr = load_kernel(boot_config, &guest_memory)?;
336357
let initrd = load_initrd_from_config(boot_config, &guest_memory)?;
@@ -362,7 +383,7 @@ pub fn build_microvm_for_boot(
362383
instance_info,
363384
event_manager,
364385
guest_memory,
365-
None,
386+
backing_memory_file.map(MemoryDescriptor::File),
366387
track_dirty_pages,
367388
vcpu_config.vcpu_count,
368389
)?;
@@ -451,7 +472,7 @@ pub fn build_microvm_from_snapshot(
451472
event_manager: &mut EventManager,
452473
microvm_state: MicrovmState,
453474
guest_memory: GuestMemoryMmap,
454-
uffd: Option<Uffd>,
475+
memory_descriptor: Option<MemoryDescriptor>,
455476
track_dirty_pages: bool,
456477
seccomp_filters: &BpfThreadMap,
457478
vm_resources: &mut VmResources,
@@ -466,7 +487,7 @@ pub fn build_microvm_from_snapshot(
466487
instance_info,
467488
event_manager,
468489
guest_memory.clone(),
469-
uffd,
490+
memory_descriptor,
470491
track_dirty_pages,
471492
vcpu_count,
472493
)?;
@@ -581,15 +602,24 @@ pub fn build_microvm_from_snapshot(
581602
/// Creates GuestMemory of `mem_size_mib` MiB in size.
582603
pub fn create_guest_memory(
583604
mem_size_mib: usize,
605+
backing_memory_file: Option<Arc<File>>,
584606
track_dirty_pages: bool,
585607
) -> std::result::Result<GuestMemoryMmap, StartMicrovmError> {
586608
let mem_size = mem_size_mib << 20;
587609
let arch_mem_regions = arch::arch_memory_regions(mem_size);
588610

611+
let mut offset = 0_u64;
589612
vm_memory::create_guest_memory(
590613
&arch_mem_regions
591614
.iter()
592-
.map(|(addr, size)| (None, *addr, *size))
615+
.map(|(addr, size)| {
616+
let file_offset = backing_memory_file
617+
.clone()
618+
.map(|file| FileOffset::from_arc(file, offset));
619+
offset += *size as u64;
620+
621+
(file_offset, *addr, *size)
622+
})
593623
.collect::<Vec<_>>()[..],
594624
track_dirty_pages,
595625
)
@@ -1068,7 +1098,7 @@ pub mod tests {
10681098
}
10691099

10701100
pub(crate) fn default_vmm() -> Vmm {
1071-
let guest_memory = create_guest_memory(128, false).unwrap();
1101+
let guest_memory = create_guest_memory(128, None, false).unwrap();
10721102

10731103
let vcpus_exit_evt = EventFd::new(libc::EFD_NONBLOCK)
10741104
.map_err(Error::EventFd)
@@ -1096,12 +1126,12 @@ pub mod tests {
10961126
shutdown_exit_code: None,
10971127
vm,
10981128
guest_memory,
1099-
uffd: None,
11001129
vcpus_handles: Vec::new(),
11011130
vcpus_exit_evt,
11021131
mmio_device_manager,
11031132
#[cfg(target_arch = "x86_64")]
11041133
pio_device_manager,
1134+
memory_descriptor: None,
11051135
}
11061136
}
11071137

@@ -1283,21 +1313,21 @@ pub mod tests {
12831313

12841314
// Case 1: create guest memory without dirty page tracking
12851315
{
1286-
let guest_memory = create_guest_memory(mem_size, false).unwrap();
1316+
let guest_memory = create_guest_memory(mem_size, None, false).unwrap();
12871317
assert!(!is_dirty_tracking_enabled(&guest_memory));
12881318
}
12891319

12901320
// Case 2: create guest memory with dirty page tracking
12911321
{
1292-
let guest_memory = create_guest_memory(mem_size, true).unwrap();
1322+
let guest_memory = create_guest_memory(mem_size, None, true).unwrap();
12931323
assert!(is_dirty_tracking_enabled(&guest_memory));
12941324
}
12951325
}
12961326

12971327
#[test]
12981328
fn test_create_vcpus() {
12991329
let vcpu_count = 2;
1300-
let guest_memory = create_guest_memory(128, false).unwrap();
1330+
let guest_memory = create_guest_memory(128, None, false).unwrap();
13011331

13021332
#[allow(unused_mut)]
13031333
let mut vm = setup_kvm_vm(&guest_memory, false).unwrap();

src/vmm/src/lib.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,10 @@ use devices::virtio::{
5050
use devices::BusDevice;
5151
use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber};
5252
use logger::{error, info, warn, LoggerError, MetricsError, METRICS};
53+
use persist::MemoryDescriptor;
5354
use rate_limiter::BucketUpdate;
5455
use seccompiler::BpfProgram;
5556
use snapshot::Persist;
56-
use userfaultfd::Uffd;
5757
use utils::epoll::EventSet;
5858
use utils::eventfd::EventFd;
5959
use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
@@ -260,10 +260,6 @@ pub struct Vmm {
260260
// Guest VM core resources.
261261
vm: Vm,
262262
guest_memory: GuestMemoryMmap,
263-
// Save UFFD in order to keep it open in the Firecracker process, as well.
264-
// Since this field is never read again, we need to allow `dead_code`.
265-
#[allow(dead_code)]
266-
uffd: Option<Uffd>,
267263
vcpus_handles: Vec<VcpuHandle>,
268264
// Used by Vcpus and devices to initiate teardown; Vmm should never write here.
269265
vcpus_exit_evt: EventFd,
@@ -272,6 +268,11 @@ pub struct Vmm {
272268
mmio_device_manager: MMIODeviceManager,
273269
#[cfg(target_arch = "x86_64")]
274270
pio_device_manager: PortIODeviceManager,
271+
272+
// The descriptor of the guest memory source (e.g. the mem file that is mmapped, or the UFFD).
273+
// We only need to keep a reference to it in the process, so we allow `dead_code`.
274+
#[allow(dead_code)]
275+
memory_descriptor: Option<MemoryDescriptor>,
275276
}
276277

277278
impl Vmm {

0 commit comments

Comments
 (0)