Skip to content

Commit 88630c0

Browse files
authored
Make migration status available after migration out (#330)
Use a similar technique to what we use for instance states: when a server drops its VM controller, clone the receiver side of the migration state channel and use the clone to answer subsequent migration state queries. Tested with ad hoc live migrations & new PHD tests.
1 parent b6f2dc6 commit 88630c0

File tree

7 files changed

+83
-21
lines changed

7 files changed

+83
-21
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bin/propolis-server/src/lib/server.rs

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use std::convert::TryFrom;
99
use std::sync::Arc;
1010
use std::{collections::BTreeMap, net::SocketAddr};
1111

12+
use crate::migrate::MigrateError;
1213
use crate::serial::history_buffer::SerialHistoryOffset;
1314
use dropshot::{
1415
channel, endpoint, ApiDescription, HttpError, HttpResponseCreated,
@@ -28,6 +29,7 @@ use thiserror::Error;
2829
use tokio::sync::{mpsc, oneshot, MappedMutexGuard, Mutex, MutexGuard};
2930
use tokio_tungstenite::tungstenite::protocol::{Role, WebSocketConfig};
3031
use tokio_tungstenite::WebSocketStream;
32+
use uuid::Uuid;
3133

3234
use crate::spec::{ServerSpecBuilder, ServerSpecBuilderError};
3335
use crate::vm::VmController;
@@ -102,8 +104,15 @@ pub enum VmControllerState {
102104
/// outgoing controller can publish new state changes even after the
103105
/// server has dropped its reference to it (its state worker may
104106
/// continue running for a time).
105-
watcher:
107+
state_watcher:
106108
tokio::sync::watch::Receiver<api::InstanceStateMonitorResponse>,
109+
110+
/// A clone of the receiver side of the former VM controller's migration
111+
/// status channel, used to serve queries for migration status after an
112+
/// instance is destroyed. (These are common because an instance stops
113+
/// itself after successfully migrating out.)
114+
migrate_state_watcher:
115+
tokio::sync::watch::Receiver<Option<(Uuid, api::MigrationState)>>,
107116
},
108117
}
109118

@@ -137,13 +146,15 @@ impl VmControllerState {
137146
// the final transition to the "destroyed" state happens only when
138147
// all references to the VM have been dropped, including the one
139148
// this routine just exchanged and will return.
140-
let watcher = vm.state_watcher().clone();
149+
let state_watcher = vm.state_watcher().clone();
150+
let migrate_state_watcher = vm.migrate_state_watcher().clone();
141151
if let VmControllerState::Created(vm) = std::mem::replace(
142152
self,
143153
VmControllerState::Destroyed {
144154
last_instance,
145155
last_instance_spec: Box::new(last_instance_spec),
146-
watcher,
156+
state_watcher,
157+
migrate_state_watcher,
147158
},
148159
) {
149160
Some(vm)
@@ -571,9 +582,10 @@ async fn instance_get_common(
571582
VmControllerState::Destroyed {
572583
last_instance,
573584
last_instance_spec,
574-
watcher,
585+
state_watcher,
586+
..
575587
} => {
576-
let watcher = watcher.borrow();
588+
let watcher = state_watcher.borrow();
577589
let mut last_instance = last_instance.clone();
578590
last_instance.state = watcher.state;
579591
Ok((last_instance, *last_instance_spec.clone()))
@@ -627,7 +639,9 @@ async fn instance_state_monitor(
627639
));
628640
}
629641
VmControllerState::Created(vm) => vm.state_watcher().clone(),
630-
VmControllerState::Destroyed { watcher, .. } => watcher.clone(),
642+
VmControllerState::Destroyed { state_watcher, .. } => {
643+
state_watcher.clone()
644+
}
631645
}
632646
};
633647

@@ -793,10 +807,29 @@ async fn instance_migrate_status(
793807
path_params: Path<api::InstanceMigrateStatusRequest>,
794808
) -> Result<HttpResponseOk<api::InstanceMigrateStatusResponse>, HttpError> {
795809
let migration_id = path_params.into_inner().migration_id;
796-
let vm = rqctx.context().vm().await?;
797-
vm.migrate_status(migration_id).map_err(Into::into).map(|state| {
798-
HttpResponseOk(api::InstanceMigrateStatusResponse { state })
799-
})
810+
let ctx = rqctx.context();
811+
match &*ctx.services.vm.lock().await {
812+
VmControllerState::NotCreated => Err(HttpError::for_internal_error(
813+
"Server not initialized (no instance)".to_string(),
814+
)),
815+
VmControllerState::Created(vm) => {
816+
vm.migrate_status(migration_id).map_err(Into::into).map(|state| {
817+
HttpResponseOk(api::InstanceMigrateStatusResponse { state })
818+
})
819+
}
820+
VmControllerState::Destroyed { migrate_state_watcher, .. } => {
821+
let watcher = migrate_state_watcher.borrow();
822+
match *watcher {
823+
None => Err((MigrateError::NoMigrationInProgress).into()),
824+
Some((id, state)) if id == migration_id => {
825+
Ok(HttpResponseOk(api::InstanceMigrateStatusResponse {
826+
state,
827+
}))
828+
}
829+
Some(_) => Err((MigrateError::UuidMismatch).into()),
830+
}
831+
}
832+
}
800833
}
801834

802835
/// Issues a snapshot request to a crucible backend.

bin/propolis-server/src/lib/vm/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,12 @@ impl VmController {
650650
&self.vm_objects.monitor_rx
651651
}
652652

653+
pub fn migrate_state_watcher(
654+
&self,
655+
) -> &tokio::sync::watch::Receiver<Option<(Uuid, ApiMigrationState)>> {
656+
&self.vm_objects.migrate_state_rx
657+
}
658+
653659
/// Asks to queue a request to start a source migration task for this VM.
654660
/// The migration will have the supplied `migration_id` and will obtain its
655661
/// connection to the target by calling `upgrade_fn` to obtain a future that

phd-tests/framework/src/test_vm/mod.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -291,12 +291,12 @@ impl TestVm {
291291
pub fn migrate_from(
292292
&mut self,
293293
source: &Self,
294+
migration_id: Uuid,
294295
timeout_duration: Duration,
295296
) -> Result<()> {
296297
let _vm_guard = self.tracing_span.enter();
297298
match self.state {
298299
VmState::New => {
299-
let migration_id = Uuid::new_v4();
300300
info!(
301301
?migration_id,
302302
"Migrating from source at address {}",
@@ -348,17 +348,15 @@ impl TestVm {
348348
..Default::default()
349349
};
350350
backoff::retry(backoff, watch_migrate)
351-
.map_err(|e| anyhow!("error during migration: {}", e))?;
351+
.map_err(|e| anyhow!("error during migration: {}", e))
352352
}
353353
VmState::Ensured { .. } => {
354354
return Err(VmStateError::InstanceAlreadyEnsured.into());
355355
}
356356
}
357-
358-
Ok(())
359357
}
360358

361-
fn get_migration_state(
359+
pub fn get_migration_state(
362360
&self,
363361
migration_id: Uuid,
364362
) -> Result<MigrationState> {

phd-tests/tests/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ edition = "2021"
99
propolis-client = { workspace = true, features = ["generated"] }
1010
phd-testcase.workspace = true
1111
tracing.workspace = true
12+
uuid.workspace = true

phd-tests/tests/src/crucible/migrate.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use std::time::Duration;
33
use phd_framework::test_vm::vm_config::DiskInterface;
44
use phd_testcase::*;
55
use tracing::info;
6+
use uuid::Uuid;
67

78
#[phd_testcase]
89
fn smoke_test(ctx: &TestContext) {
@@ -30,7 +31,7 @@ fn smoke_test(ctx: &TestContext) {
3031

3132
let mut target =
3233
ctx.vm_factory.new_vm("crucible_migrate_smoke_target", config)?;
33-
target.migrate_from(&source, Duration::from_secs(60))?;
34+
target.migrate_from(&source, Uuid::new_v4(), Duration::from_secs(60))?;
3435
let lsout = target.run_shell_command("ls foo.bar")?;
3536
assert_eq!(lsout, "foo.bar");
3637
}
@@ -72,7 +73,7 @@ fn load_test(ctx: &TestContext) {
7273
// Start copying the generated file into a second file, then start a
7374
// migration while that copy is in progress.
7475
source.run_shell_command("dd if=./rand.txt of=./rand_new.txt &")?;
75-
target.migrate_from(&source, Duration::from_secs(60))?;
76+
target.migrate_from(&source, Uuid::new_v4(), Duration::from_secs(60))?;
7677

7778
// Wait for the background command to finish running, then compute the
7879
// hash of the copied file. If all went well this will match the hash of

phd-tests/tests/src/migrate.rs

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use std::time::Duration;
22

33
use phd_testcase::*;
4+
use propolis_client::handmade::api::MigrationState;
5+
use uuid::Uuid;
46

57
#[phd_testcase]
68
fn smoke_test(ctx: &TestContext) {
@@ -19,7 +21,16 @@ fn smoke_test(ctx: &TestContext) {
1921
.vm_factory
2022
.new_vm_from_cloned_config("migration_smoke_target", &source)?;
2123

22-
target.migrate_from(&source, Duration::from_secs(60))?;
24+
let migration_id = Uuid::new_v4();
25+
target.migrate_from(&source, migration_id, Duration::from_secs(60))?;
26+
27+
// Explicitly check migration status on both the source and target to make
28+
// sure it is available even after migration has finished.
29+
let src_migration_state = source.get_migration_state(migration_id)?;
30+
assert_eq!(src_migration_state, MigrationState::Finish);
31+
let target_migration_state = target.get_migration_state(migration_id)?;
32+
assert_eq!(target_migration_state, MigrationState::Finish);
33+
2334
let lsout = target.run_shell_command("ls foo.bar")?;
2435
assert_eq!(lsout, "foo.bar");
2536
}
@@ -44,7 +55,18 @@ fn incompatible_vms(ctx: &TestContext) {
4455
cfg,
4556
)?;
4657

47-
assert!(target.migrate_from(&source, Duration::from_secs(60)).is_err());
58+
let migration_id = Uuid::new_v4();
59+
assert!(target
60+
.migrate_from(&source, migration_id, Duration::from_secs(60))
61+
.is_err());
62+
63+
// Explicitly check migration status on both the source and target to
64+
// make sure it is available even after migration has finished.
65+
let src_migration_state = source.get_migration_state(migration_id)?;
66+
assert_eq!(src_migration_state, MigrationState::Error);
67+
let target_migration_state =
68+
target.get_migration_state(migration_id)?;
69+
assert_eq!(target_migration_state, MigrationState::Error);
4870
}
4971
}
5072

@@ -62,9 +84,9 @@ fn multiple_migrations(ctx: &TestContext) {
6284

6385
vm0.launch()?;
6486
vm0.wait_to_boot()?;
65-
vm1.migrate_from(&vm0, Duration::from_secs(60))?;
87+
vm1.migrate_from(&vm0, Uuid::new_v4(), Duration::from_secs(60))?;
6688
assert_eq!(vm1.run_shell_command("echo Hello world")?, "Hello world");
67-
vm2.migrate_from(&vm1, Duration::from_secs(60))?;
89+
vm2.migrate_from(&vm1, Uuid::new_v4(), Duration::from_secs(60))?;
6890
assert_eq!(
6991
vm2.run_shell_command("echo I have migrated!")?,
7092
"I have migrated!"

0 commit comments

Comments
 (0)