-
Notifications
You must be signed in to change notification settings - Fork 50
Do not panic in switch synchronization task #8714
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -887,16 +887,41 @@ impl BackgroundTask for SwitchPortSettingsManager { | |
} | ||
}; | ||
|
||
// TODO: is this correct? Do we place the BgpConfig for both switches in a single Vec to send to the bootstore? | ||
let mut bgp: Vec<SledBgpConfig> = switch_bgp_config.iter().map(|(_location, (_id, config))| { | ||
let announcements = bgp_announce_prefixes | ||
.get(&config.bgp_announce_set_id) | ||
.expect("bgp config is present but announce set is not populated") | ||
.iter() | ||
.map(|prefix| { | ||
Ipv4Net::new(prefix.value, prefix.length) | ||
.expect("Prefix4 and Ipv4Net's value types have diverged") | ||
}).collect(); | ||
let prefixes = match bgp_announce_prefixes | ||
.get(&config.bgp_announce_set_id) { | ||
Some(v) => v.clone(), | ||
None => { | ||
// There should always be a set of prefixes for a given announce set id. | ||
// If this is `None` we need to audit the history of the specified bgp config | ||
// and announce set to see what went wrong. | ||
error!( | ||
log, | ||
"bgp config references an announce set that does not exist"; | ||
"bgp_config_id" => ?config.id(), | ||
"announce_set_id" => ?config.bgp_announce_set_id, | ||
); | ||
vec![] | ||
}, | ||
}; | ||
|
||
let mut announcements = vec![]; | ||
|
||
for prefix in prefixes { | ||
let net = match Ipv4Net::new(prefix.value, prefix.length) { | ||
Ok(v) => v, | ||
Err(e) => { | ||
error!( | ||
log, | ||
"failed to create Ipv4Net from Prefix4"; | ||
"prefix4" => ?prefix, | ||
"error" => %DisplayErrorChain::new(&e), | ||
Comment on lines
+911
to
+918
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This conversion seems like it should be infallible, I think? Could If a |
||
); | ||
continue; | ||
}, | ||
}; | ||
announcements.push(net); | ||
} | ||
|
||
SledBgpConfig { | ||
asn: config.asn.0, | ||
|
@@ -915,6 +940,21 @@ impl BackgroundTask for SwitchPortSettingsManager { | |
continue; | ||
}; | ||
|
||
let port_settings_id = match port.port_settings_id { | ||
Some(id) => id, | ||
None => { | ||
// Theoretically this should never be possible. If we have a PortSettingsChange::Apply, that | ||
// means we have an active port_settings record and the port_settings_id should be `Some(Uuid)`. | ||
Comment on lines
+946
to
+947
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this actually impossible, as in it's guaranteed that the settings we read from the DB always have an ID? If so, is there a way to structure the types such that the ID is not optional? |
||
error!( | ||
log, | ||
"PortSettingsChange::Apply without a port_settings_id"; | ||
"port" => ?port, | ||
"info" => ?info, | ||
); | ||
continue; | ||
}, | ||
}; | ||
|
||
let peer_configs = match self.datastore.bgp_peer_configs(opctx, *location, port.port_name.to_string()).await { | ||
Ok(v) => v, | ||
Err(e) => { | ||
|
@@ -1028,7 +1068,7 @@ impl BackgroundTask for SwitchPortSettingsManager { | |
.datastore | ||
.communities_for_peer( | ||
opctx, | ||
port.port_settings_id.unwrap(), | ||
port_settings_id, | ||
PHY0, //TODO https://github.com/oxidecomputer/omicron/issues/3062 | ||
IpNetwork::from(IpAddr::from(peer.addr)) | ||
).await { | ||
|
@@ -1046,7 +1086,7 @@ impl BackgroundTask for SwitchPortSettingsManager { | |
//TODO consider awaiting in parallel and joining | ||
let allow_import = match self.datastore.allow_import_for_peer( | ||
opctx, | ||
port.port_settings_id.unwrap(), | ||
port_settings_id, | ||
PHY0, //TODO https://github.com/oxidecomputer/omicron/issues/3062 | ||
IpNetwork::from(IpAddr::from(peer.addr)), | ||
).await { | ||
|
@@ -1070,7 +1110,7 @@ impl BackgroundTask for SwitchPortSettingsManager { | |
|
||
let allow_export = match self.datastore.allow_export_for_peer( | ||
opctx, | ||
port.port_settings_id.unwrap(), | ||
port_settings_id, | ||
PHY0, //TODO https://github.com/oxidecomputer/omicron/issues/3062 | ||
IpNetwork::from(IpAddr::from(peer.addr)), | ||
).await { | ||
|
@@ -1301,14 +1341,29 @@ impl BackgroundTask for SwitchPortSettingsManager { | |
} | ||
|
||
// if at least one succeeded, record this update in the db | ||
let config = match serde_json::to_value(&desired_config) { | ||
Ok(data) => { | ||
BootstoreConfig { | ||
key: NETWORK_KEY.into(), | ||
generation: desired_config.generation as i64, | ||
data, | ||
time_created: chrono::Utc::now(), | ||
time_deleted: None, | ||
} | ||
} | ||
Err(e) => { | ||
error!( | ||
log, | ||
"error while serializing config for storage"; | ||
"config" => ?desired_config, | ||
"error" => %e, | ||
); | ||
// if we cannot successfully serialize the config, we cannot store the config | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this one should go back to being an
|
||
continue; | ||
} | ||
}; | ||
|
||
if one_succeeded { | ||
let config = BootstoreConfig { | ||
key: NETWORK_KEY.into(), | ||
generation: desired_config.generation as i64, | ||
data: serde_json::to_value(&desired_config).unwrap(), | ||
time_created: chrono::Utc::now(), | ||
time_deleted: None, | ||
}; | ||
if let Err(e) = self.datastore.ensure_bootstore_config(opctx, config.clone()).await { | ||
// if this fails, worst case scenario is that we will send the bootstore | ||
// information it already has on the next run | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Getting rid of potential panics is good, but this change makes me nervous:
sync_switch_configuration
background task #8579 specifically because of the panic; if this had been an error log all along, would we have just logged an "ought to be impossible" condition and not even realized it had happened?