-
Notifications
You must be signed in to change notification settings - Fork 50
Support bundles should include log files #7973
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
842d736
7d47c84
c704351
a345482
b90e07c
e1ba8c3
23430d7
a77990a
652bf17
0c4cafa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1222,8 +1222,8 @@ impl Zfs { | |
snap_name: &'a str, | ||
properties: &'a [(&'a str, &'a str)], | ||
) -> Result<(), CreateSnapshotError> { | ||
let mut command = std::process::Command::new(ZFS); | ||
let mut cmd = command.arg("snapshot"); | ||
let mut command = std::process::Command::new(PFEXEC); | ||
let mut cmd = command.args([ZFS, "snapshot"]); | ||
for (name, value) in properties.iter() { | ||
cmd = cmd.arg("-o").arg(&format!("{name}={value}")); | ||
} | ||
|
@@ -1240,9 +1240,9 @@ impl Zfs { | |
filesystem: &str, | ||
snap_name: &str, | ||
) -> Result<(), DestroySnapshotError> { | ||
let mut command = std::process::Command::new(ZFS); | ||
let mut command = std::process::Command::new(PFEXEC); | ||
let path = format!("{filesystem}@{snap_name}"); | ||
let cmd = command.args(&["destroy", &path]); | ||
let cmd = command.args(&[ZFS, "destroy", &path]); | ||
execute(cmd).map(|_| ()).map_err(|err| DestroySnapshotError { | ||
filesystem: filesystem.to_string(), | ||
snap_name: snap_name.to_string(), | ||
|
@@ -1315,7 +1315,18 @@ pub struct Snapshot { | |
|
||
impl Snapshot { | ||
/// Return the full path to the snapshot directory within the filesystem. | ||
/// | ||
/// NB: Be careful when calling this method as it may return a `Utf8PathBuf` | ||
/// that does not actually map to a real filesystem path. On helios systems | ||
/// `rpool/ROOT/<BE>` for example will will return | ||
/// "legacy/.zfs/snapshot/<SNAP_NAME>" because the mountpoint of the dataset | ||
/// is a "legacy" mount. Additionally a fileystem with no mountpoint will | ||
/// have a zfs mountpoint property of "-". | ||
Comment on lines
+1319
to
+1324
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't love this API, it seems really easy to misuse, even with this comment. I suppose we have filed an issue here, but I think we should prioritize fixing this quickly, rather than leaving it as tech debt There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can pick this up right after we land this. I will probably steal the implementation from here and just update call sites |
||
pub fn full_path(&self) -> Result<Utf8PathBuf, GetValueError> { | ||
// TODO: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to track this in an issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed as #8023 |
||
// When a mountpoint is returned as "legacy" we could go fish around in | ||
// "/etc/mnttab". That would probably mean making this function return a | ||
// result of an option. | ||
let mountpoint = Zfs::get_value(&self.filesystem, "mountpoint")?; | ||
Ok(Utf8PathBuf::from(mountpoint) | ||
.join(format!(".zfs/snapshot/{}", self.snap_name))) | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -14,6 +14,7 @@ use camino_tempfile::tempfile_in; | |||||
use futures::FutureExt; | ||||||
use futures::StreamExt; | ||||||
use futures::future::BoxFuture; | ||||||
use futures::stream::FuturesUnordered; | ||||||
use nexus_db_model::SupportBundle; | ||||||
use nexus_db_model::SupportBundleState; | ||||||
use nexus_db_queries::authz; | ||||||
|
@@ -42,6 +43,7 @@ use tokio::io::AsyncReadExt; | |||||
use tokio::io::AsyncSeekExt; | ||||||
use tokio::io::SeekFrom; | ||||||
use tufaceous_artifact::ArtifactHash; | ||||||
use zip::ZipArchive; | ||||||
use zip::ZipWriter; | ||||||
use zip::write::FullFileOptions; | ||||||
|
||||||
|
@@ -679,6 +681,30 @@ impl BundleCollection<'_> { | |||||
); | ||||||
} | ||||||
} | ||||||
|
||||||
// For each zone we concurrently fire off a request to its | ||||||
// sled-agent to collect its logs in a zip file and write the | ||||||
// result to the support bundle. | ||||||
let zones = sled_client.support_logs().await?.into_inner(); | ||||||
let mut log_futs: FuturesUnordered<_> = zones | ||||||
.iter() | ||||||
.map(|zone| { | ||||||
save_zone_log_zip_or_error( | ||||||
log, | ||||||
&sled_client, | ||||||
zone, | ||||||
&sled_path, | ||||||
) | ||||||
}) | ||||||
.collect(); | ||||||
|
||||||
while let Some(log_collection_result) = log_futs.next().await { | ||||||
// We log any errors saving the zip file to disk and | ||||||
// continue on. | ||||||
if let Err(e) = log_collection_result { | ||||||
error!(&self.log, "failed to write logs outut: {e}"); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -790,6 +816,86 @@ async fn sha2_hash(file: &mut tokio::fs::File) -> anyhow::Result<ArtifactHash> { | |||||
Ok(ArtifactHash(digest.as_slice().try_into()?)) | ||||||
} | ||||||
|
||||||
/// For a given zone, save its service's logs into the provided destination | ||||||
/// path. This path should be the location to a per-sled directory that will end | ||||||
/// up in the final support bundle zip file. | ||||||
async fn save_zone_log_zip_or_error( | ||||||
logger: &slog::Logger, | ||||||
client: &sled_agent_client::Client, | ||||||
zone: &str, | ||||||
path: &Utf8Path, | ||||||
) -> anyhow::Result<()> { | ||||||
// In the future when support bundle collection exposes tuning parameters | ||||||
// this can turn into a collection parameter. | ||||||
const DEFAULT_MAX_ROTATED_LOGS: u32 = 5; | ||||||
|
||||||
match client.support_logs_download(zone, DEFAULT_MAX_ROTATED_LOGS).await { | ||||||
Ok(res) => { | ||||||
let bytestream = res.into_inner(); | ||||||
let output_dir = path.join(format!("logs/{zone}")); | ||||||
let output_file = output_dir.join("logs.zip"); | ||||||
|
||||||
// Ensure the logs output directory exists. | ||||||
tokio::fs::create_dir_all(&output_dir).await.with_context( | ||||||
|| format!("failed to create output directory: {output_dir}"), | ||||||
)?; | ||||||
|
||||||
let mut file = | ||||||
tokio::fs::File::create(&output_file).await.with_context( | ||||||
|| format!("failed to create file: {output_file}"), | ||||||
)?; | ||||||
|
||||||
let stream = bytestream.into_inner().map(|chunk| { | ||||||
chunk.map_err(|e| std::io::Error::other(e.to_string())) | ||||||
}); | ||||||
let mut reader = tokio_util::io::StreamReader::new(stream); | ||||||
let _nbytes = tokio::io::copy(&mut reader, &mut file).await?; | ||||||
|
||||||
// Unpack the zip so we don't end up with zip files inside of our | ||||||
// final zip | ||||||
let zipfile = output_file.clone(); | ||||||
tokio::task::spawn_blocking(move || { | ||||||
extract_zip_file(&output_dir, &zipfile) | ||||||
}) | ||||||
.await | ||||||
.map_err(|join_error| { | ||||||
anyhow::anyhow!(join_error) | ||||||
.context("unzipping support bundle logs zip panicked") | ||||||
})??; | ||||||
|
||||||
// Cleanup the zip file since we no longer need it | ||||||
if let Err(e) = tokio::fs::remove_file(&output_file).await { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to confirm: We clean this up so it doesn't get included in the final zipfile, right? (I'm just confirming the "cancel safety" of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea, that's the idea. |
||||||
error!( | ||||||
logger, | ||||||
"failed to cleanup temporary logs zip file"; | ||||||
"error" => %e, | ||||||
"file" => %output_file, | ||||||
|
||||||
); | ||||||
} | ||||||
} | ||||||
Err(err) => { | ||||||
tokio::fs::write(path.join("{zone}.logs.err"), err.to_string()) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This will generate a literal
Suggested change
|
||||||
.await?; | ||||||
} | ||||||
}; | ||||||
|
||||||
Ok(()) | ||||||
} | ||||||
|
||||||
fn extract_zip_file( | ||||||
output_dir: &Utf8Path, | ||||||
zip_file: &Utf8Path, | ||||||
) -> Result<(), anyhow::Error> { | ||||||
let mut zip = std::fs::File::open(&zip_file) | ||||||
.with_context(|| format!("failed to open zip file: {zip_file}"))?; | ||||||
let mut archive = ZipArchive::new(&mut zip)?; | ||||||
archive.extract(&output_dir).with_context(|| { | ||||||
format!("failed to extract log zip file to: {output_dir}") | ||||||
})?; | ||||||
Ok(()) | ||||||
} | ||||||
|
||||||
/// Run a `sled-dianostics` future and save its output to a corresponding file. | ||||||
async fn save_diag_cmd_output_or_error<F, S: serde::Serialize>( | ||||||
path: &Utf8Path, | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Out of curiosity, why did we not need
pfexec
before?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think because as far as I can tell zone bundles and the sled diagnostics creates are the only consumers. I ran into the issue because I was running
cargo nextest run
and I saw permission denied, where as sled-agent is running as root.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suspect this may be a "usable in tests" things -- adding the pfexec is harmless, but without it, we cannot run integration tests