Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: add manifest summarization #3030

Open
wants to merge 14 commits into
base: latest
Choose a base branch
from
59 changes: 58 additions & 1 deletion src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use camino::Utf8PathBuf as PathBuf;

use crate::encodings::Idx;
use crate::manifest::{Manifest, Record};
use crate::manifest::{Manifest, Record, RecordSummary};
use crate::prelude::*;
use crate::storage::{FSStorage, InnerStorage, MemStorage, SigStore, ZipStorage};
use crate::{Error, Result};
Expand Down Expand Up @@ -187,6 +187,10 @@
assert_eq!(sig.signatures.len(), 1);
Ok(sig)
}

/// Summarizes the records of this collection's manifest.
///
/// Delegates to the manifest's `summarize` and wraps the resulting
/// `Vec<RecordSummary>` in `Ok`; as written, this method never produces an `Err`.
pub fn summarize(&self) -> Result<Vec<RecordSummary>> {

Check warning on line 191 in src/core/src/collection.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/collection.rs#L191

Added line #L191 was not covered by tests
Ok(self.manifest.summarize())
}
}

impl Select for Collection {
Expand All @@ -205,6 +209,7 @@
use super::Collection;

use crate::encodings::HashFunctions;
use crate::manifest::write_summary;
use crate::prelude::Select;
use crate::selection::Selection;
use crate::signature::Signature;
Expand Down Expand Up @@ -405,4 +410,56 @@
assert_eq!(this_mh.scaled(), 100);
}
}

#[test]
fn collection_summarize_zipfile() {
    // Resolve the zip fixture relative to the crate root.
    let crate_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let zip_path: PathBuf = crate_root.join("../../tests/test-data/prot/all.zip");

    let collection = Collection::from_zipfile(&zip_path).unwrap();
    let summaries = collection.summarize().unwrap();
    let report = write_summary(&summaries);

    // Overall totals.
    assert!(report.contains("num signatures: 8"));
    assert!(report.contains("total hashes: 31758"));

    // One line per distinct parameter combination; checked with `contains`
    // so the test does not depend on the order of the summary lines.
    assert!(report.contains("2 sketches with DNA, k=31, scaled=1000 10415 total hashes"));
    assert!(report.contains("2 sketches with protein, k=19, scaled=100 8214 total hashes"));
    assert!(report.contains("2 sketches with dayhoff, k=19, scaled=100 7945 total hashes"));
    assert!(report.contains("2 sketches with hp, k=19, scaled=100 5184 total hashes"));
}

// #[test]
// fn collection_summarize_rocksdb() {
// use crate::index::revindex::RevIndex;
// let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

// let db_file = PathBuf::from("../../tests/test-data/revindex/all.dna-k31-sc1000.rocksdb");

// let db_path = base_path.join(&db_file);

// let db = RevIndex::open(db_path, true, None).unwrap();
// let collection = db.load_collection_from_rocksdb();

// let summaries = collection.summarize().unwrap();

// let summary_output = write_summary(&summaries);

// assert!(summary_output.contains("num signatures: 2"));
// assert!(summary_output.contains("total hashes: 10415"));
// assert!(summary_output.contains("2 sketches with DNA, k=31, scaled=1000 10415 total hashes"));
// }
}
148 changes: 141 additions & 7 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
use std::collections::HashMap;
use std::convert::TryInto;

Check failure on line 2 in src/core/src/manifest.rs

View workflow job for this annotation

GitHub Actions / Lints (beta)

the item `TryInto` is imported redundantly

Check warning on line 2 in src/core/src/manifest.rs

View workflow job for this annotation

GitHub Actions / test (beta)

the item `TryInto` is imported redundantly

Check warning on line 2 in src/core/src/manifest.rs

View workflow job for this annotation

GitHub Actions / test (beta)

the item `TryInto` is imported redundantly
use std::fmt::Write as FmtWrite;
use std::fs::File;
use std::io::{BufRead, BufReader, Read, Write};
use std::ops::Deref;
Expand All @@ -7,15 +10,16 @@
#[cfg(feature = "parallel")]
use rayon::prelude::*;
use serde::de;
use serde::{Deserialize, Serialize};
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize, Serializer};

use crate::encodings::HashFunctions;
use crate::prelude::*;
use crate::signature::SigsTrait;
use crate::sketch::Sketch;
use crate::Result;

#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters, Eq)]
pub struct Record {
#[getset(get = "pub", set = "pub")]
internal_location: PathBuf,
Expand Down Expand Up @@ -72,6 +76,16 @@
}
}

/// Equality for `Record` looks only at the sketch parameters: `ksize`,
/// `moltype`, `num`, `scaled` and `with_abundance`. No other field is
/// consulted, so two records that differ elsewhere still compare equal.
impl PartialEq for Record {
    fn eq(&self, other: &Self) -> bool {
        // Compare the parameter tuples by reference; tuple equality is
        // element-wise, matching a chain of `==` joined with `&&`.
        let lhs = (
            &self.ksize,
            &self.moltype,
            &self.num,
            &self.scaled,
            &self.with_abundance,
        );
        let rhs = (
            &other.ksize,
            &other.moltype,
            &other.num,
            &other.scaled,
            &other.with_abundance,
        );
        lhs == rhs
    }
}

#[derive(Debug, Default, Serialize, Deserialize, Clone)]
pub struct Manifest {
records: Vec<Record>,
Expand Down Expand Up @@ -198,6 +212,34 @@
pub fn iter(&self) -> impl Iterator<Item = &Record> {
self.records.iter()
}

/// Groups the manifest's records by sketch parameters and totals each group.
///
/// Records that share the same `(ksize, moltype, num, scaled, with_abundance)`
/// are folded into a single [`RecordSummary`] whose `count` is the number of
/// such records and whose `total_n_hashes` is the sum of their `n_hashes`.
///
/// The returned vector is sorted by `moltype`, then `ksize`, `scaled`, `num`
/// and `with_abundance`. `HashMap` iteration order is arbitrary, so without
/// the sort the order of the summaries (and of any report or serialization
/// built from them) could differ from one run to the next.
pub fn summarize(&self) -> Vec<RecordSummary> {
    let mut summary_map: HashMap<(u32, String, u32, u64, bool), RecordSummary> = HashMap::new();

    for record in self.iter() {
        let key = (
            record.ksize(),
            record.moltype.clone(),
            record.num,
            record.scaled,
            record.with_abundance(),
        );
        // First record of a group creates a zeroed summary; every record
        // (including the first) then bumps the counters below.
        let entry = summary_map.entry(key).or_insert_with(|| RecordSummary {
            ksize: record.ksize(),
            moltype: record.moltype.clone(),
            num: record.num,
            scaled: record.scaled,
            with_abundance: record.with_abundance(),
            count: 0,
            total_n_hashes: 0,
        });

        entry.count += 1;
        entry.total_n_hashes += record.n_hashes;
    }

    let mut summaries: Vec<RecordSummary> = summary_map.into_values().collect();
    // Group keys are unique, so an unstable sort is sufficient and the
    // resulting order is fully determined by the parameters themselves.
    summaries.sort_unstable_by(|a, b| {
        (&a.moltype, a.ksize, a.scaled, a.num, a.with_abundance).cmp(&(
            &b.moltype,
            b.ksize,
            b.scaled,
            b.num,
            b.with_abundance,
        ))
    });
    summaries
}
}

impl Select for Manifest {
Expand Down Expand Up @@ -323,18 +365,80 @@
}
}

/// Aggregate over all manifest records that share one combination of sketch
/// parameters (`ksize`, `moltype`, `num`, `scaled`, `with_abundance`).
#[derive(Debug, Hash, PartialEq, Eq, Clone, Deserialize)]
pub struct RecordSummary {
// Sketch parameters that identify the group; copied from the records.
ksize: u32,
moltype: String,
num: u32,
scaled: u64,
with_abundance: bool,
// Fields for counting and aggregation:
// `count` is the number of records in the group,
// `total_n_hashes` is the sum of those records' `n_hashes`.
count: usize,
total_n_hashes: usize,
}

impl Serialize for RecordSummary {
    /// Serializes the summary as a struct named `RecordSummary` with seven
    /// fields: the five sketch parameters followed by `count` and
    /// `total_n_hashes`.
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let mut fields = serializer.serialize_struct("RecordSummary", 7)?;

        // Sketch parameters identifying the group.
        fields.serialize_field("ksize", &self.ksize)?;
        fields.serialize_field("moltype", &self.moltype)?;
        fields.serialize_field("num", &self.num)?;
        fields.serialize_field("scaled", &self.scaled)?;
        fields.serialize_field("with_abundance", &self.with_abundance)?;

        // Aggregated counters.
        fields.serialize_field("count", &self.count)?;
        fields.serialize_field("total_n_hashes", &self.total_n_hashes)?;

        fields.end()
    }
}

/// Human-readable one-line form of a summary: the record count, molecule
/// type, k-mer size, `scaled` value and total hash count, in that order.
/// `num` and `with_abundance` are not part of the rendered line.
impl std::fmt::Display for RecordSummary {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {

Check warning on line 398 in src/core/src/manifest.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/manifest.rs#L398

Added line #L398 was not covered by tests
write!(
f,
"{} sketches with {}, k={}, scaled={} {} total hashes",
self.count, self.moltype, self.ksize, self.scaled, self.total_n_hashes
)
}
}

/// Renders a plain-text report for a slice of record summaries.
///
/// The report contains, in order: the total number of signatures (sum of
/// every summary's `count`), a fixed "examining manifest" line, the total
/// number of hashes (sum of every `total_n_hashes`), a heading, and then one
/// line per summary using its `Display` form. Summaries are listed in the
/// order they appear in `summaries`.
pub fn write_summary(summaries: &[RecordSummary]) -> String {
    let num_signatures: usize = summaries.iter().map(|item| item.count).sum();
    let total_hashes: usize = summaries.iter().map(|item| item.total_n_hashes).sum();

    let mut report = String::new();

    // Formatting into a `String` cannot fail, hence the `unwrap`s.
    writeln!(report, "num signatures: {}", num_signatures).unwrap();
    report.push_str("** examining manifest...\n");
    writeln!(report, "total hashes: {}", total_hashes).unwrap();
    report.push_str("summary of sketches:\n");

    for item in summaries {
        writeln!(report, " {}", item).unwrap();
    }

    report
}

#[cfg(test)]
mod test {
use super::{write_summary, Manifest};
use crate::collection::Collection;
use crate::encodings::HashFunctions;
use crate::selection::{Select, Selection};
use camino::Utf8PathBuf as PathBuf;
use std::fs::File;
use std::io::Write;
use tempfile::TempDir;

use super::Manifest;
use crate::collection::Collection;
use crate::encodings::HashFunctions;
use crate::selection::{Select, Selection};

#[test]
fn manifest_from_pathlist() {
let temp_dir = TempDir::new().unwrap();
Expand Down Expand Up @@ -479,4 +583,34 @@
let scaled100 = manifest.select(&selection).unwrap();
assert_eq!(scaled100.len(), 6);
}

#[test]
fn manifest_summarize() {
    // Two protein signatures; the expected output below shows them folded
    // into a single summary group.
    let crate_root = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let sig_paths: Vec<PathBuf> = [
        "../../tests/test-data/prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig",
        "../../tests/test-data/prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig",
    ]
    .iter()
    .map(|relative| crate_root.join(relative))
    .collect();

    let manifest = Manifest::from(&sig_paths[..]);
    let summaries = manifest.summarize();

    // Check both the JSON serialization and the text report.
    let serialized_summaries = serde_json::to_string(&summaries).unwrap();
    let output = write_summary(&summaries);

    let expected_output = "num signatures: 2\n** examining manifest...\ntotal hashes: 8214\nsummary of sketches:\n 2 sketches with protein, k=19, scaled=100 8214 total hashes\n";
    let expected_serialized = "[{\"ksize\":19,\"moltype\":\"protein\",\"num\":0,\"scaled\":100,\"with_abundance\":false,\"count\":2,\"total_n_hashes\":8214}]";

    assert_eq!(output, expected_output);
    assert_eq!(serialized_summaries.trim(), expected_serialized);
}
}
Loading