From dc292f4e05a395854af6a0856550604475897060 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 3 Feb 2025 17:42:57 -0800 Subject: [PATCH] refactor: remove finch conversion, support zstd and lzma in wasm (#3521) Address https://github.com/sourmash-bio/branchwater/issues/41 Allow zstd and lzma in wasm. Removed the finch conversion because it conflicts with the lzma version, and it does not appear to have ever been used. People can stick to `0.18.0`, and we can bring it back if needed. --- Cargo.lock | 137 ++++++++++++++----------------------------- src/core/Cargo.toml | 8 +-- src/core/src/from.rs | 124 --------------------------------------- src/core/src/lib.rs | 3 - 4 files changed, 49 insertions(+), 223 deletions(-) delete mode 100644 src/core/src/from.rs diff --git a/Cargo.lock b/Cargo.lock index 9cd80dbfad..acbaa7c430 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,15 +79,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597bb81c80a54b6a4381b23faba8d7774b144c94cbd1d6fe3f1329bd776554ab" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.69.5" @@ -194,16 +185,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" -[[package]] -name = "bzip2" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" -dependencies = [ - "bzip2-sys", - "libc", -] - [[package]] name = "bzip2-sys" version = "0.1.11+1.0.8" @@ -224,12 +205,6 @@ dependencies = [ "serde", ] -[[package]] -name = "capnp" -version = "0.14.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum =
"2dca085c2c7d9d65ad749d450b19b551efaa8e3476a439bdca07aca8533097f3" - [[package]] name = "cast" version = "0.3.0" @@ -592,23 +567,6 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" -[[package]] -name = "finch" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f4b8b623c5c3af6c407c47231c75563722a5f49f99da2f6376866e154b9bf39" -dependencies = [ - "bincode", - "capnp", - "memmap", - "murmurhash3", - "needletail 0.5.1", - "rayon", - "serde", - "serde_json", - "thiserror 1.0.69", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -848,6 +806,26 @@ dependencies = [ "winapi", ] +[[package]] +name = "liblzma" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "603222e049bf0da71529325ada5d02dc3871cbd3679cf905429f7f0de93da87b" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87bb9f27519cd690390611ab3e23e8ac3e383c1f67b733a4b36c684211d7671" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "libm" version = "0.2.6" @@ -906,17 +884,6 @@ version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" -[[package]] -name = "lzma-sys" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb4b7c3eddad11d3af9e86c487607d2d2442d185d848575365c4856ba96d619" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "matrixmultiply" version = "0.3.8" @@ -939,16 +906,6 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" -[[package]] -name = "memmap" -version = 
"0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "memmap2" version = "0.9.5" @@ -1006,20 +963,6 @@ dependencies = [ "typenum", ] -[[package]] -name = "needletail" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db05a5ab397f64070d8c998fa0fbb84e484b81f95752af317dac183a82d9295d" -dependencies = [ - "buffer-redux", - "bytecount", - "bzip2", - "flate2", - "memchr", - "xz2", -] - [[package]] name = "needletail" version = "0.6.1" @@ -1039,7 +982,9 @@ checksum = "bd625dd485c2d20bdb98d7ec364f798b256ac09997ef18b4274be2168f53a647" dependencies = [ "cfg-if", "flate2", + "liblzma", "thiserror 1.0.69", + "zstd", ] [[package]] @@ -1659,7 +1604,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.18.0" +version = "0.19.0" dependencies = [ "az", "byteorder", @@ -1671,18 +1616,18 @@ dependencies = [ "criterion", "csv", "enum_dispatch", - "finch", "fixedbitset", "getrandom 0.2.15", "getset", "histogram", "itertools 0.14.0", "js-sys", + "liblzma", "log", "md5", "memmap2", "murmurhash3", - "needletail 0.6.1", + "needletail", "niffler", "nohash-hasher", "num-iter", @@ -2277,15 +2222,6 @@ dependencies = [ "tap", ] -[[package]] -name = "xz2" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c179869f34fc7c01830d3ce7ea2086bc3a07e0d35289b667d0a8bf910258926c" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yansi" version = "1.0.0-rc.1" @@ -2312,13 +2248,30 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "zstd" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" 
+version = "7.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43747c7422e2924c11144d5229878b98180ef8b06cca4ab5af37afc8a8d8ea3e" +dependencies = [ + "zstd-sys", +] + [[package]] name = "zstd-sys" -version = "2.0.7+zstd.1.5.4" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94509c3ba2fe55294d752b79842c530ccfab760192521df74a081a78d2b3c7f5" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", - "libc", "pkg-config", ] diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 7802b137d5..6f7d6c1c23 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.18.0" +version = "0.19.0" authors = ["Luiz Irber ", "N. Tessa Pierce-Ward ", "C. Titus Brown "] description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" @@ -19,7 +19,6 @@ crate-type = ["lib", "staticlib", "cdylib"] bench = false [features] -from-finch = ["dep:finch"] parallel = ["dep:rayon"] maturin = [] branchwater = ["dep:rocksdb", "parallel"] @@ -34,7 +33,6 @@ cfg-if = "1.0" counter = "0.6.0" csv = "1.3.1" enum_dispatch = "0.3.13" -finch = { version = "0.6.0", optional = true } fixedbitset = "0.4.0" getset = "0.1.4" histogram = "0.11.2" @@ -44,7 +42,9 @@ md5 = "0.7.0" memmap2 = "0.9.5" murmurhash3 = "0.0.5" needletail = { version = "0.6.1", default-features = false } -niffler = { version = "2.4.0", default-features = false, features = [ "gz" ] } +niffler = { version = "2.6.0", default-features = false, features = [ "gz", "zstd", "lzma" ] } +# declare liblzma here to allow static compilation and fix macos linking issues in Python wheel +liblzma = { version = "0.3.0", features = ["static"] } nohash-hasher = "0.2.0" num-iter = "0.1.45" once_cell = "1.18.0" diff --git a/src/core/src/from.rs b/src/core/src/from.rs deleted file mode 100644 
index 347d90afc5..0000000000 --- a/src/core/src/from.rs +++ /dev/null @@ -1,124 +0,0 @@ -use finch::sketch_schemes::mash::MashSketcher; -use finch::sketch_schemes::SketchScheme; - -use crate::encodings::HashFunctions; -use crate::sketch::minhash::KmerMinHash; - -/* - TODO: - - also convert scaled sketches - - sourmash Signature equivalent is the finch Sketch, write conversions for that too -*/ - -impl From for KmerMinHash { - fn from(other: MashSketcher) -> KmerMinHash { - let values = other.to_vec(); - - let mut new_mh = KmerMinHash::new( - 0, - values.first().unwrap().kmer.len() as u32, - HashFunctions::Murmur64Dna, - 42, - true, - values.len() as u32, - ); - - let hash_with_abunds: Vec<(u64, u64)> = - values.iter().map(|x| (x.hash, x.count as u64)).collect(); - - new_mh - .add_many_with_abund(&hash_with_abunds) - .expect("Error adding hashes with abund"); - - new_mh - } -} - -#[cfg(test)] -mod test { - use std::collections::HashMap; - use std::collections::HashSet; - - use crate::encodings::HashFunctions; - use crate::signature::SigsTrait; - use crate::sketch::minhash::KmerMinHash; - - use finch::sketch_schemes::mash::MashSketcher; - use needletail::kmer::CanonicalKmers; - use needletail::Sequence; - - use super::*; - - #[test] - fn finch_behavior() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, true, 20); - let mut b = MashSketcher::new(20, 10, 42); - - let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; - let rc = seq.reverse_complement(); - - a.add_sequence(seq, false).unwrap(); - - for (_, kmer, _) in CanonicalKmers::new(seq, &rc, 10) { - b.push(kmer, 0); - } - - let b_hashes = b.to_vec(); - - let s1: HashSet<_> = a.mins().into_iter().collect(); - let s2: HashSet<_> = b_hashes.iter().map(|x| x.hash).collect(); - let i1 = &s1 & &s2; - - assert!(i1.len() == a.size()); - assert!(i1.len() == b_hashes.len()); - - if let Some(abunds) = a.abunds() { - let mins = a.mins(); - let smap: HashMap<_, _> = 
mins.iter().zip(abunds.iter()).collect(); - println!("{:?}", smap); - for item in b_hashes.iter() { - assert!(smap.contains_key(&{ item.hash })); - assert!( - **smap.get(&{ item.hash }).unwrap() == ((item.count + item.extra_count) as u64) - ); - } - } - } - - #[test] - fn from_finch() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, true, 20); - let mut b = MashSketcher::new(20, 10, 42); - - let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; - let rc = seq.reverse_complement(); - - a.add_sequence(seq, false).unwrap(); - - for (_, kmer, _) in CanonicalKmers::new(seq, &rc, 10) { - b.push(kmer, 0); - } - - let c = KmerMinHash::from(b); - - let s1: HashSet<_> = a.mins().into_iter().collect(); - let s2: HashSet<_> = c.mins().into_iter().collect(); - let i1 = &s1 & &s2; - - assert!(i1.len() == a.mins().len()); - assert!(i1.len() == c.mins().len()); - - if let Some(a_abunds) = a.abunds() { - if let Some(c_abunds) = c.abunds() { - let a_mins = a.mins(); - let a_smap: HashMap<_, _> = a_mins.iter().zip(a_abunds.iter()).collect(); - let c_mins = c.mins(); - let c_smap: HashMap<_, _> = c_mins.iter().zip(c_abunds.iter()).collect(); - for item in a_smap.iter() { - assert!(c_smap.contains_key(*item.0)); - assert!(c_smap.get(*item.0).unwrap() == item.1); - } - } - } - } -} diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index d8f994f87c..47403dc302 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -37,9 +37,6 @@ pub mod signature; pub mod sketch; pub mod storage; -#[cfg(feature = "from-finch")] -pub mod from; - use cfg_if::cfg_if; use murmurhash3::murmurhash3_x64_128;