feat(web): sort & run #1562

Draft
wants to merge 22 commits into master (base branch) from feat/web-sort-and-run

22 commits
b1d509e
feat(web): extract per-sequence datasets; prototype sorting page
ivan-aksamentov Jan 21, 2025
201b689
feat: add dataset name to the analysis output
ivan-aksamentov Jan 22, 2025
a81044d
feat(web): parameterize state on currently viewed dataset
ivan-aksamentov Jan 28, 2025
e42542f
fix: avoid circular import between main bundle and launcher worker
ivan-aksamentov Jan 29, 2025
e2119e4
fix: initialize "datasets" atom to avoid infinite suspense
ivan-aksamentov Jan 29, 2025
0d102db
fix: mark all autosuggested datasets as selected before the run
ivan-aksamentov Jan 29, 2025
f4ff81c
refactor: fix type mismatch and remove the cast
ivan-aksamentov Jan 29, 2025
7a50591
fix(web): place sequences only to the matching dataset's tree
ivan-aksamentov Jan 29, 2025
0b0e92a
refactor: lint
ivan-aksamentov Jan 29, 2025
1d4b6fb
feat(web): add dataset selector to switch between analysis results
ivan-aksamentov Jan 29, 2025
584a791
Merge branch 'master' into feat/web-sort-and-run
ivan-aksamentov Jan 30, 2025
611d659
fix: ensure state immutability in multiAtom
ivan-aksamentov Jan 31, 2025
6923e90
fix: make sure result table shows only selected dataset
ivan-aksamentov Jan 31, 2025
221e258
fix: sort page not rendering
ivan-aksamentov Jan 31, 2025
a465aa7
feat: switch auspice tree on dataset switch
ivan-aksamentov Jan 31, 2025
8dff666
fix: prevent auspice instances to share internal state
ivan-aksamentov Jan 31, 2025
308bb18
feat: display list of suggested datasets with checkboxes on main page
ivan-aksamentov Feb 4, 2025
6892c1d
Merge branch 'master' into feat/web-sort-and-run
ivan-aksamentov Mar 6, 2025
1f18da3
feat(cli): add global optimization mode for sort command
ivan-aksamentov Mar 7, 2025
613679a
fix(cli): attempt to fix out of bounds index access
ivan-aksamentov Mar 7, 2025
ef07231
Merge pull request #1577 from nextstrain/feat/cli-sort-global
ivan-aksamentov Mar 20, 2025
bd56b1d
Merge branch 'master' into feat/web-sort-and-run
ivan-aksamentov Mar 20, 2025
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions packages/nextclade-cli/Cargo.toml
@@ -33,6 +33,7 @@ indexmap = { workspace = true }
itertools = { workspace = true }
lazy_static = { workspace = true }
log = { workspace = true }
maplit = { workspace = true }
nextclade = { path = "../nextclade" }
num_cpus = { workspace = true }
ordered-float = { workspace = true }
158 changes: 118 additions & 40 deletions packages/nextclade-cli/src/cli/nextclade_seq_sort.rs
@@ -4,14 +4,17 @@ use crate::io::http_client::HttpClient;
use eyre::{Report, WrapErr};
use itertools::Itertools;
use log::{trace, LevelFilter};
use maplit::btreemap;
use nextclade::io::csv::CsvStructFileWriter;
use nextclade::io::fasta::{FastaReader, FastaRecord, FastaWriter};
use nextclade::io::fs::path_to_string;
use nextclade::make_error;
use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION};
use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchDatasetResult, MinimizerSearchRecord};
use nextclade::sort::minimizer_search::{
find_best_datasets, run_minimizer_search, MinimizerSearchDatasetResult, MinimizerSearchRecord,
};
use nextclade::utils::option::{OptionMapMutFallible, OptionMapRefFallible};
use nextclade::utils::string::truncate;
use nextclade::{make_error, make_internal_report};
use ordered_float::OrderedFloat;
use owo_colors::OwoColorize;
use schemars::JsonSchema;
@@ -151,43 +154,122 @@ fn writer_thread(
verbose: bool,
) -> Result<(), Report> {
let NextcladeSortArgs {
input_fastas,
output_dir,
output_path,
output_results_tsv,
search_params,
..
} = args;

let template = output_path.map_ref_fallible(move |output_path| -> Result<TinyTemplate, Report> {
let mut template = TinyTemplate::new();
template
.add_template("output", output_path)
.wrap_err_with(|| format!("When parsing template: '{output_path}'"))?;
Ok(template)
})?;
if search_params.global {
// NOTE(perf): this gathers all suggestions results and discards sequence data to make sure we don't store the
// whole thing in memory. We will have to read fasta again later to write some outputs.
let results: BTreeMap<usize, _> = result_receiver
.iter()
.map(|result| (result.fasta_record.index, result.result))
.collect();

// let seqs_with_no_hits = results
// .iter()
// .filter(|(_, r)| r.datasets.is_empty())
// .map(|(fasta_index, _)| fasta_index)
// .sorted_unstable()
// .copied()
// .collect_vec();

let best_datasets = find_best_datasets(&results, search_params)?;

let mut stats = StatsPrinter::new(
"Suggested datasets for each sequence (after global optimization)",
verbose,
);
let mut reader = FastaReader::from_paths(input_fastas)?;
let mut writer = DatasetSortWriter::new(output_path, output_dir, output_results_tsv)?;
loop {
let mut record = FastaRecord::default();
reader.read(&mut record)?;
if record.is_empty() {
break;
}

let mut writers = BTreeMap::new();
let mut stats = StatsPrinter::new(verbose);
let best_dataset = best_datasets
.iter()
.find(|best_dataset| best_dataset.qry_indices.contains(&record.index));

let datasets = if let Some(best_dataset) = best_dataset {
let dataset = results[&record.index]
.datasets
.iter()
.find(|d| d.name == best_dataset.name)
.ok_or_else(|| make_internal_report!("Unable to find dataset '{}'", best_dataset.name))?;
vec![dataset.clone()]
} else {
vec![]
};
stats.print_seq(&datasets, &record.seq_name);
writer.write_one(&record, &datasets)?;
}
} else {
let mut stats = StatsPrinter::new("Suggested datasets for each sequence", verbose);
let mut writer = DatasetSortWriter::new(output_path, output_dir, output_results_tsv)?;
for MinimizerSearchRecord { fasta_record, result } in result_receiver {
let datasets = {
if result.datasets.len() == 0 {
&[]
} else if search_params.all_matches {
&result.datasets[..]
} else {
&result.datasets[0..1]
}
};
stats.print_seq(datasets, &fasta_record.seq_name);
writer.write_one(&fasta_record, datasets)?;
}
stats.finish();
}

let mut results_csv =
output_results_tsv.map_ref_fallible(|output_results_tsv| CsvStructFileWriter::new(output_results_tsv, b'\t'))?;
Ok(())
}

for record in result_receiver {
let datasets = &{
if search_params.all_matches {
record.result.datasets
} else {
record.result.datasets.into_iter().take(1).collect_vec()
}
};
pub struct DatasetSortWriter<'t> {
writers: BTreeMap<PathBuf, FastaWriter>,
results_csv: Option<CsvStructFileWriter>,
output_dir: Option<PathBuf>,
template: Option<TinyTemplate<'t>>,
}

stats.print_seq(datasets, &record.fasta_record.seq_name);
impl<'t> DatasetSortWriter<'t> {
pub fn new(
output_path: &'t Option<String>,
output_dir: &Option<PathBuf>,
output_results_tsv: &Option<String>,
) -> Result<Self, Report> {
let template = output_path.map_ref_fallible(move |output_path| -> Result<TinyTemplate<'t>, Report> {
let mut template = TinyTemplate::new();
template
.add_template("output", output_path)
.wrap_err_with(|| format!("When parsing template: '{output_path}'"))?;
Ok(template)
})?;

let results_csv =
output_results_tsv.map_ref_fallible(|output_results_tsv| CsvStructFileWriter::new(output_results_tsv, b'\t'))?;

Ok(Self {
writers: btreemap! {},
results_csv,
output_dir: output_dir.clone(),
template,
})
}

pub fn write_one(&mut self, record: &FastaRecord, datasets: &[MinimizerSearchDatasetResult]) -> Result<(), Report> {
if datasets.is_empty() {
results_csv.map_mut_fallible(|results_csv| {
self.results_csv.map_mut_fallible(|results_csv| {
results_csv.write(&SeqSortCsvEntry {
index: record.fasta_record.index,
seq_name: &record.fasta_record.seq_name,
index: record.index,
seq_name: &record.seq_name,
dataset: None,
score: None,
num_hits: None,
@@ -196,38 +278,34 @@ fn writer_thread(
}

for dataset in datasets {
results_csv.map_mut_fallible(|results_csv| {
self.results_csv.map_mut_fallible(|results_csv| {
results_csv.write(&SeqSortCsvEntry {
index: record.fasta_record.index,
seq_name: &record.fasta_record.seq_name,
index: record.index,
seq_name: &record.seq_name,
dataset: Some(&dataset.name),
score: Some(dataset.score),
num_hits: Some(dataset.n_hits),
})
})?;
}

let names = datasets
let dataset_names = datasets
.iter()
.map(|dataset| get_all_prefix_names(&dataset.name))
.collect::<Result<Vec<Vec<String>>, Report>>()?
.into_iter()
.flatten()
.unique();

for name in names {
let filepath = get_filepath(&name, &template, output_dir)?;

for name in dataset_names {
let filepath = get_filepath(&name, &self.template, &self.output_dir)?;
if let Some(filepath) = filepath {
let writer = get_or_insert_writer(&mut writers, filepath)?;
writer.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)?;
let writer = get_or_insert_writer(&mut self.writers, filepath)?;
writer.write(&record.seq_name, &record.seq, false)?;
}
}
Ok(())
}

stats.finish();

Ok(())
}

pub fn get_all_prefix_names(name: impl AsRef<str>) -> Result<Vec<String>, Report> {
@@ -250,9 +328,9 @@ struct StatsPrinter {
}

impl StatsPrinter {
fn new(enabled: bool) -> Self {
fn new(title: impl AsRef<str>, enabled: bool) -> Self {
if enabled {
println!("Suggested datasets for each sequence");
println!("{}", title.as_ref());
println!("{}┐", "─".repeat(110));
println!(
"{:^40} │ {:^40} │ {:^10} │ {:^10} │",
14 changes: 13 additions & 1 deletion packages/nextclade-cli/src/dataset/dataset_download.rs
@@ -150,6 +150,7 @@ pub fn dataset_zip_load(
}

Ok(NextcladeParams {
dataset_name: dataset_zip.to_str().unwrap().to_owned(),
ref_record,
gene_map,
tree,
@@ -296,6 +297,7 @@ pub fn dataset_dir_load(
}

Ok(NextcladeParams {
dataset_name: dataset_dir.to_str().unwrap().to_owned(),
ref_record,
gene_map,
tree,
@@ -344,14 +346,17 @@ pub fn dataset_json_load(
}

NextcladeParamsOptional {
dataset_name: dataset_json.to_str().map(ToOwned::to_owned),
ref_record,
gene_map,
tree,
virus_properties,
}
};

NextcladeParams::from_auspice(&auspice_json, &overrides, cdses)
// TODO: should we support multiple datasets here?
let mut datasets = NextcladeParams::from_auspice(&auspice_json, &overrides, cdses)?;
Ok(datasets.remove(0))
}

pub fn dataset_individual_files_load(
@@ -395,6 +400,12 @@ pub fn dataset_individual_files_load(
}

Ok(NextcladeParams {
dataset_name: run_args
.inputs
.input_pathogen_json
.as_ref()
.map(|s| s.to_str().unwrap().to_owned())
.unwrap_or_default(),
ref_record,
gene_map,
tree,
@@ -470,6 +481,7 @@ pub fn dataset_str_download_and_load(
}

Ok(NextcladeParams {
dataset_name: name.to_owned(),
ref_record,
gene_map,
tree,
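A small aside on the dataset_download.rs changes above: the zip, dir, and individual-files loaders fill the new dataset_name field straight from their input paths via .to_str().unwrap(), which panics if a path is not valid UTF-8. As a hedged suggestion only (not something this PR does), a panic-free alternative could use a lossy conversion:

use std::path::Path;

// Hypothetical helper: derive a display name for a dataset from its input path
// without panicking on non-UTF-8 paths (invalid bytes become U+FFFD).
fn dataset_name_from_path(path: &Path) -> String {
    path.to_string_lossy().into_owned()
}

fn main() {
    assert_eq!(dataset_name_from_path(Path::new("data/sars-cov-2")), "data/sars-cov-2");
}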
1 change: 1 addition & 0 deletions packages/nextclade-web/.eslintrc.js
@@ -115,6 +115,7 @@ module.exports = {
'jsx-a11y/label-has-associated-control': ['warn', { assert: 'either' }],
'lodash/chaining': 'off',
'lodash/import-scope': 'off',
'lodash/matches-prop-shorthand': 'off',
'lodash/prefer-constant': 'off',
'lodash/prefer-lodash-chain': 'off',
'lodash/prefer-lodash-method': 'off',
10 changes: 3 additions & 7 deletions packages/nextclade-web/src/components/Common/MutationBadge.tsx
@@ -1,6 +1,7 @@
import React from 'react'
import { useRecoilValue } from 'recoil'
import { isNil } from 'lodash'
import { viewedDatasetNameAtom } from 'src/state/dataset.state'
import styled, { useTheme } from 'styled-components'
import { shade } from 'polished'
import type { AaSub, NucSub } from 'src/types'
@@ -52,12 +53,6 @@ export const PositionText = styled.span`
color: ${(props) => props.theme.gray800};
`

export const VersionText = styled.span`
padding: 1px 2px;
background-color: ${(props) => props.theme.gray400};
color: ${(props) => props.theme.gray800};
`

export interface NucleotideMutationBadgeProps {
mutation: NucSub
}
@@ -99,7 +94,8 @@ export function AminoacidMutationBadge({ mutation }: AminoacidMutationBadgeProps
const theme = useTheme()

const { cdsName, refAa, qryAa, pos } = mutation
const cds = useRecoilValue(cdsAtom(cdsName))
const datasetName = useRecoilValue(viewedDatasetNameAtom)
const cds = useRecoilValue(cdsAtom({ datasetName, cdsName }))

const geneBg = cds?.color ?? '#999'
const refBg = getAminoacidColor(refAa)
6 changes: 6 additions & 0 deletions packages/nextclade-web/src/components/Export/ExportPage.tsx
@@ -1,5 +1,6 @@
import { useRouter } from 'next/router'
import React, { useState } from 'react'
import { ViewedDatasetSelector } from 'src/components/Main/ViewedDatasetSelector'
import styled from 'styled-components'
import { TabContent, TabLabel, TabNav, TabPane } from 'src/components/Common/TabsFull'
import { ExportTabColumnConfig } from 'src/components/Export/ExportTabColumnConfig'
@@ -15,6 +16,11 @@ export function ExportPage() {
return (
<Layout>
<Container>
<div>
<label> {t('Select dataset')}</label>
<ViewedDatasetSelector />
</div>

<Header>
<h4 className="mx-auto">{t('Download output files')}</h4>
</Header>