24 changes: 16 additions & 8 deletions common_scripts/clade_translator.py
@@ -1,8 +1,9 @@
#!/bin/env python3
import os
import copy
from Bio import SeqIO, AlignIO, SeqRecord, Seq
import os

import numpy as np
from Bio import AlignIO, Seq, SeqIO, SeqRecord


def parse_args():
@@ -51,23 +52,30 @@ def get_coordinate_map(ref, qry):

coord_maps = {'nuc': get_coordinate_map(orig_ref.seq, new_ref.seq)}
for f in orig_features:
coord_maps[f] = get_coordinate_map(orig_features[f].extract(orig_ref).translate().seq,
new_features[f].extract(new_ref).translate().seq)
try:
coord_maps[f] = get_coordinate_map(orig_features[f].extract(orig_ref).translate().seq,
new_features[f].extract(new_ref).translate().seq)
except:
print(f"Could not map {f}")


with open(args.clades) as f:
clades = [l.strip().split('\t') for l in f]

with open(args.output_clades, 'w') as f:
f.write("clade\tgene\tsite\talt\n")
for clade in clades:
if (len(clade) < 4) or clades[0][0]=='#':
f.write('\t'.join(clade) + '\n')
continue

if clade[1] in coord_maps:
new_pos = max(0,coord_maps[clade[1]][0][int(clade[2])-1])+1
f.write('\t'.join([clade[0], clade[1], str(new_pos),clade[3]]) + '\n')

else:
try:
new_pos = max(0,coord_maps[clade[1]][0][int(clade[2])-1])+1
f.write('\t'.join([clade[0], clade[1], str(new_pos),clade[3]]) + '\n')
except:
print(f"Could not map {clade}")

if clade[1] == 'clade':
f.write('\t'.join(clade) + '\n')
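The net effect in clade_translator.py: a gene whose translation cannot be mapped between the two references, or a clade row whose site falls outside the coordinate map, is now reported and skipped instead of aborting the whole run. A minimal standalone sketch of the per-row translation step, assuming (as in the script) that coord_maps maps a gene name to a sequence whose first element is an array of new 0-based positions indexed by old 0-based position, with -1 marking unmapped sites; the toy map and row values below are hypothetical:

import numpy as np

def translate_clade_row(clade_row, coord_maps):
    """Remap one (clade, gene, site, alt) row to the new reference's coordinates,
    returning None if the gene or site cannot be mapped."""
    name, gene, site, alt = clade_row
    try:
        # old 1-based site -> old 0-based index -> new 0-based index;
        # unmapped sites are -1, which max(0, ...) clamps to position 1
        new_pos = max(0, coord_maps[gene][0][int(site) - 1]) + 1
        return [name, gene, str(new_pos), alt]
    except (KeyError, IndexError, ValueError):
        print(f"Could not map {clade_row}")
        return None

# hypothetical toy map: HA1 positions 1..5 shift by +2 in the new reference
toy_maps = {"HA1": (np.arange(2, 7),)}
print(translate_clade_row(["3C.2a1b", "HA1", "2", "K"], toy_maps))   # ['3C.2a1b', 'HA1', '4', 'K']
print(translate_clade_row(["3C.2a1b", "PB2", "10", "V"], toy_maps))  # warns, returns None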

82 changes: 49 additions & 33 deletions flu/Snakefile
@@ -9,7 +9,6 @@ wildcard_constraints:
reference="[^_/]+",



rule all:
input:
[
@@ -19,23 +18,23 @@ rule all:
for reference in config["builds"][strain][segment]
],


rule fetch_data:
output:
raw_sequences="data/{strain}/raw_{segment}.fasta"
raw_sequences="data/{strain}/raw_{segment}.fasta",
shell:
"""
scp -r [email protected]:/scicore/home/neher/neher/nextstrain/seasonal-flu/data/{wildcards.strain}/raw_{wildcards.segment}.fasta {output.raw_sequences}
"""


rule download_clades:
message:
"Downloading clade definitions for {wildcards.strain} from {params.source} -> {output}"
output:
"data/clades_{strain}_{segment}_{reference}_raw.tsv",
params:
source=lambda w: config["builds"][w.strain][w.segment][w.reference][
"clade_url"
],
source=lambda w: config["builds"][w.strain][w.segment][w.reference]["clade_url"],
shell:
"curl {params.source} | sed '/V1A\\tHA1\\t146\\tI/d' >{output}"

@@ -58,6 +57,7 @@ rule offset_clades:
>{output}
"""


rule download_includes:
output:
"data/includes_{strain}_{segment}_{reference}.tsv",
@@ -99,8 +99,10 @@ rule parse:


def genes(w):
if w.segment=='ha': return ["SigPep", "HA1", "HA2"]
if w.segment=='na': return ["NA"]
if w.segment == "ha":
return ["SigPep", "HA1", "HA2"]
if w.segment == "na":
return ["NA"]


rule subsample:
@@ -112,13 +114,15 @@ rule subsample:
sampled_sequences="build/{strain}/{segment}/{reference}/subsample_raw.fasta",
sampled_strains="build/{strain}/{segment}/{reference}/subsample_raw.txt",
params:
filter_arguments=lambda w: config["builds"][w.strain][w.segment][
w.reference
]["filter"],
reference_EPI_ISL=lambda w: config["builds"][w.strain][w.segment][
w.reference
]["reference_EPI_ISL"],
other_include = lambda w:config["builds"][w.strain][w.segment][w.reference].get("include_file","")
filter_arguments=lambda w: config["builds"][w.strain][w.segment][w.reference][
"filter"
],
reference_EPI_ISL=lambda w: config["builds"][w.strain][w.segment][w.reference][
"reference_EPI_ISL"
],
other_include=lambda w: config["builds"][w.strain][w.segment][w.reference].get(
"include_file", ""
),
shell:
"""
augur filter \
@@ -225,15 +229,16 @@ rule tree:
> /dev/null
"""


# root using dates in treetime, use 1500 as sequence length (good enough, doesn't matter)
rule root:
input:
tree=rules.tree.output.tree,
metadata = rules.parse.output.metadata,
metadata=rules.parse.output.metadata,
output:
tree="build/{strain}/{segment}/{reference}/tree_rooted.nwk",
params:
outdir = "build/{strain}/{segment}/{reference}/tt_out"
outdir="build/{strain}/{segment}/{reference}/tt_out",
shell:
"""
treetime clock \
@@ -245,6 +250,7 @@ rule root:
cp {params.outdir}/rerooted.newick {output.tree}
"""


# refine while keeping the root
rule refine:
input:
@@ -344,21 +350,26 @@ rule clades:
> /dev/null
"""


## TODO explicitly relabel clade branch labels to differentiate long and short ones
# currently long ones are overwritten by short ones.
rule make_short_clades:
input:
clades=rules.offset_clades.output,
output:
clades = "data/clades-short_{strain}_{segment}_{reference}.tsv"
clades="data/clades-short_{strain}_{segment}_{reference}.tsv",
run:
with open(str(input.clades)) as fh:
clades = fh.readlines()

for contraction in config["builds"][wildcards.strain][wildcards.segment][wildcards.reference].get("clade_contractions", []):
clades = [x.replace(contraction['orig'], contraction['short']) for x in clades]
for contraction in config["builds"][wildcards.strain][wildcards.segment][
wildcards.reference
].get("clade_contractions", []):
clades = [
x.replace(contraction["orig"], contraction["short"]) for x in clades
]

with open(str(output.clades), 'w') as fh:
with open(str(output.clades), "w") as fh:
for line in clades:
fh.write(line)

@@ -370,7 +381,7 @@ rule clades_short:
tree=rules.refine.output.tree,
aa_muts=rules.aa_muts_explicit.output.node_data,
nuc_muts=rules.ancestral.output.node_data,
clades = "data/clades-short_{strain}_{segment}_{reference}.tsv"
clades="data/clades-short_{strain}_{segment}_{reference}.tsv",
output:
node_data="build/{strain}/{segment}/{reference}/clades-short.json",
shell:
@@ -383,26 +394,27 @@ rule clades_short:
sed -i 's/clade_membership/short_clade/' {output.node_data}
"""


# make sure all differences between the alignment reference and the root are attached as mutations to the root
rule attach_root_mutations:
input:
aa_muts=rules.aa_muts_explicit.output.node_data,
nuc_muts=rules.ancestral.output.node_data,
translations = rules.align.output.alignment,
tree = rules.refine.output.tree
translations=rules.align.output.alignment,
tree=rules.refine.output.tree,
output:
aa_muts="build/{strain}/{segment}/{reference}/aa_muts_adapted.json",
nuc_muts="build/{strain}/{segment}/{reference}/nuc_muts_adapted.json"
nuc_muts="build/{strain}/{segment}/{reference}/nuc_muts_adapted.json",
params:
genes = genes,
genes=genes,
translations=lambda w: expand(
"build/{strain}/{segment}/{reference}/aligned.gene.{genes}.fasta",
strain=w.strain,
segment=w.segment,
genes=genes(w),
reference=w.reference,
),
reference = lambda w: w.reference
reference=lambda w: w.reference,
shell:
"""
python3 ../common_scripts/attach_root_mutations.py \
@@ -428,7 +440,9 @@ def get_node_data(w):
node_data.append("build/{strain}/{segment}/{reference}/clades.json".format(**w))

if "clade_contractions" in config["builds"][w.strain][w.segment][w.reference]:
node_data.append("build/{strain}/{segment}/{reference}/clades-short.json".format(**w))
node_data.append(
"build/{strain}/{segment}/{reference}/clades-short.json".format(**w)
)

return node_data

@@ -439,8 +453,10 @@ rule export:
input:
tree=rules.refine.output.tree,
metadata=rules.parse.output.metadata,
node_data = get_node_data,
auspice_config=lambda w: config["files"]["auspice_config_shortclade"] if "clade_contractions" in config["builds"][w.strain][w.segment][w.reference] else config["files"]["auspice_config"],
node_data=get_node_data,
auspice_config=lambda w: config["files"]["auspice_config_shortclade"]
if "clade_contractions" in config["builds"][w.strain][w.segment][w.reference]
else config["files"]["auspice_config"],
output:
auspice_json="auspice/{strain}/{segment}/{reference}/auspice_raw.json",
params:
@@ -466,7 +482,7 @@ rule swap_strain_accession:
output:
auspice_json="auspice/{strain}/{segment}/{reference}/auspice.json",
params:
fake_clade = lambda w: '--add-fake-clade none' if w.segment != 'ha' else ''
fake_clade=lambda w: "--add-fake-clade none" if w.segment != "ha" else "",
shell:
"""
python3 scripts/swap_strain_accession.py \
@@ -526,10 +542,11 @@ rule assemble_folder:
cp {input.tree} {output.tree};
"""

if 'timestamp' not in config:

if "timestamp" not in config:
timestamp = datetime.datetime.utcnow().isoformat()[:-7] + "Z"
else:
timestamp = config['timestamp']
timestamp = config["timestamp"]


rule test_nextclade:
Expand Down Expand Up @@ -558,7 +575,6 @@ rule test_nextclade:
"""



rule clean:
shell:
"""
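Most of the flu/Snakefile churn is Black-style reformatting of the per-build config lookups in params; the pattern itself is unchanged. Each lambda receives the wildcards object and indexes config["builds"][strain][segment][reference], with .get() supplying a default for optional keys such as include_file. A small sketch of that lookup outside Snakemake, with a hypothetical config and a SimpleNamespace standing in for the wildcards object:

from types import SimpleNamespace

# hypothetical config in the nested shape the Snakefile expects
config = {
    "builds": {
        "h3n2": {
            "ha": {
                "someref": {
                    "filter": "--min-date 2015",
                    "reference_EPI_ISL": "EPI_ISL_00000",
                }
            }
        }
    }
}

# params lambdas in the Snakefile receive the wildcards object `w`
get_filter = lambda w: config["builds"][w.strain][w.segment][w.reference]["filter"]
# .get() keeps builds without an optional include_file working
get_include = lambda w: config["builds"][w.strain][w.segment][w.reference].get(
    "include_file", ""
)

w = SimpleNamespace(strain="h3n2", segment="ha", reference="someref")
print(get_filter(w))   # --min-date 2015
print(get_include(w))  # empty string: no include_file configured for this build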
6 changes: 4 additions & 2 deletions rsv/Snakefile
@@ -68,9 +68,11 @@ rule lift_clades_to_reference:
input:
clade_file="data/{a_or_b}/{reference}/clades_{clade_type}_raw.tsv",
reference="references/{a_or_b}/{reference}/reference.gbk",
orig_reference="data/{a_or_b}/{reference}/clade_reference.gbk",
orig_reference=lambda w: config["builds"][w.a_or_b][w.reference]["clades"][
w.clade_type
]["ref_path"],
output:
clade_file="data/{a_or_b}/{reference}/clades_{clade_type}.tsv",
clade_file="data/{a_or_b}/{reference}/clades_{clade_type,[A-Za-z0-9]+}.tsv",
shell:
"""
python3 ../common_scripts/clade_translator.py \
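Two changes in the rsv rule: orig_reference is now resolved per clade type from the ref_path entries added in builds.yaml, and the output gains an inline wildcard constraint. The constraint presumably keeps the output pattern from also matching the rule's own clades_{clade_type}_raw.tsv input, since the default wildcard regex is greedy and allows underscores. Roughly, in terms of the regexes Snakemake derives from the two filename patterns:

import re

# default wildcard regex (.+) versus the added constraint [A-Za-z0-9]+
default_pat = re.compile(r"clades_(?P<clade_type>.+)\.tsv")
constrained_pat = re.compile(r"clades_(?P<clade_type>[A-Za-z0-9]+)\.tsv")

print(default_pat.fullmatch("clades_genome_raw.tsv").group("clade_type"))  # genome_raw
print(constrained_pat.fullmatch("clades_genome_raw.tsv"))                  # None
print(constrained_pat.fullmatch("clades_G.tsv").group("clade_type"))       # G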
11 changes: 11 additions & 0 deletions rsv/profiles/auspice_config.json
@@ -26,6 +26,11 @@
"title": "G Clades (Goya et al)",
"type": "categorical"
},
{
"key": "pango",
"title": "Proposed lineages",
"type": "categorical"
},
{
"key": "date",
"title": "Sample Date",
@@ -57,6 +62,12 @@
"displayName": "G_clade (Goya et al)",
"description": "Clades based on the G gene and Goya et al, IRV, 2019.",
"hideInWeb": false
},
{
"name": "pango",
"displayName": "Proposed lineages",
"description": "Proposed lineages based on Pango",
"hideInWeb": false
}
]
}
18 changes: 17 additions & 1 deletion rsv/profiles/builds.yaml
@@ -4,7 +4,7 @@ auspice_config: "profiles/auspice_config.json"
color_schemes: "profiles/color_schemes.tsv"
exclude: "profiles/exclude.txt"

timestamp: "2023-02-03T12:00:00Z"
timestamp: "2023-05-10T12:00:00Z"

builds:
a:
@@ -17,10 +17,18 @@ builds:
key: clade_membership
label_key: clade_annotation
def: "references/a/EPI_ISL_412866/clades_genome.tsv"
ref_path: "data/a/EPI_ISL_412866/clade_reference.gbk"
G:
key: G_clade
label_key: G_clade_label
def: "references/a/EPI_ISL_412866/clades_G.tsv"
ref_path: "data/a/EPI_ISL_412866/clade_reference.gbk"
pango:
key: pango
label_key: pango_label
excel_path: "profiles/pango/amino-acid-genotypes.xlsx"
excel_sheet: "AA_RSVA"
ref_path: "profiles/pango/REFROOTA.gb"
b:
EPI_ISL_1653999:
filter: "--min-date 1965 --probabilistic-sampling --group-by year --subsample-max-sequences 1500 --query 'genome_coverage>0.95'"
@@ -31,10 +39,18 @@ builds:
key: clade_membership
label_key: clade_annotation
def: "references/b/EPI_ISL_1653999/clades_genome.tsv"
ref_path: "data/b/EPI_ISL_1653999/clade_reference.gbk"
G:
key: G_clade
label_key: G_clade_label
def: "references/b/EPI_ISL_1653999/clades_G.tsv"
ref_path: "data/b/EPI_ISL_1653999/clade_reference.gbk"
pango:
key: pango
label_key: pango_label
excel_path: "profiles/pango/amino-acid-genotypes.xlsx"
excel_sheet: "AA_RSVB"
ref_path: "profiles/pango/REFROOTB.gb"


unused_builds: