24 changes: 16 additions & 8 deletions common_scripts/clade_translator.py
@@ -1,8 +1,9 @@
#!/bin/env python3
import os
import copy
from Bio import SeqIO, AlignIO, SeqRecord, Seq
import os

import numpy as np
from Bio import AlignIO, Seq, SeqIO, SeqRecord


def parse_args():
@@ -51,23 +52,30 @@ def get_coordinate_map(ref, qry):

coord_maps = {'nuc': get_coordinate_map(orig_ref.seq, new_ref.seq)}
for f in orig_features:
coord_maps[f] = get_coordinate_map(orig_features[f].extract(orig_ref).translate().seq,
new_features[f].extract(new_ref).translate().seq)
try:
coord_maps[f] = get_coordinate_map(orig_features[f].extract(orig_ref).translate().seq,
new_features[f].extract(new_ref).translate().seq)
except:
print(f"Could not map {f}")


with open(args.clades) as f:
clades = [l.strip().split('\t') for l in f]

with open(args.output_clades, 'w') as f:
f.write("clade\tgene\tsite\talt\n")
for clade in clades:
if (len(clade) < 4) or clades[0][0]=='#':
f.write('\t'.join(clade) + '\n')
continue

if clade[1] in coord_maps:
new_pos = max(0,coord_maps[clade[1]][0][int(clade[2])-1])+1
f.write('\t'.join([clade[0], clade[1], str(new_pos),clade[3]]) + '\n')

else:
try:
new_pos = max(0,coord_maps[clade[1]][0][int(clade[2])-1])+1
f.write('\t'.join([clade[0], clade[1], str(new_pos),clade[3]]) + '\n')
except:
print(f"Could not map {clade}")

if clade[1] == 'clade':
f.write('\t'.join(clade) + '\n')
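The net effect in clade_translator.py: a gene whose translation cannot be mapped between the two references, or a clade row whose site falls outside the coordinate map, is now reported and skipped instead of aborting the whole run. A minimal standalone sketch of the per-row translation step, assuming (as in the script) that coord_maps maps a gene name to a sequence whose first element is an array of new 0-based positions indexed by old 0-based position, with -1 marking unmapped sites; the toy map and row values below are hypothetical:

import numpy as np

def translate_clade_row(clade_row, coord_maps):
    """Remap one (clade, gene, site, alt) row to the new reference's coordinates,
    returning None if the gene or site cannot be mapped."""
    name, gene, site, alt = clade_row
    try:
        # old 1-based site -> old 0-based index -> new 0-based index;
        # unmapped sites are -1, which max(0, ...) clamps to position 1
        new_pos = max(0, coord_maps[gene][0][int(site) - 1]) + 1
        return [name, gene, str(new_pos), alt]
    except (KeyError, IndexError, ValueError):
        print(f"Could not map {clade_row}")
        return None

# hypothetical toy map: HA1 positions 1..5 shift by +2 in the new reference
toy_maps = {"HA1": (np.arange(2, 7),)}
print(translate_clade_row(["3C.2a1b", "HA1", "2", "K"], toy_maps))   # ['3C.2a1b', 'HA1', '4', 'K']
print(translate_clade_row(["3C.2a1b", "PB2", "10", "V"], toy_maps))  # warns, returns None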

82 changes: 49 additions & 33 deletions flu/Snakefile
@@ -9,7 +9,6 @@ wildcard_constraints:
reference="[^_/]+",



rule all:
input:
[
@@ -19,23 +18,23 @@ rule all:
for reference in config["builds"][strain][segment]
],


rule fetch_data:
output:
raw_sequences="data/{strain}/raw_{segment}.fasta"
raw_sequences="data/{strain}/raw_{segment}.fasta",
shell:
"""
scp -r [email protected]:/scicore/home/neher/neher/nextstrain/seasonal-flu/data/{wildcards.strain}/raw_{wildcards.segment}.fasta {output.raw_sequences}
"""


rule download_clades:
message:
"Downloading clade definitions for {wildcards.strain} from {params.source} -> {output}"
output:
"data/clades_{strain}_{segment}_{reference}_raw.tsv",
params:
source=lambda w: config["builds"][w.strain][w.segment][w.reference][
"clade_url"
],
source=lambda w: config["builds"][w.strain][w.segment][w.reference]["clade_url"],
shell:
"curl {params.source} | sed '/V1A\\tHA1\\t146\\tI/d' >{output}"

@@ -58,6 +57,7 @@ rule offset_clades:
>{output}
"""


rule download_includes:
output:
"data/includes_{strain}_{segment}_{reference}.tsv",
@@ -99,8 +99,10 @@ rule parse:


def genes(w):
if w.segment=='ha': return ["SigPep", "HA1", "HA2"]
if w.segment=='na': return ["NA"]
if w.segment == "ha":
return ["SigPep", "HA1", "HA2"]
if w.segment == "na":
return ["NA"]


rule subsample:
@@ -112,13 +114,15 @@ rule subsample:
sampled_sequences="build/{strain}/{segment}/{reference}/subsample_raw.fasta",
sampled_strains="build/{strain}/{segment}/{reference}/subsample_raw.txt",
params:
filter_arguments=lambda w: config["builds"][w.strain][w.segment][
w.reference
]["filter"],
reference_EPI_ISL=lambda w: config["builds"][w.strain][w.segment][
w.reference
]["reference_EPI_ISL"],
other_include = lambda w:config["builds"][w.strain][w.segment][w.reference].get("include_file","")
filter_arguments=lambda w: config["builds"][w.strain][w.segment][w.reference][
"filter"
],
reference_EPI_ISL=lambda w: config["builds"][w.strain][w.segment][w.reference][
"reference_EPI_ISL"
],
other_include=lambda w: config["builds"][w.strain][w.segment][w.reference].get(
"include_file", ""
),
shell:
"""
augur filter \
@@ -225,15 +229,16 @@ rule tree:
> /dev/null
"""


# root using dates in treetime, use 1500 as sequence length (good enough, doesn't matter)
rule root:
input:
tree=rules.tree.output.tree,
metadata = rules.parse.output.metadata,
metadata=rules.parse.output.metadata,
output:
tree="build/{strain}/{segment}/{reference}/tree_rooted.nwk",
params:
outdir = "build/{strain}/{segment}/{reference}/tt_out"
outdir="build/{strain}/{segment}/{reference}/tt_out",
shell:
"""
treetime clock \
@@ -245,6 +250,7 @@ rule root:
cp {params.outdir}/rerooted.newick {output.tree}
"""


# refine while keeping the root
rule refine:
input:
@@ -344,21 +350,26 @@ rule clades:
> /dev/null
"""


## TODO explicitly relabel clade branch labels to differentiate long and short ones
# currently long ones are overwritten by short ones.
rule make_short_clades:
input:
clades=rules.offset_clades.output,
output:
clades = "data/clades-short_{strain}_{segment}_{reference}.tsv"
clades="data/clades-short_{strain}_{segment}_{reference}.tsv",
run:
with open(str(input.clades)) as fh:
clades = fh.readlines()

for contraction in config["builds"][wildcards.strain][wildcards.segment][wildcards.reference].get("clade_contractions", []):
clades = [x.replace(contraction['orig'], contraction['short']) for x in clades]
for contraction in config["builds"][wildcards.strain][wildcards.segment][
wildcards.reference
].get("clade_contractions", []):
clades = [
x.replace(contraction["orig"], contraction["short"]) for x in clades
]

with open(str(output.clades), 'w') as fh:
with open(str(output.clades), "w") as fh:
for line in clades:
fh.write(line)

@@ -370,7 +381,7 @@ rule clades_short:
tree=rules.refine.output.tree,
aa_muts=rules.aa_muts_explicit.output.node_data,
nuc_muts=rules.ancestral.output.node_data,
clades = "data/clades-short_{strain}_{segment}_{reference}.tsv"
clades="data/clades-short_{strain}_{segment}_{reference}.tsv",
output:
node_data="build/{strain}/{segment}/{reference}/clades-short.json",
shell:
@@ -383,26 +394,27 @@ rule clades_short:
sed -i 's/clade_membership/short_clade/' {output.node_data}
"""


# make sure all differences between the alignment reference and the root are attached as mutations to the root
rule attach_root_mutations:
input:
aa_muts=rules.aa_muts_explicit.output.node_data,
nuc_muts=rules.ancestral.output.node_data,
translations = rules.align.output.alignment,
tree = rules.refine.output.tree
translations=rules.align.output.alignment,
tree=rules.refine.output.tree,
output:
aa_muts="build/{strain}/{segment}/{reference}/aa_muts_adapted.json",
nuc_muts="build/{strain}/{segment}/{reference}/nuc_muts_adapted.json"
nuc_muts="build/{strain}/{segment}/{reference}/nuc_muts_adapted.json",
params:
genes = genes,
genes=genes,
translations=lambda w: expand(
"build/{strain}/{segment}/{reference}/aligned.gene.{genes}.fasta",
strain=w.strain,
segment=w.segment,
genes=genes(w),
reference=w.reference,
),
reference = lambda w: w.reference
reference=lambda w: w.reference,
shell:
"""
python3 ../common_scripts/attach_root_mutations.py \
@@ -428,7 +440,9 @@ def get_node_data(w):
node_data.append("build/{strain}/{segment}/{reference}/clades.json".format(**w))

if "clade_contractions" in config["builds"][w.strain][w.segment][w.reference]:
node_data.append("build/{strain}/{segment}/{reference}/clades-short.json".format(**w))
node_data.append(
"build/{strain}/{segment}/{reference}/clades-short.json".format(**w)
)

return node_data

@@ -439,8 +453,10 @@ rule export:
input:
tree=rules.refine.output.tree,
metadata=rules.parse.output.metadata,
node_data = get_node_data,
auspice_config=lambda w: config["files"]["auspice_config_shortclade"] if "clade_contractions" in config["builds"][w.strain][w.segment][w.reference] else config["files"]["auspice_config"],
node_data=get_node_data,
auspice_config=lambda w: config["files"]["auspice_config_shortclade"]
if "clade_contractions" in config["builds"][w.strain][w.segment][w.reference]
else config["files"]["auspice_config"],
output:
auspice_json="auspice/{strain}/{segment}/{reference}/auspice_raw.json",
params:
@@ -466,7 +482,7 @@ rule swap_strain_accession:
output:
auspice_json="auspice/{strain}/{segment}/{reference}/auspice.json",
params:
fake_clade = lambda w: '--add-fake-clade none' if w.segment != 'ha' else ''
fake_clade=lambda w: "--add-fake-clade none" if w.segment != "ha" else "",
shell:
"""
python3 scripts/swap_strain_accession.py \
@@ -526,10 +542,11 @@ rule assemble_folder:
cp {input.tree} {output.tree};
"""

if 'timestamp' not in config:

if "timestamp" not in config:
timestamp = datetime.datetime.utcnow().isoformat()[:-7] + "Z"
else:
timestamp = config['timestamp']
timestamp = config["timestamp"]


rule test_nextclade:
Expand Down Expand Up @@ -558,7 +575,6 @@ rule test_nextclade:
"""



rule clean:
shell:
"""
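Most of the flu/Snakefile churn is Black-style reformatting of the per-build config lookups in params; the pattern itself is unchanged. Each lambda receives the wildcards object and indexes config["builds"][strain][segment][reference], with .get() supplying a default for optional keys such as include_file. A small sketch of that lookup outside Snakemake, with a hypothetical config and a SimpleNamespace standing in for the wildcards object:

from types import SimpleNamespace

# hypothetical config in the nested shape the Snakefile expects
config = {
    "builds": {
        "h3n2": {
            "ha": {
                "someref": {
                    "filter": "--min-date 2015",
                    "reference_EPI_ISL": "EPI_ISL_00000",
                }
            }
        }
    }
}

# params lambdas in the Snakefile receive the wildcards object `w`
get_filter = lambda w: config["builds"][w.strain][w.segment][w.reference]["filter"]
# .get() keeps builds without an optional include_file working
get_include = lambda w: config["builds"][w.strain][w.segment][w.reference].get(
    "include_file", ""
)

w = SimpleNamespace(strain="h3n2", segment="ha", reference="someref")
print(get_filter(w))   # --min-date 2015
print(get_include(w))  # empty string: no include_file configured for this build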
6 changes: 4 additions & 2 deletions rsv/Snakefile
@@ -68,9 +68,11 @@ rule lift_clades_to_reference:
input:
clade_file="data/{a_or_b}/{reference}/clades_{clade_type}_raw.tsv",
reference="references/{a_or_b}/{reference}/reference.gbk",
orig_reference="data/{a_or_b}/{reference}/clade_reference.gbk",
orig_reference=lambda w: config["builds"][w.a_or_b][w.reference]["clades"][
w.clade_type
]["ref_path"],
output:
clade_file="data/{a_or_b}/{reference}/clades_{clade_type}.tsv",
clade_file="data/{a_or_b}/{reference}/clades_{clade_type,[A-Za-z0-9]+}.tsv",
shell:
"""
python3 ../common_scripts/clade_translator.py \
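Two changes in the rsv rule: orig_reference is now resolved per clade type from the ref_path entries added in builds.yaml, and the output gains an inline wildcard constraint. The constraint presumably keeps the output pattern from also matching the rule's own clades_{clade_type}_raw.tsv input, since the default wildcard regex is greedy and allows underscores. Roughly, in terms of the regexes Snakemake derives from the two filename patterns:

import re

# default wildcard regex (.+) versus the added constraint [A-Za-z0-9]+
default_pat = re.compile(r"clades_(?P<clade_type>.+)\.tsv")
constrained_pat = re.compile(r"clades_(?P<clade_type>[A-Za-z0-9]+)\.tsv")

print(default_pat.fullmatch("clades_genome_raw.tsv").group("clade_type"))  # genome_raw
print(constrained_pat.fullmatch("clades_genome_raw.tsv"))                  # None
print(constrained_pat.fullmatch("clades_G.tsv").group("clade_type"))       # G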
11 changes: 11 additions & 0 deletions rsv/profiles/auspice_config.json
@@ -26,6 +26,11 @@
"title": "G Clades (Goya et al)",
"type": "categorical"
},
{
"key": "pango",
"title": "Proposed lineages",
"type": "categorical"
},
{
"key": "date",
"title": "Sample Date",
@@ -57,6 +62,12 @@
"displayName": "G_clade (Goya et al)",
"description": "Clades based on the G gene and Goya et al, IRV, 2019.",
"hideInWeb": false
},
{
"name": "pango",
"displayName": "Proposed lineages",
"description": "Proposed lineages based on Pango",
"hideInWeb": false
}
]
}
18 changes: 17 additions & 1 deletion rsv/profiles/builds.yaml
@@ -4,7 +4,7 @@ auspice_config: "profiles/auspice_config.json"
color_schemes: "profiles/color_schemes.tsv"
exclude: "profiles/exclude.txt"

timestamp: "2023-02-03T12:00:00Z"
timestamp: "2023-05-10T12:00:00Z"

builds:
a:
@@ -17,10 +17,18 @@ builds:
key: clade_membership
label_key: clade_annotation
def: "references/a/EPI_ISL_412866/clades_genome.tsv"
ref_path: "data/a/EPI_ISL_412866/clade_reference.gbk"
G:
key: G_clade
label_key: G_clade_label
def: "references/a/EPI_ISL_412866/clades_G.tsv"
ref_path: "data/a/EPI_ISL_412866/clade_reference.gbk"
pango:
key: pango
label_key: pango_label
excel_path: "profiles/pango/amino-acid-genotypes.xlsx"
excel_sheet: "AA_RSVA"
ref_path: "profiles/pango/REFROOTA.gb"
b:
EPI_ISL_1653999:
filter: "--min-date 1965 --probabilistic-sampling --group-by year --subsample-max-sequences 1500 --query 'genome_coverage>0.95'"
@@ -31,10 +39,18 @@ builds:
key: clade_membership
label_key: clade_annotation
def: "references/b/EPI_ISL_1653999/clades_genome.tsv"
ref_path: "data/b/EPI_ISL_1653999/clade_reference.gbk"
G:
key: G_clade
label_key: G_clade_label
def: "references/b/EPI_ISL_1653999/clades_G.tsv"
ref_path: "data/b/EPI_ISL_1653999/clade_reference.gbk"
pango:
key: pango
label_key: pango_label
excel_path: "profiles/pango/amino-acid-genotypes.xlsx"
excel_sheet: "AA_RSVB"
ref_path: "profiles/pango/REFROOTB.gb"


unused_builds: