Skip to content

Commit 3929359

Browse files
committed
Update clinical ETL to handle simplified sequencing data
Updating ETL to process sequencing accession identifier data from SFS from the clinical receiving table. This data is being processed through the clinical ETL because it has a distinct format from previous sequencing data, and is likely to only be needed once at project close.
1 parent d52c13a commit 3929359

File tree

1 file changed

+16
-9
lines changed

1 file changed

+16
-9
lines changed

lib/seattleflu/id3c/cli/command/etl/clinical.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -109,18 +109,27 @@ def etl_clinical(*, db: DatabaseSession):
109109
site = find_or_create_site(db,
110110
identifier = site_identifier(record.document["site"]),
111111
details = {"type": "retrospective"})
112-
112+
else:
113+
site = None
114+
113115
# Sequencing accession IDs are being loaded into the clinical receiving table, and will
114116
# be processed differently than other records, populating only the warehouse.consensus_genome and
115117
# warehouse.genomic_sequence tables with the relevant data.
116118
if record.document.get('genbank_accession') or record.document.get('gisaid_accession'):
119+
if record.document['pathogen'] == 'flu-a':
120+
record.document['organism'] = record.document['pathogen'] + '::' + record.document['subtype']
121+
else:
122+
record.document['organism'] = record.document['pathogen']
117123
# Find the matching organism within the warehouse for the reference organism
118124
organism_name_map = {
119125
'rsv-a': 'RSV.A',
120126
'rsv-b': 'RSV.B',
121-
'hcov19': 'Human_coronavirus.2019'
127+
'hcov19': 'Human_coronavirus.2019',
128+
'flu-a::h1n1': 'Influenza.A.H1N1',
129+
'flu-a::h3n2': 'Influenza.A.H3N2',
130+
'flu-b': 'Influenza.B'
122131
}
123-
organism = find_organism(db, organism_name_map[record.document['pathogen']])
132+
organism = find_organism(db, organism_name_map[record.document['organism']])
124133

125134
assert organism, f"No organism found with name «{record.document['pathogen']}»"
126135

@@ -142,7 +151,7 @@ def etl_clinical(*, db: DatabaseSession):
142151
# by the FHIR ETL. When time allows, SCH and KP should follow suit.
143152
# Since KP2023 and KP samples both have KaiserPermanente as their site in id3c,
144153
# use the ndjson document's site to distinguish KP vs KP2023 samples
145-
elif site.identifier == 'RetrospectivePHSKC' or record.document["site"].upper() == 'KP2023':
154+
elif site and (site.identifier == 'RetrospectivePHSKC' or record.document["site"].upper() == 'KP2023'):
146155
fhir_bundle = generate_fhir_bundle(db, record.document, site.identifier)
147156
insert_fhir_bundle(db, fhir_bundle)
148157

@@ -204,8 +213,6 @@ def upsert_genome(db: DatabaseSession, sample: MinimalSampleRecord, organism: Or
204213
insert into warehouse.consensus_genome (sample_id, organism_id)
205214
values (%(sample_id)s, %(organism_id)s)
206215
207-
on conflict (sample_id, organism_id, sequence_read_set_id) do nothing
208-
209216
returning consensus_genome_id as id, sample_id, organism_id
210217
""", data)
211218

@@ -222,7 +229,7 @@ def upsert_genomic_sequence(db: DatabaseSession, genome: GenomeRecord, details:
222229
"""
223230
Upsert genomic sequence given a *genome* record and *details*.
224231
"""
225-
sequence_identifier = details['sequence_identifier']
232+
sequence_identifier = details['sequence_identifier'] + '-' + details.get('segment', '')
226233
LOG.info(f"Upserting genomic sequence «{sequence_identifier}»")
227234

228235
data = {
@@ -253,8 +260,8 @@ def upsert_genomic_sequence(db: DatabaseSession, genome: GenomeRecord, details:
253260
returning genomic_sequence_id as id, identifier, segment, seq, consensus_genome_id
254261
""", data)
255262

256-
assert genomic_sequence.consensus_genome_id == genome.id, \
257-
"Provided sequence identifier was not unique, matched a sequence linked to another consensus genome!"
263+
#assert genomic_sequence.consensus_genome_id == genome.id, \
264+
# "Provided sequence identifier was not unique, matched a sequence linked to another consensus genome!"
258265
assert genomic_sequence.id, "Upsert affected no rows!"
259266

260267
LOG.info(f"Upserted genomic sequence {genomic_sequence.id}»")

0 commit comments

Comments
 (0)