Skip to content

Commit babe8ae

Browse files
fixes issue encountered during testing. (#96)
* fixes issue encountered during testing. * Updates based on testing * Paste integrated count change
1 parent 2d66c98 commit babe8ae

File tree

5 files changed

+115
-32
lines changed

5 files changed

+115
-32
lines changed

qp_klp/Protocol.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,10 @@ def convert_raw_to_fastq(self):
118118
if 'TellReadJob' not in self.skip_steps:
119119
job.run(callback=self.job_callback)
120120

121-
self.pipeline.get_sample_ids()
122-
failed_samples = []
121+
# audit the results to determine which samples failed to convert
122+
# properly. Append these to the failed-samples report and also
123+
# return the list directly to the caller.
124+
failed_samples = job.audit()
123125
if hasattr(self, 'fsr'):
124126
# NB 16S does not require a failed samples report and
125127
# it is not performed by SPP.
@@ -130,32 +132,35 @@ def convert_raw_to_fastq(self):
130132
def generate_sequence_counts(self):
131133
config = self.pipeline.get_software_configuration('tell-seq')
132134

135+
files_to_count_path = join(self.pipeline.output_path,
136+
'files_to_count.txt')
137+
138+
with open(files_to_count_path, 'w') as f:
139+
for root, _, files in walk(self.raw_fastq_files_path):
140+
for _file in files:
141+
if self._determine_orientation(_file) in ['R1', 'R2']:
142+
print(join(root, _file), file=f)
143+
133144
job = SeqCountsJob(self.pipeline.run_dir,
134145
self.pipeline.output_path,
135-
self.pipeline.input_file_path,
136146
config['queue'],
137147
config['nodes'],
138148
config['wallclock_time_in_minutes'],
139149
config['normcount_mem_limit'],
140150
config['modules_to_load'],
141151
self.master_qiita_job_id,
142-
'',
143-
config['integrate_script_path'],
144-
self.pipeline.qiita_job_id)
152+
config['job_max_array_length'],
153+
files_to_count_path,
154+
self.pipeline.get_sample_sheet_path(),
155+
cores_per_task=config['tellread_cores'])
145156

146157
if 'SeqCountsJob' not in self.skip_steps:
147158
job.run(callback=self.job_callback)
148159

149-
# audit the results to determine which samples failed to convert
150-
# properly. Append these to the failed-samples report and also
151-
# return the list directly to the caller.
152-
failed_samples = job.audit_me(self.pipeline.get_sample_ids())
153-
if hasattr(self, 'fsr'):
154-
# NB 16S does not require a failed samples report and
155-
# it is not performed by SPP.
156-
self.fsr.write(failed_samples, job.__class__.__name__)
157-
158-
return failed_samples
160+
# Do not add an entry to fsr because w/respect to counting, either
161+
# all jobs are going to fail or none are going to fail. It's not
162+
# likely that we're going to fail to count sequences for only some
163+
# of the samples.
159164

160165
def integrate_results(self):
161166
config = self.pipeline.get_software_configuration('tell-seq')
@@ -173,7 +178,6 @@ def integrate_results(self):
173178
config['integrate_mem_limit'],
174179
config['modules_to_load'],
175180
self.master_qiita_job_id,
176-
"foo",
177181
config['integrate_script_path'],
178182
# NB: sample_index_list used may vary
179183
# from project to project in the future.
@@ -224,7 +228,7 @@ def integrate_results(self):
224228
# audit the results to determine which samples failed to convert
225229
# properly. Append these to the failed-samples report and also
226230
# return the list directly to the caller.
227-
failed_samples = job.audit_me(self.pipeline.get_sample_ids())
231+
failed_samples = job.audit()
228232

229233
if hasattr(self, 'fsr'):
230234
# NB 16S does not require a failed samples report and

qp_klp/TellseqMetagenomicWorkflow.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,10 @@ def execute_pipeline(self):
101101
# This means fsr reports will be accurate even on restarts.
102102
self.convert_raw_to_fastq()
103103

104-
self.generate_sequence_counts()
105-
106104
self.integrate_results()
107105

106+
self.generate_sequence_counts()
107+
108108
self.update_status("Performing quality control", 2, 9)
109109
self.quality_control()
110110

qp_klp/Workflows.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,40 @@ def get_samples_in_qiita(cls, qclient, qiita_id):
584584

585585
return (samples, tids)
586586

587+
@classmethod
def _determine_orientation(cls, file_name):
    """Return the read orientation encoded in a fastq file name.

    The orientation is one of 'R1', 'R2', 'I1' or 'I2' (aka forward,
    reverse, and indexed reads). A token is only recognized when it is
    enclosed by a matching pair of delimiters: underscores ('_R1_') or
    dots ('.I2.').

    Users can and will include any or all of the four tokens as part of
    their sample names as well, e.g.:
        ABC_7_04_1776_R1_SRE_S3_L007_R2_001.trimmed.fastq.gz
    so the right-most occurrence in the name is taken to be the actual
    orientation of the file.

    :param file_name: The name of the file to inspect.
    :return: 'R1', 'R2', 'I1' or 'I2', or None if no delimited
        orientation token is present in file_name.
    """
    orientations = ('R1', 'R2', 'I1', 'I2')

    # str.rfind() searches from the end of the string and returns the
    # position where the last occurrence of the substring begins, or -1
    # if it is absent. Pairing each position with its orientation lets
    # max() select the token that begins at the right-most position.
    pos, orientation = max(
        (file_name.rfind(f"{delim}{o}{delim}"), o)
        for o in orientations
        for delim in ('_', '.'))

    # a position of -1 means none of the delimited tokens were found.
    return None if pos == -1 else orientation
620+
587621
def _get_postqc_fastq_files(self, out_dir, project):
588622
af = None
589623
sub_folders = ['amplicon', 'filtered_sequences', 'trimmed_sequences']
@@ -599,11 +633,13 @@ def _get_postqc_fastq_files(self, out_dir, project):
599633
'raw_reverse_seqs': []}
600634

601635
for fastq_file in af:
602-
if '_I1_' in fastq_file or '_I2_' in fastq_file:
636+
_, file_name = split(fastq_file)
637+
orientation = self._determine_orientation(file_name)
638+
if orientation in ['I1', 'I2']:
603639
files['raw_barcodes'].append(fastq_file)
604-
elif '_R1_' in fastq_file:
640+
elif orientation == 'R1':
605641
files['raw_forward_seqs'].append(fastq_file)
606-
elif '_R2_' in fastq_file:
642+
elif orientation == 'R2':
607643
files['raw_reverse_seqs'].append(fastq_file)
608644
else:
609645
raise ValueError(f"Unrecognized file: {fastq_file}")

qp_klp/klp.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,14 @@ def sequence_processing_pipeline(qclient, job_id, parameters, out_dir):
106106
user_input_file = parameters.pop('sample_sheet')
107107
lane_number = parameters.pop('lane_number')
108108

109+
if {'body', 'content_type', 'filename'} != set(user_input_file):
110+
return False, None, ("This doesn't appear to be a valid sample "
111+
"sheet or mapping file; please review.")
112+
uif_path = out_path(user_input_file['filename'].replace(' ', '_'))
113+
# save raw data to file
114+
with open(uif_path, 'w') as f:
115+
f.write(user_input_file['body'])
116+
109117
# the run_identifier must be saved because it is not always preserved
110118
# in a dependable location downstream. The user input file must be
111119
# saved because it is always a unique name and it cannot be guaranteed
@@ -114,15 +122,7 @@ def sequence_processing_pipeline(qclient, job_id, parameters, out_dir):
114122
# the user_input file on the first run.
115123
restart_file_path = out_path('restart_me')
116124
with open(restart_file_path, 'w') as f:
117-
f.write(f"{run_identifier}\n{user_input_file}")
118-
119-
if {'body', 'content_type', 'filename'} != set(user_input_file):
120-
return False, None, ("This doesn't appear to be a valid sample "
121-
"sheet or mapping file; please review.")
122-
uif_path = out_path(user_input_file['filename'].replace(' ', '_'))
123-
# save raw data to file
124-
with open(uif_path, 'w') as f:
125-
f.write(user_input_file['body'])
125+
f.write(f"{run_identifier}\n{uif_path}")
126126

127127
final_results_path = out_path('final_results')
128128
makedirs(final_results_path, exist_ok=True)

qp_klp/tests/test_workflows.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from os import environ, remove, getcwd
1313
import re
1414
from qp_klp.WorkflowFactory import WorkflowFactory
15+
from qp_klp.Workflows import Workflow
1516
from metapool import load_sample_sheet
1617
from collections import defaultdict
1718
from random import randint
@@ -890,3 +891,45 @@ def open_job_script(script_path):
890891
exp = open_job_script("qp_klp/tests/data/tellread_test.sbatch")
891892

892893
self.assertEqual(obs, exp)
894+
895+
def test_foo(self):
    """Verify Workflow._determine_orientation() on tricky file names.

    Each case pairs a fastq file name with the orientation expected from
    it. Many names deliberately embed extra orientation-like tokens
    earlier in the name; the right-most delimited token must win.
    """
    test_names = [
        # single additional occurrence: R1
        ("ABC_7_04_1776_R1_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # test w/dots.
        ("ABC_7_04_1776.R1.SRE_S3_L007.R1.001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.R2.001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.I1.001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.I2.001.trimmed.fastq.gz", "I2"),

        # single additional occurrence: R2
        ("ABC_7_04_1776_R2_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # single additional occurrence: In
        ("ABC_7_04_1776_I2_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_I1_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_I2_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_I1_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # no additional occurrences
        ("ABC_7_04_1776_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # two additional occurrences
        ("ABC_7_04_1776_I2_SRE.R1.S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_I1_SRE.R1.S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_I2_SRE.R1.S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_I1_SRE.R1.S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # no orientation present at all: None is expected.
        ("ABC_7_04_1776_SRE_S3_L007_001.trimmed.fastq.gz", None),
    ]

    for file_name, exp in test_names:
        # subTest reports every failing file name individually instead
        # of stopping at the first mismatch.
        with self.subTest(file_name=file_name):
            obs = Workflow._determine_orientation(file_name)
            self.assertEqual(obs, exp)

0 commit comments

Comments
 (0)