Skip to content

Commit

Permalink
fixes issue encountered during testing.
Browse files Browse the repository at this point in the history
  • Loading branch information
charles-cowart committed Dec 13, 2024
1 parent 2d66c98 commit 63faccb
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 3 deletions.
42 changes: 39 additions & 3 deletions qp_klp/Workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,40 @@ def get_samples_in_qiita(cls, qclient, qiita_id):

return (samples, tids)

@classmethod
def _determine_orientation(cls, file_name):
    """Return the read orientation encoded in a fastq file name.

    The orientation is one of 'R1', 'R2' (forward and reverse reads) or
    'I1', 'I2' (index reads). A token only counts when it is enclosed by
    a matching pair of delimiters: underscores ('_R1_') or dots ('.I2.').

    Users can and will embed these tokens elsewhere in their file names,
    e.g. ABC_7_04_1776_R1_SRE_S3_L007_R2_001.trimmed.fastq.gz, so when
    several tokens are present the right-most occurrence is taken to be
    the actual orientation.

    :param file_name: Name of the fastq file to inspect.
    :return: 'R1', 'R2', 'I1' or 'I2', or None if the name contains no
        delimited orientation token.
    """
    best_pos, best_orientation = -1, None

    for orientation in ('R1', 'R2', 'I1', 'I2'):
        # str.rfind() searches from the end of the string and returns the
        # position where the right-most occurrence begins, or -1 if the
        # token is absent.
        pos = max(file_name.rfind(f"_{orientation}_"),
                  file_name.rfind(f".{orientation}."))

        # two different tokens can never begin at the same position, so a
        # strict comparison is enough to keep the right-most one. When
        # nothing is found every pos is -1 and None is returned.
        if pos > best_pos:
            best_pos, best_orientation = pos, orientation

    return best_orientation

def _get_postqc_fastq_files(self, out_dir, project):
af = None
sub_folders = ['amplicon', 'filtered_sequences', 'trimmed_sequences']
Expand All @@ -599,11 +633,13 @@ def _get_postqc_fastq_files(self, out_dir, project):
'raw_reverse_seqs': []}

for fastq_file in af:
if '_I1_' in fastq_file or '_I2_' in fastq_file:
_, file_name = split(fastq_file)
orientation = self._determine_orientation(file_name)
if orientation in ['I1', 'I2']:
files['raw_barcodes'].append(fastq_file)
elif '_R1_' in fastq_file:
elif orientation == 'R1':
files['raw_forward_seqs'].append(fastq_file)
elif '_R2_' in fastq_file:
elif orientation == 'R2':
files['raw_reverse_seqs'].append(fastq_file)
else:
raise ValueError(f"Unrecognized file: {fastq_file}")
Expand Down
43 changes: 43 additions & 0 deletions qp_klp/tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from os import environ, remove, getcwd
import re
from qp_klp.WorkflowFactory import WorkflowFactory
from qp_klp.Workflows import Workflow
from metapool import load_sample_sheet
from collections import defaultdict
from random import randint
Expand Down Expand Up @@ -890,3 +891,45 @@ def open_job_script(script_path):
exp = open_job_script("qp_klp/tests/data/tellread_test.sbatch")

self.assertEqual(obs, exp)

def test_foo(self):
    # each entry pairs a file name with the orientation that
    # Workflow._determine_orientation() is expected to return for it.
    test_names = [
        # single additional occurrence: R1
        ("ABC_7_04_1776_R1_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_R1_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # test w/dots.
        ("ABC_7_04_1776.R1.SRE_S3_L007.R1.001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.R2.001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.I1.001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776.R1.SRE_S3_L007.I2.001.trimmed.fastq.gz", "I2"),

        # single additional occurrence: R2
        ("ABC_7_04_1776_R2_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_R2_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # single additional occurrence: In
        ("ABC_7_04_1776_I2_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_I1_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_I2_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_I1_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # no additional occurrences
        ("ABC_7_04_1776_SRE_S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_SRE_S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_SRE_S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_SRE_S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # two additional occurrences
        ("ABC_7_04_1776_I2_SRE.R1.S3_L007_R1_001.trimmed.fastq.gz", "R1"),
        ("ABC_7_04_1776_I1_SRE.R1.S3_L007_R2_001.trimmed.fastq.gz", "R2"),
        ("ABC_7_04_1776_I2_SRE.R1.S3_L007_I1_001.trimmed.fastq.gz", "I1"),
        ("ABC_7_04_1776_I1_SRE.R1.S3_L007_I2_001.trimmed.fastq.gz", "I2"),

        # no orientation present: None is expected.
        ("ABC_7_04_1776_SRE_S3_L007_001.trimmed.fastq.gz", None),
        # orientation text that is not enclosed by '_' or '.' on both
        # sides does not count.
        ("ABC_7_04_1776_SRER1_S3_L007_001.trimmed.fastq.gz", None),
    ]

    for file_name, exp in test_names:
        # subTest() reports which file name failed and lets the remaining
        # cases run instead of stopping at the first mismatch.
        with self.subTest(file_name=file_name):
            obs = Workflow._determine_orientation(file_name)
            self.assertEqual(obs, exp)

0 comments on commit 63faccb

Please sign in to comment.