Skip to content

Conditional fastq file finder fix #86

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions qp_klp/Step.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def generate_commands(self):

self.write_commands_to_output_path()

def _get_fastq_files(self, out_dir, project):
def _get_postqc_fastq_files(self, out_dir, project):
af = None
sub_folders = ['amplicon', 'filtered_sequences', 'trimmed_sequences']
for sub_folder in sub_folders:
Expand All @@ -520,7 +520,7 @@ def _get_fastq_files(self, out_dir, project):
'raw_reverse_seqs': []}

for fastq_file in af:
if '_I1_' in fastq_file:
if '_I1_' in fastq_file or '_I2_' in fastq_file:
files['raw_barcodes'].append(fastq_file)
elif '_R1_' in fastq_file:
files['raw_forward_seqs'].append(fastq_file)
Expand Down Expand Up @@ -593,7 +593,7 @@ def load_preps_into_qiita(self, qclient):

data = []
for project, _, qiita_id in self.special_map:
fastq_files = self._get_fastq_files(
fastq_files = self._get_postqc_fastq_files(
self.pipeline.output_path, project)

for vals in self.touched_studies_prep_info[qiita_id]:
Expand Down
164 changes: 164 additions & 0 deletions qp_klp/tests/test_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,13 @@ def setUp(self):
self.qiita_id,
Step.METAGENOMIC_TYPE)

self.amplicon_pipeline = Pipeline(self.master_config_path,
self.good_run_id, None,
self.good_mapping_file_path,
self.output_file_path,
self.qiita_id,
Step.AMPLICON_TYPE)

self.fake_bin_path = self._get_searchable_path()

self.delete_these = []
Expand Down Expand Up @@ -419,6 +426,68 @@ def _create_test_input(self, stage):
with open(fake_path, 'w') as f:
f.write("This is a file")

if stage >= 5:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why 5?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 5 was so that it wouldn't interfere with legacy tests. Looking at it again there's something I want to tweak. I'll keep you posted.

for project in exp:
# currently unused for testing but pre-defined here in case
# the need arises.
# html_log_path = join(self.output_file_path, 'NuQCJob',
# project, 'fastp_reports_dir', 'html')
# json_log_path = join(self.output_file_path, 'NuQCJob',
# project, 'fastp_reports_dir', 'json')
trimmed_files_path = join(self.output_file_path, 'NuQCJob',
project, 'filtered_sequences')
empty_files_path = join(self.output_file_path, 'NuQCJob',
project, 'zero_files')
adapter_trimmed_files_path = join(self.output_file_path,
'NuQCJob',
'only-adapter-filtered',
project)

fake_paths = [trimmed_files_path, empty_files_path,
adapter_trimmed_files_path]

for fake_path in fake_paths:
makedirs(fake_path, exist_ok=True)

empty_files = {
'Feist_11661': [
'CDPH-SAL_Salmonella_Typhi_MDL-150',
'CDPH-SAL_Salmonella_Typhi_MDL-151'
],
'Gerwick_6123': ['8A', '9A', '10A'],
'NYU_BMS_Melanoma_13059': ['XX581451B02', 'XY256645B01',
'XZ112567B02'
]}

f_list = []
for sample in exp[project]:
f_list += [
join(trimmed_files_path,
f'{sample}_SXXX_L001_R1_001.trimmed.fastq.gz'),
join(trimmed_files_path,
f'{sample}_SXXX_L001_R2_001.trimmed.fastq.gz'),
join(trimmed_files_path,
f'{sample}_SXXX_L001_I1_001.trimmed.fastq.gz'),
join(trimmed_files_path,
f'{sample}_SXXX_L001_I2_001.trimmed.fastq.gz'),
join(adapter_trimmed_files_path,
f'{sample}_SXXX_L001_R1_001.fastq.gz'),
join(adapter_trimmed_files_path,
f'{sample}_SXXX_L001_R2_001.fastq.gz')
]

for sample in empty_files[project]:
f_list += [
join(trimmed_files_path,
f'{sample}_SXXX_L001_R1_001.trimmed.fastq.gz'),
join(trimmed_files_path,
f'{sample}_SXXX_L001_R2_001.trimmed.fastq.gz')
]

for file_path in f_list:
with open(file_path, 'w') as f:
f.write("This is a file.")


class BasicStepTests(BaseStepTests):
def test_creation(self):
Expand Down Expand Up @@ -899,6 +968,101 @@ def test_precheck(self):
with self.assertRaisesRegex(PipelineError, msg):
step.precheck(fake_client)

def test_conditional_fastqc_finder(self):
    """Check which post-QC fastq files are found per pipeline type.

    The stage-5 test input is created first, then
    ``_get_postqc_fastq_files`` is called for project 'Feist_11661'
    through two Step objects:

    * one built on ``self.pipeline_replicates`` (treated here as the
      metagenomic case), where only forward and reverse read files are
      expected and no index (I1/I2) files;
    * one built on ``self.amplicon_pipeline``, where the same read
      files are expected plus the I1/I2 files under 'raw_barcodes'.

    Every expected path lives under the project's 'filtered_sequences'
    directory; nothing from other projects, the only-adapter-filtered
    directory or the zero_files directory is expected.
    """
    self._create_test_input(5)

    project_name = 'Feist_11661'

    # Shape of every expected path once the output directory prefix has
    # been stripped, e.g.
    #   /NuQCJob/Feist_11661/filtered_sequences/
    #   CDPH-SAL_Salmonella_Typhi_MDL-143_SXXX_L001_R1_001.trimmed.fastq.gz
    path_template = ("/NuQCJob/Feist_11661/filtered_sequences/"
                     "CDPH-SAL_Salmonella_Typhi_MDL-{}"
                     "_SXXX_L001_{}_001.trimmed.fastq.gz")

    def expected_paths(sample_numbers, read_tags):
        # One path per (sample, read tag) combination.
        return [path_template.format(number, tag)
                for number in sample_numbers
                for tag in read_tags]

    def assert_found(pipeline, expected):
        # Run the finder for the given pipeline and compare both the
        # returned keys and, per key, the set of relative paths.
        step = Step(pipeline, self.qiita_id, None)
        found = step._get_postqc_fastq_files(self.output_file_path,
                                             project_name)
        self.assertEqual(set(found.keys()), set(expected))
        for file_type, paths in found.items():
            # remove base output_file_path from the results.
            stripped = [path.replace(self.output_file_path, '')
                        for path in paths]
            self.assertEqual(set(stripped), set(expected[file_type]))

    # R1/R2 files are expected for samples MDL-143..147 and for
    # MDL-150/151; index files are only expected for MDL-143..147.
    read_samples = (143, 144, 145, 146, 147, 150, 151)
    barcode_samples = (143, 144, 145, 146, 147)

    expected = {
        'raw_forward_seqs': expected_paths(read_samples, ('R1',)),
        'raw_reverse_seqs': expected_paths(read_samples, ('R2',)),
    }

    # For a metagenomic pipeline the result must hold exactly the two
    # read-file keys above: I1 and I2 files are not reported, so there
    # is no 'raw_barcodes' entry.
    assert_found(self.pipeline_replicates, expected)

    # Hack an amplicon pipeline. Reuse project-names, sample-names and
    # qiita-ids. Expected results are the same as for the metagenomic
    # pipeline, except that the index files are included.
    expected['raw_barcodes'] = expected_paths(barcode_samples,
                                              ('I1', 'I2'))
    assert_found(self.amplicon_pipeline, expected)


class ReplicateTests(BaseStepTests):
def setUp(self):
Expand Down