Skip to content

Commit 99e627b

Browse files
Updates based on initial runs in production. (#89)
* Updates based on initial runs in production. * Updates from production. Tests updated.
1 parent 4f2f4d5 commit 99e627b

File tree

6 files changed

+73
-47
lines changed

6 files changed

+73
-47
lines changed

qp_klp/Step.py

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
from itertools import chain
21
from collections import defaultdict
32
from json import dumps, load
43
from metapool import load_sample_sheet
54
from os import makedirs, walk, listdir
6-
from os.path import join, exists, split, basename, dirname
5+
from os.path import join, exists, split, basename, dirname, abspath
76
from sequence_processing_pipeline.ConvertJob import ConvertJob
87
from sequence_processing_pipeline.FastQCJob import FastQCJob
98
from sequence_processing_pipeline.GenPrepFileJob import GenPrepFileJob
@@ -380,11 +379,6 @@ def _generate_prep_file(self, config, input_file_path, seqpro_path):
380379

381380
gpf_job.run(callback=self.update_callback)
382381

383-
# concatenate the lists of paths across all study_ids into a single
384-
# list. Replace sample-names w/tube-ids in all relevant prep-files.
385-
preps = list(chain.from_iterable(gpf_job.prep_file_paths.values()))
386-
self._overwrite_prep_files(preps)
387-
388382
return gpf_job
389383

390384
def _helper_process_fastp_report_dirs(self):
@@ -727,6 +721,7 @@ def _get_tube_ids_from_qiita(self, qclient):
727721

728722
# use empty dict {} as an indication that get_tube_ids_from_qiita was
729723
# called but no tube-ids were found for any project.
724+
# to clarify, self.tube_id_map maps sample-names to tube-ids.
730725
self.tube_id_map = tids_by_qiita_id
731726
# should samples_in_qiita be none if tube_id_map is not?
732727
self.samples_in_qiita = sample_names_by_qiita_id
@@ -860,7 +855,10 @@ def _process_tube_ids(self, project_name, qiita_id, samples):
860855
# return None otherwise
861856

862857
@classmethod
863-
def _replace_with_tube_ids(cls, prep_file_path, tube_id_map):
858+
def _replace_tube_ids_w_sample_names(cls, prep_file_path, tube_id_map):
859+
# reversed_map maps tube-ids to sample-names
860+
reversed_map = {tube_id_map[k]: k for k in tube_id_map}
861+
864862
# passing tube_id_map as a parameter allows for easier testing.
865863
df = pd.read_csv(prep_file_path, sep='\t', dtype=str, index_col=False)
866864
# save copy of sample_name column as 'old_sample_name'
@@ -874,16 +872,13 @@ def _replace_with_tube_ids(cls, prep_file_path, tube_id_map):
874872
# remove leading zeroes if they exist to match Qiita results.
875873
sample_name = sample_name.lstrip('0')
876874

877-
reversed_map = {tube_id_map[k]: k for k in tube_id_map}
878875
if sample_name in reversed_map:
879876
df.at[i, "sample_name"] = reversed_map[sample_name]
880877

881878
df.to_csv(prep_file_path, index=False, sep="\t")
882879

883880
def _overwrite_prep_files(self, prep_file_paths):
884-
# replaces sample-names in prep-files with tube-ids according to
885-
# a dict with project-names as keys and another dict as a value.
886-
# this dict uses sample-names as keys and tube-ids as values.
881+
# replace tube-ids in prep-info files w/sample-names.
887882
if self.tube_id_map is None:
888883
raise ValueError("get_tube_ids_from_qiita() was not called")
889884

@@ -905,12 +900,10 @@ def _overwrite_prep_files(self, prep_file_paths):
905900
if len(matching_files) == 0:
906901
continue
907902

908-
if len(matching_files) > 1:
909-
raise ValueError("More than one match found for project "
910-
f"'{fqp_name}': {str(matching_files)}")
911-
912-
Step._replace_with_tube_ids(matching_files[0],
913-
self.tube_id_map[qiita_id])
903+
for matching_file in matching_files:
904+
Step._replace_tube_ids_w_sample_names(matching_file,
905+
self.tube_id_map[
906+
qiita_id])
914907

915908
def update_blanks_in_qiita(self, qclient):
916909
for sif_path in self.sifs:
@@ -1010,6 +1003,39 @@ def execute_pipeline(self, qclient, increment_status, update=True,
10101003
if "GenPrepFileJob" not in skip_steps:
10111004
self.generate_prep_file()
10121005

1006+
# moved final component of genprepfilejob outside of object.
1007+
# obtain the paths to the prep-files generated by GenPrepFileJob
1008+
# w/out having to recover full state.
1009+
tmp = join(self.pipeline.output_path, 'GenPrepFileJob', 'PrepFiles')
1010+
1011+
self.has_replicates = False
1012+
1013+
prep_paths = []
1014+
self.prep_file_paths = {}
1015+
1016+
for root, dirs, files in walk(tmp):
1017+
for _file in files:
1018+
# breakup the prep-info-file into segments
1019+
# (run-id, project_qid, other) and cleave
1020+
# the qiita-id from the project_name.
1021+
qid = _file.split('.')[1].split('_')[-1]
1022+
1023+
if qid not in self.prep_file_paths:
1024+
self.prep_file_paths[qid] = []
1025+
1026+
_path = abspath(join(root, _file))
1027+
if _path.endswith('.tsv'):
1028+
prep_paths.append(_path)
1029+
self.prep_file_paths[qid].append(_path)
1030+
1031+
for _dir in dirs:
1032+
if _dir == '1':
1033+
# if PrepFiles contains the '1' directory, then it's a
1034+
# given that this sample-sheet contains replicates.
1035+
self.has_replicates = True
1036+
1037+
self._overwrite_prep_files(prep_paths)
1038+
10131039
# for now, simply re-run any line below as if it was a new job, even
10141040
# for a restart. functionality is idempotent, except for the
10151041
# registration of new preps in Qiita. These will simply be removed

qp_klp/klp.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,8 @@ def sequence_processing_pipeline(qclient, job_id, parameters, out_dir):
180180
if exists(join(out_dir, 'GenPrepFileJob')):
181181
skip_steps.append('FastQCJob')
182182

183+
# it doesn't matter if cmds.log is a valid cmds.log or just
184+
# an empty file. The cmds.log will get overwritten downstream.
183185
if exists(join(out_dir, 'cmds.log')):
184186
skip_steps.append('GenPrepFileJob')
185187

qp_klp/tests/data/configuration_profiles/iseq_metagenomic.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,4 @@
9292
}
9393
}
9494
}
95-
}
95+
}

qp_klp/tests/data/configuration_profiles/novaseq_amplicon.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,4 @@
6060
}
6161
}
6262
}
63-
}
63+
}

qp_klp/tests/data/configuration_profiles/novaseq_metatranscriptomic.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,4 +92,4 @@
9292
}
9393
}
9494
}
95-
}
95+
}

qp_klp/tests/data/process_all_fastq_files.sh

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
### as well as sbatch -c. demux threads remains fixed at 1.
1010
### Note -c set to 4 and thread counts set to 7 during testing.
1111
#SBATCH -c 2
12-
#SBATCH --gres=node_jobs:2
12+
### Commented out for now, but there is a possibility it will be needed
13+
### in the future.
14+
###SBATCH --gres=node_jobs:2
1315

1416

1517
echo "---------------"
@@ -53,8 +55,8 @@ export TMPDIR=${TMPDIR}
5355
export TMPDIR=$(mktemp -d)
5456
echo $TMPDIR
5557

56-
mkdir -p ${WKDIR}NuQCJob/fastp_reports_dir/html
57-
mkdir -p ${WKDIR}NuQCJob/fastp_reports_dir/json
58+
mkdir -p ${WKDIR}/fastp_reports_dir/html
59+
mkdir -p ${WKDIR}/fastp_reports_dir/json
5860

5961
export ADAPTER_ONLY_OUTPUT=${OUTPUT}/only-adapter-filtered
6062
mkdir -p ${ADAPTER_ONLY_OUTPUT}
@@ -74,9 +76,8 @@ function mux-runner () {
7476

7577
jobd=${TMPDIR}
7678
id_map=${jobd}/id_map
77-
seqs_r1=${jobd}/seqs.r1.fastq.gz
78-
seqs_r2=${jobd}/seqs.r2.fastq
79-
r1_filt=${jobd}/seqs.r1.adapter-removed.fastq.gz
79+
seqs_reads=${jobd}/seqs.interleaved.fastq
80+
seq_reads_filter_alignment=${jobd}/seqs.interleaved.filter_alignment.fastq
8081

8182
for i in $(seq 1 ${n})
8283
do
@@ -86,14 +87,19 @@ function mux-runner () {
8687
base=$(echo ${line} | cut -f 3 -d" ")
8788
r1_name=$(basename ${r1} .fastq.gz)
8889
r2_name=$(basename ${r2} .fastq.gz)
89-
r1_adapter_only=${ADAPTER_ONLY_OUTPUT}/${r1_name}.fastq.gz
90+
r_adapter_only=${ADAPTER_ONLY_OUTPUT}/${r1_name}.interleave.fastq.gz
9091

9192
s_name=$(basename "${r1}" | sed -r 's/\.fastq\.gz//')
9293
html_name=$(echo "$s_name.html")
9394
json_name=$(echo "$s_name.json")
9495

9596
echo -e "${i}\t${r1_name}\t${r2_name}\t${base}" >> ${id_map}
9697

98+
# movi, in the current version, works on the interleaved version of the
99+
# fwd/rev reads so we are gonna take advantage fastp default output
100+
# to minimize steps. Additionally, movi expects the input to not be
101+
# gz, so we are not going to compress seqs_r1
102+
97103
fastp \
98104
-l 100 \
99105
-i ${r1} \
@@ -102,47 +108,39 @@ function mux-runner () {
102108
--adapter_fasta fastp_known_adapters_formatted.fna \
103109
--html REMOVED/qp-knight-lab-processing/qp_klp/tests/data/output_dir/NuQCJob/fastp_reports_dir/html/${html_name} \
104110
--json REMOVED/qp-knight-lab-processing/qp_klp/tests/data/output_dir/NuQCJob/fastp_reports_dir/json/${json_name} \
105-
--stdout | gzip > ${r1_filt}
111+
--stdout | gzip > ${r_adapter_only}
106112

107113
# multiplex and write adapter filtered data all at once
108-
zcat ${r1_filt} | \
114+
zcat ${r_adapter_only} | \
109115
sed -r "1~4s/^@(.*)/@${i}${delimiter}\1/" \
110-
>> ${seqs_r1}
111-
cat ${r1_filt} | \
112-
gzip -c > ${r1_adapter_only} &
113-
wait
114-
115-
rm ${r1_filt} &
116-
wait
116+
>> ${seqs_reads}
117117
done
118118

119119
# minimap/samtools pair commands are now generated in NuQCJob._generate_mmi_filter_cmds()
120-
# and passed to this template. This method assumes ${jobd} is the correct location to
121-
# filter files, the initial file is "${jobd}/seqs.r1.fastq"), and the output name is
122-
# "${jobd}/seqs.r1.ALIGN.fastq".
123-
minimap2 -2 -ax sr -t 1 /databases/minimap2/db_1.mmi ${jobd}/seqs.r1.fastq -a | samtools fastq -@ 1 -f 12 -F 256 > ${jobd}/foo
120+
# and passed to this template.
121+
minimap2 -2 -ax sr -t 1 /databases/minimap2/db_1.mmi ${jobd}/seqs.interleaved.fastq -a | samtools fastq -@ 1 -f 12 -F 256 > ${jobd}/foo
124122
minimap2 -2 -ax sr -t 1 /databases/minimap2/db_2.mmi ${jobd}/foo -a | samtools fastq -@ 1 -f 12 -F 256 > ${jobd}/bar
125-
mv ${jobd}/bar ${jobd}/seqs.r1.ALIGN.fastq
123+
mv ${jobd}/bar ${jobd}/seqs.interleaved.filter_alignment.fastq
126124
[ -e ${jobd}/foo ] && rm ${jobd}/foo
127125
[ -e ${jobd}/bar ] && rm ${jobd}/bar
128126

129127
/home/user/user_dir/Movi/build/movi-default query \
130128
--index /scratch/movi_hg38_chm13_hprc94 \
131-
--read <(zcat ${jobd}/seqs.r1.ALIGN.fastq.gz) \
129+
--read ${seq_reads_filter_alignment} \
132130
--stdout | gzip > ${jobd}/seqs.movi.txt.gz
133131

134132
python /home/user/user_dir/human_host_filtration/scripts/qiita_filter_pmls.py <(zcat ${jobd}/seqs.movi.txt.gz) | \
135-
seqtk subseq ${jobd}/seqs.r1.ALIGN.fastq.gz - | gzip > ${jobd}/seqs.r1.final.fastq.gz
133+
seqtk subseq ${seq_reads_filter_alignment} - > ${jobd}/seqs.final.fastq
136134

137-
REMOVED/sequence_processing_pipeline/scripts/splitter ${jobd}/seqs.r1.final.fastq \
135+
REMOVED/sequence_processing_pipeline/scripts/splitter ${jobd}/seqs.final.fastq \
138136
${jobd}/reads.r1.fastq ${delimiter} ${r1_tag} &
139-
REMOVED/sequence_processing_pipeline/scripts/splitter ${jobd}/seqs.r1.final.fastq \
137+
REMOVED/sequence_processing_pipeline/scripts/splitter ${jobd}/seqs.final.fastq \
140138
${jobd}/reads.r2.fastq ${delimiter} ${r2_tag} &
141139
wait
142140
fastq_pair -t 50000000 ${jobd}/reads.r1.fastq ${jobd}/reads.r2.fastq
143141

144142
# keep seqs.movi.txt and migrate it to NuQCJob directory.
145-
mv ${jobd}/seqs.movi.txt.gz REMOVED/qp-knight-lab-processing/qp_klp/tests/data/output_dir/NuQCJob/seqs.movi.${SLURM_ARRAY_TASK_ID}.txt.gz
143+
mv ${jobd}/seqs.movi.txt.gz REMOVED/qp-knight-lab-processing/qp_klp/tests/data/output_dir/NuQCJob/logs/seqs.movi.${SLURM_ARRAY_TASK_ID}.txt.gz
146144
}
147145
export -f mux-runner
148146

0 commit comments

Comments (0)