Skip to content

Commit 2b7832e

Browse files
authored
add subsample_reads (#138)
* add subsample_reads * support the 2 different seq counts * index_col * self.assay_type == 'Amplicon' * pack warnings * so -> _ * self.warnings -> self.assay_warnings * "w" * addressing @AmandaBirmingham comments * self.warnings -> self.assay_warnings * flake 8
1 parent 07e2cdc commit 2b7832e

File tree

2 files changed

+63
-1
lines changed

2 files changed

+63
-1
lines changed

src/qp_klp/Assays.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class Assay():
3535
w/'_'.
3636
"""
3737
assay_type = ASSAY_NAME_NONE
38+
assay_warnings = []
3839

3940
@classmethod
4041
def _replace_tube_ids_w_sample_names(cls, prep_file_path, tube_id_map):
@@ -166,6 +167,7 @@ def execute_pipeline(self):
166167
self.convert_raw_to_fastq()
167168
self.integrate_results()
168169
self.generate_sequence_counts()
170+
self.subsample_reads()
169171

170172
self.update_status("QC-ing reads", 2, 9)
171173
if "NuQCJob" not in self.skip_steps:
@@ -260,6 +262,13 @@ def execute_pipeline(self):
260262
self.update_status("Generating packaging commands", 8, 9)
261263
self.generate_commands()
262264

265+
# store the warnings, if they exist so they are packed with the
266+
# final results
267+
if self.assay_warnings:
268+
wfp = f'{self.pipeline.output_path}/final_results/WARNINGS.txt'
269+
with open(wfp, 'w') as f:
270+
f.write('\n'.join(self.assay_warnings))
271+
263272
self.update_status("Packaging results", 9, 9)
264273
if self.update:
265274
self.execute_commands()

src/qp_klp/Protocol.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,14 @@
44
from sequence_processing_pipeline.TRIntegrateJob import TRIntegrateJob
55
from sequence_processing_pipeline.PipelineError import PipelineError
66
from sequence_processing_pipeline.util import determine_orientation
7-
from os.path import join, split
7+
from os.path import join, split, basename, dirname
88
from re import match
99
from os import makedirs, rename, walk
1010
from metapool import load_sample_sheet
1111
from metapool.sample_sheet import PROTOCOL_NAME_ILLUMINA, PROTOCOL_NAME_TELLSEQ
12+
import pandas as pd
13+
from glob import glob
14+
from qiita_client.util import system_call
1215

1316

1417
PROTOCOL_NAME_NONE = "None"
@@ -22,6 +25,56 @@ class Protocol():
2225
initialization.
2326
"""
2427
protocol_type = PROTOCOL_NAME_NONE
28+
# this value was selected by looking at all the successful NuQC/SPP jobs,
29+
# the max sequeces were: 712,497,596
30+
MAX_READS = 720000000
31+
32+
def subsample_reads(self):
33+
if self.assay_type == 'Amplicon':
34+
return
35+
36+
df = pd.read_csv(self.reports_path)
37+
if 'raw_reads_r1r2' in df.columns:
38+
# this is a TellSeq run: SeqCounts.csv
39+
read_col = 'raw_reads_r1r2'
40+
index_col = 'Sample_ID'
41+
elif '# Reads' in df.columns:
42+
# this is a Illumina: Demultiplex_Stats.csv
43+
read_col = '# Reads'
44+
index_col = 'SampleID'
45+
else:
46+
raise ValueError(
47+
'Not sure how to check for seq counts to subsample, '
48+
'please let an admin know.')
49+
# df will keep any rows/samples with more than the self.MAX_READS
50+
df = df[df[read_col] > self.MAX_READS]
51+
if df.shape[0]:
52+
for _, row in df.iterrows():
53+
sn = row[index_col]
54+
# look for any sample (fwd/rev pairs) that have the sample_name
55+
# as prefix of their filename
56+
files = glob(f'{self.raw_fastq_files_path}/*/{sn}*.fastq.gz')
57+
# for each file let's get their folder (dn) and filename (bn),
58+
# then create a fullpath with with dn and bn where we are
59+
# changing the filename from fastq.gz to full.gz; then
60+
# subsample this full.gz to a new file with the correct
61+
# fastq.gz via seqtk
62+
for f in files:
63+
dn = dirname(f)
64+
bn = basename(f)
65+
nbn = join(dn, bn.replace('fastq.gz', 'full.gz'))
66+
cmd = f'mv {f} {nbn}'
67+
_, se, rv = system_call(cmd)
68+
if rv != 0 or se:
69+
raise ValueError(f'Error during mv: {cmd}. {se}')
70+
cmd = (f'seqtk sample -s 42 {nbn} {self.MAX_READS} '
71+
f'| gzip > {f}')
72+
_, se, rv = system_call(cmd)
73+
if rv != 0 or se:
74+
raise ValueError(f'Error during seqtk: {cmd}. {se}')
75+
self.assay_warnings.append(
76+
f'{sn} ({bn}) had {row[read_col]} sequences, '
77+
f'subsampling to {self.MAX_READS}')
2578

2679

2780
class Illumina(Protocol):

0 commit comments

Comments
 (0)