44from sequence_processing_pipeline .TRIntegrateJob import TRIntegrateJob
55from sequence_processing_pipeline .PipelineError import PipelineError
66from sequence_processing_pipeline .util import determine_orientation
7- from os .path import join , split
7+ from os .path import join , split , basename , dirname
88from re import match
99from os import makedirs , rename , walk
1010from metapool import load_sample_sheet
1111from metapool .sample_sheet import PROTOCOL_NAME_ILLUMINA , PROTOCOL_NAME_TELLSEQ
12+ import pandas as pd
13+ from glob import glob
14+ from qiita_client .util import system_call
1215
1316
1417PROTOCOL_NAME_NONE = "None"
@@ -22,6 +25,56 @@ class Protocol():
2225 initialization.
2326 """
2427 protocol_type = PROTOCOL_NAME_NONE
28+ # this value was selected by looking at all the successful NuQC/SPP jobs;
29+ # the maximum number of sequences among them was 712,497,596
30+ MAX_READS = 720000000
31+
def subsample_reads(self):
    """Subsample, in place, the FASTQ files of samples above MAX_READS.

    Does nothing when ``self.assay_type`` is ``'Amplicon'``. Otherwise the
    per-sample read counts are loaded from the CSV at ``self.reports_path``
    and every sample whose count is greater than ``self.MAX_READS`` has
    each of its matching ``*.fastq.gz`` files (found one directory level
    below ``self.raw_fastq_files_path``) renamed to ``*.full.gz`` and
    replaced by a ``seqtk sample`` subsample written under the original
    filename. One message per processed file is appended to
    ``self.assay_warnings``.

    Raises
    ------
    ValueError
        If the CSV has neither a ``raw_reads_r1r2`` nor a ``# Reads``
        column, or if the ``mv`` / ``seqtk`` command returns a non-zero
        status or writes anything to stderr.
    """
    # Imported locally because the module only imports `glob.glob` and
    # does not import `shlex` at all.
    from glob import escape as glob_escape
    from shlex import quote

    if self.assay_type == 'Amplicon':
        return

    df = pd.read_csv(self.reports_path)
    if 'raw_reads_r1r2' in df.columns:
        # this is a TellSeq run: SeqCounts.csv
        read_col = 'raw_reads_r1r2'
        index_col = 'Sample_ID'
    elif '# Reads' in df.columns:
        # this is an Illumina run: Demultiplex_Stats.csv
        read_col = '# Reads'
        index_col = 'SampleID'
    else:
        raise ValueError(
            'Not sure how to check for seq counts to subsample, '
            'please let an admin know.')

    suffix = 'fastq.gz'
    # Escape glob metacharacters so that a directory or sample name
    # containing '[', '?' or '*' is matched literally.
    root = glob_escape(str(self.raw_fastq_files_path))

    # keep only the rows/samples with more than self.MAX_READS reads;
    # iterating an empty frame is a no-op, so no explicit emptiness check
    over_limit = df[df[read_col] > self.MAX_READS]
    for _, row in over_limit.iterrows():
        sn = row[index_col]
        # look for any file (fwd/rev pairs) that has the sample name as
        # prefix of its filename.
        # NOTE(review): a pure prefix match also picks up files of any
        # other sample whose name starts with `sn` (e.g. 'S1' vs 'S10');
        # confirm the filename delimiter and tighten the pattern if so.
        pattern = f'{root}/*/{glob_escape(str(sn))}*.{suffix}'
        for f in glob(pattern):
            dn = dirname(f)
            bn = basename(f)
            # Swap only the trailing 'fastq.gz' (guaranteed by the glob
            # pattern) for 'full.gz'; str.replace would also rewrite any
            # earlier occurrence inside the filename.
            nbn = join(dn, bn[:-len(suffix)] + 'full.gz')

            # system_call receives a single command string, so every
            # path is shell-quoted to survive spaces/metacharacters.
            cmd = f'mv {quote(f)} {quote(nbn)}'
            _, se, rv = system_call(cmd)
            if rv != 0 or se:
                raise ValueError(f'Error during mv: {cmd}. {se}')

            # Fixed seed (-s 42): reproducible, and the same seed is used
            # for every file of the sample (presumably so that fwd/rev
            # mates stay paired, as seqtk recommends - TODO confirm).
            # NOTE(review): the exit status of a pipeline is normally
            # that of its last command (gzip), so a seqtk failure is only
            # caught here through its stderr output.
            cmd = (f'seqtk sample -s 42 {quote(nbn)} {self.MAX_READS} '
                   f'| gzip > {quote(f)}')
            _, se, rv = system_call(cmd)
            if rv != 0 or se:
                raise ValueError(f'Error during seqtk: {cmd}. {se}')

            self.assay_warnings.append(
                f'{sn} ({bn}) had {row[read_col]} sequences, '
                f'subsampling to {self.MAX_READS}')
2578
2679
2780class Illumina (Protocol ):
0 commit comments