1
- from itertools import chain
2
1
from collections import defaultdict
3
2
from json import dumps , load
4
3
from metapool import load_sample_sheet
5
4
from os import makedirs , walk , listdir
6
- from os .path import join , exists , split , basename , dirname
5
+ from os .path import join , exists , split , basename , dirname , abspath
7
6
from sequence_processing_pipeline .ConvertJob import ConvertJob
8
7
from sequence_processing_pipeline .FastQCJob import FastQCJob
9
8
from sequence_processing_pipeline .GenPrepFileJob import GenPrepFileJob
@@ -380,11 +379,6 @@ def _generate_prep_file(self, config, input_file_path, seqpro_path):
380
379
381
380
gpf_job .run (callback = self .update_callback )
382
381
383
- # concatenate the lists of paths across all study_ids into a single
384
- # list. Replace sample-names w/tube-ids in all relevant prep-files.
385
- preps = list (chain .from_iterable (gpf_job .prep_file_paths .values ()))
386
- self ._overwrite_prep_files (preps )
387
-
388
382
return gpf_job
389
383
390
384
def _helper_process_fastp_report_dirs (self ):
@@ -727,6 +721,7 @@ def _get_tube_ids_from_qiita(self, qclient):
727
721
728
722
# use empty dict {} as an indication that get_tube_ids_from_qiita was
729
723
# called but no tube-ids were found for any project.
724
+ # to clarify, self.tube_id_map maps sample-names to tube-ids.
730
725
self .tube_id_map = tids_by_qiita_id
731
726
# should samples_in_qiita be None if tube_id_map is not None?
732
727
self .samples_in_qiita = sample_names_by_qiita_id
@@ -860,7 +855,10 @@ def _process_tube_ids(self, project_name, qiita_id, samples):
860
855
# return None otherwise
861
856
862
857
@classmethod
863
- def _replace_with_tube_ids (cls , prep_file_path , tube_id_map ):
858
+ def _replace_tube_ids_w_sample_names (cls , prep_file_path , tube_id_map ):
859
+ # reversed_map maps tube-ids to sample-names
860
+ reversed_map = {tube_id_map [k ]: k for k in tube_id_map }
861
+
864
862
# passing tube_id_map as a parameter allows for easier testing.
865
863
df = pd .read_csv (prep_file_path , sep = '\t ' , dtype = str , index_col = False )
866
864
# save copy of sample_name column as 'old_sample_name'
@@ -874,16 +872,13 @@ def _replace_with_tube_ids(cls, prep_file_path, tube_id_map):
874
872
# remove leading zeroes if they exist to match Qiita results.
875
873
sample_name = sample_name .lstrip ('0' )
876
874
877
- reversed_map = {tube_id_map [k ]: k for k in tube_id_map }
878
875
if sample_name in reversed_map :
879
876
df .at [i , "sample_name" ] = reversed_map [sample_name ]
880
877
881
878
df .to_csv (prep_file_path , index = False , sep = "\t " )
882
879
883
880
def _overwrite_prep_files (self , prep_file_paths ):
884
- # replaces sample-names in prep-files with tube-ids according to
885
- # a dict with project-names as keys and another dict as a value.
886
- # this dict uses sample-names as keys and tube-ids as values.
881
+ # replace tube-ids in prep-info files w/sample-names.
887
882
if self .tube_id_map is None :
888
883
raise ValueError ("get_tube_ids_from_qiita() was not called" )
889
884
@@ -905,12 +900,10 @@ def _overwrite_prep_files(self, prep_file_paths):
905
900
if len (matching_files ) == 0 :
906
901
continue
907
902
908
- if len (matching_files ) > 1 :
909
- raise ValueError ("More than one match found for project "
910
- f"'{ fqp_name } ': { str (matching_files )} " )
911
-
912
- Step ._replace_with_tube_ids (matching_files [0 ],
913
- self .tube_id_map [qiita_id ])
903
+ for matching_file in matching_files :
904
+ Step ._replace_tube_ids_w_sample_names (matching_file ,
905
+ self .tube_id_map [
906
+ qiita_id ])
914
907
915
908
def update_blanks_in_qiita (self , qclient ):
916
909
for sif_path in self .sifs :
@@ -1010,6 +1003,39 @@ def execute_pipeline(self, qclient, increment_status, update=True,
1010
1003
if "GenPrepFileJob" not in skip_steps :
1011
1004
self .generate_prep_file ()
1012
1005
1006
+ # the final component of GenPrepFileJob was moved outside of the object.
1007
+ # obtain the paths to the prep-files generated by GenPrepFileJob
1008
+ # w/out having to recover the full state.
1009
+ tmp = join (self .pipeline .output_path , 'GenPrepFileJob' , 'PrepFiles' )
1010
+
1011
+ self .has_replicates = False
1012
+
1013
+ prep_paths = []
1014
+ self .prep_file_paths = {}
1015
+
1016
+ for root , dirs , files in walk (tmp ):
1017
+ for _file in files :
1018
+ # breakup the prep-info-file into segments
1019
+ # (run-id, project_qid, other) and cleave
1020
+ # the qiita-id from the project_name.
1021
+ qid = _file .split ('.' )[1 ].split ('_' )[- 1 ]
1022
+
1023
+ if qid not in self .prep_file_paths :
1024
+ self .prep_file_paths [qid ] = []
1025
+
1026
+ _path = abspath (join (root , _file ))
1027
+ if _path .endswith ('.tsv' ):
1028
+ prep_paths .append (_path )
1029
+ self .prep_file_paths [qid ].append (_path )
1030
+
1031
+ for _dir in dirs :
1032
+ if _dir == '1' :
1033
+ # if PrepFiles contains the '1' directory, then it's a
1034
+ # given that this sample-sheet contains replicates.
1035
+ self .has_replicates = True
1036
+
1037
+ self ._overwrite_prep_files (prep_paths )
1038
+
1013
1039
# for now, simply re-run any line below as if it was a new job, even
1014
1040
# for a restart. functionality is idempotent, except for the
1015
1041
# registration of new preps in Qiita. These will simply be removed
0 commit comments