@@ -39,10 +39,50 @@ def parseCNVFile(fpath, sampid, genes):
39
39
newdat = newdat [['improve_sample_id' ,'entrez_id' ,'copy_number' ,'source' ,'study' ]]
40
40
newdat ['copy_call' ] = [get_copy_call (a ) for a in newdat ['copy_number' ]]
41
41
return newdat
42
-
43
-
42
+
43
+
44
# Lookup table translating SnpEff effect annotations (the SNPEFF_EFFECT
# column of the mutation files) into the variant_classification labels that
# parseMutFile writes out. Compound effects (joined with '+') are listed
# explicitly; an effect that is absent from this table has no classification.
mutmap = {
    # The two CODON_CHANGE_PLUS_* entries are only approximate matches.
    'CODON_CHANGE_PLUS_CODON_DELETION': 'In_Frame_Del',
    'CODON_CHANGE_PLUS_CODON_INSERTION': 'In_Frame_Ins',
    'CODON_DELETION': 'In_Frame_Del',
    'CODON_INSERTION': 'In_Frame_Ins',
    'DOWNSTREAM': "3'Flank",
    # Frameshifts, alone or combined with splice-site annotations.
    'FRAME_SHIFT': 'Frameshift_Variant',
    'FRAME_SHIFT+SPLICE_SITE_ACCEPTOR+SPLICE_SITE_REGION+INTRON': 'Frameshift_Variant',
    'FRAME_SHIFT+SPLICE_SITE_REGION': 'Frameshift_Variant',
    'INTERGENIC': 'IGR',
    'INTRON': 'Intron',
    'NON_SYNONYMOUS_CODING': 'Missense_Mutation',
    'NON_SYNONYMOUS_CODING+SPLICE_SITE_REGION': 'Missense_Mutation',
    # Splice-site effects.
    'SPLICE_SITE_ACCEPTOR+INTRON': 'Splice_Site',
    'SPLICE_SITE_DONOR+INTRON': 'Splice_Site',
    'SPLICE_SITE_REGION+INTRON': 'Splice_Site',
    'SPLICE_SITE_REGION+NON_CODING_EXON_VARIANT': 'Splice_Site',
    'SPLICE_SITE_REGION+SYNONYMOUS_CODING': 'Silent',
    # Gained start / stop codons.
    'START_GAINED+UTR_5_PRIME': 'Start_Codon_Ins',
    'STOP_GAINED': 'Stop_Codon_Ins',
    'STOP_GAINED+CODON_CHANGE_PLUS_CODON_INSERTION': 'Stop_Codon_Ins',
    'SYNONYMOUS_CODING': 'Silent',
    # Flanking and untranslated regions.
    'UPSTREAM': "5'Flank",
    'UTR_3_PRIME': "3'UTR",
    'UTR_5_PRIME': "5'UTR",
}
69
+
44
70
def parseMutFile(fpath, sampid, genes):
    '''
    Reshape one SnpEff-annotated mutation table into the columns:
    improve_sample_id, entrez_id, source, study, mutation, variant_classification

    fpath: path to a tab-delimited file containing the columns
        SNPEFF_GENE_NAME, SNPEFF_EFFECT and SNPEFF_CDS_CHANGE.
    sampid: value written to improve_sample_id on every output row.
    genes: DataFrame with (at least) gene_symbol and entrez_id columns, used
        to translate gene symbols into entrez ids.

    Returns a DataFrame holding only complete, de-duplicated rows. Rows are
    dropped when the CDS change is missing, when the SnpEff effect has no
    entry in mutmap, or when the gene symbol is not found in genes.
    '''
    mutfile = pd.read_csv(fpath, sep='\t')[['SNPEFF_GENE_NAME', 'SNPEFF_EFFECT', 'SNPEFF_CDS_CHANGE']]
    # A list-valued subset works on every pandas version (a bare string does not).
    mutfile = mutfile.dropna(subset=['SNPEFF_CDS_CHANGE'])
    mutfile = mutfile.rename(columns={'SNPEFF_GENE_NAME': 'gene_symbol',
                                      'SNPEFF_CDS_CHANGE': 'mutation'})

    # Translate the SnpEff effect through the module-level mutmap; effects
    # without an entry become NaN and are discarded, which matches an inner
    # join against the mapping table.
    mutfile = mutfile.assign(variant_classification=mutfile['SNPEFF_EFFECT'].map(mutmap))
    mutfile = mutfile.dropna(subset=['variant_classification'])

    # Join on the gene symbol explicitly, against one row per
    # (gene_symbol, entrez_id) pair, so that extra columns or repeated rows in
    # `genes` neither influence the join keys nor multiply the result.
    gene_ids = genes[['gene_symbol', 'entrez_id']].drop_duplicates()
    fullfile = mutfile.merge(gene_ids, on='gene_symbol', how='inner')

    fullfile['improve_sample_id'] = sampid
    fullfile['source'] = 'TiriacEtAl'
    fullfile['study'] = 'pancpdo'
    fullfile = fullfile[['improve_sample_id', 'entrez_id', 'source', 'study', 'mutation', 'variant_classification']]
    return fullfile.dropna().drop_duplicates()
46
86
47
87
def main ():
48
88
parser = argparse .ArgumentParser (description = 'Script that collects WES and CNV data from Synapse for Coderdata' )
@@ -83,12 +123,24 @@ def main():
83
123
newcnv .to_csv ('/tmp/pancpdo_copy_number.csv.gz' ,compression = 'gzip' ,index = False )
84
124
85
125
if args .mutation :
86
- wes = sc .tableQuery (' select * from syn64608378 where parentId== syn64608263' ).asDataFrame ()
126
+ wes = sc .tableQuery (" select * from syn64608378 where parentId=' syn64608263'" ).asDataFrame ()
87
127
alldats = []
88
128
##go through and get every mutation file
89
129
for index ,row in wes .iterrows ():
130
+ sname = row ['name' ].split ('--' )[0 ]
90
131
sid = row .id
91
- sname = row ['name' ]
92
-
132
+ print (sid ,sname )
133
+ if sname in set (samps .other_id ):
134
+ sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
135
+ else :
136
+ print ('Missing sample id for ' + sname )
137
+ continue
138
+ path = sc .get (sid ).path
139
+ sampid = samps .loc [samps .other_id == sname ]['improve_sample_id' ].values [0 ]
140
+ res = parseMutFile (path ,sampid , genes )
141
+ alldats .append (res )
142
+ newmut = pd .concat (alldats )
143
+ newmut .to_csv ("/tmp/pancpdo_mutations.csv.gz" ,compression = 'gzip' ,index = False )
144
+
93
145
if __name__ == '__main__' :
94
146
main ()
0 commit comments