Skip to content

Commit 7b3cf19

Browse files
committed
finished mutation file
now have to figure out what to do with missing identifiers
1 parent 97d1371 commit 7b3cf19

File tree

1 file changed

+58
-6
lines changed

1 file changed

+58
-6
lines changed

build/pancpdo/02a-getPancPDODataFromSynapse.py

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,50 @@ def parseCNVFile(fpath, sampid, genes):
3939
newdat = newdat[['improve_sample_id','entrez_id','copy_number','source','study']]
4040
newdat['copy_call'] = [get_copy_call(a) for a in newdat['copy_number']]
4141
return newdat
42-
43-
42+
43+
44+
# Lookup table from SnpEff effect strings (the SNPEFF_EFFECT column) to the
# variant_classification label written to the mutation output.
# Compound effects joined with '+' are listed explicitly.
mutmap = {
    "CODON_CHANGE_PLUS_CODON_DELETION": "In_Frame_Del",  # approximate mapping
    "CODON_CHANGE_PLUS_CODON_INSERTION": "In_Frame_Ins",  # approximate mapping
    "CODON_DELETION": "In_Frame_Del",
    "CODON_INSERTION": "In_Frame_Ins",
    "DOWNSTREAM": "3'Flank",
    "FRAME_SHIFT": "Frameshift_Variant",
    "FRAME_SHIFT+SPLICE_SITE_ACCEPTOR+SPLICE_SITE_REGION+INTRON": "Frameshift_Variant",
    "FRAME_SHIFT+SPLICE_SITE_REGION": "Frameshift_Variant",
    "INTERGENIC": "IGR",
    "INTRON": "Intron",
    "NON_SYNONYMOUS_CODING": "Missense_Mutation",
    "NON_SYNONYMOUS_CODING+SPLICE_SITE_REGION": "Missense_Mutation",
    "SPLICE_SITE_ACCEPTOR+INTRON": "Splice_Site",
    "SPLICE_SITE_DONOR+INTRON": "Splice_Site",
    "SPLICE_SITE_REGION+INTRON": "Splice_Site",
    "SPLICE_SITE_REGION+NON_CODING_EXON_VARIANT": "Splice_Site",
    "SPLICE_SITE_REGION+SYNONYMOUS_CODING": "Silent",
    "START_GAINED+UTR_5_PRIME": "Start_Codon_Ins",
    "STOP_GAINED": "Stop_Codon_Ins",
    "STOP_GAINED+CODON_CHANGE_PLUS_CODON_INSERTION": "Stop_Codon_Ins",
    "SYNONYMOUS_CODING": "Silent",
    "UPSTREAM": "5'Flank",
    "UTR_3_PRIME": "3'UTR",
    "UTR_5_PRIME": "5'UTR",
}
69+
4470
def parseMutFile(fpath, sampid, genes):
    '''
    Parse one SnpEff-annotated mutation table into the mutation output layout.

    move mutations to following headers:
    entrez_id, improve_sample_id, source, study, mutation, variant_classification

    Parameters
    ----------
    fpath : path (or buffer) of a tab-delimited file that contains the columns
        SNPEFF_GENE_NAME, SNPEFF_EFFECT and SNPEFF_CDS_CHANGE.
    sampid : value written to improve_sample_id on every returned row.
    genes : pandas.DataFrame with (at least) gene_symbol and entrez_id columns.

    Returns
    -------
    pandas.DataFrame with the six columns listed above, free of missing
    values and duplicate rows. Rows are dropped when the CDS change or gene
    symbol is missing, when SNPEFF_EFFECT has no entry in mutmap, or when
    the gene symbol is absent from genes.

    Raises
    ------
    KeyError if a required column is missing from the file or from genes.
    '''
    out_cols = ['improve_sample_id', 'entrez_id', 'source', 'study',
                'mutation', 'variant_classification']

    mutfile = pd.read_csv(fpath, sep='\t')[['SNPEFF_GENE_NAME', 'SNPEFF_EFFECT', 'SNPEFF_CDS_CHANGE']]
    # Rename by name rather than by position so a reordering can never swap columns.
    mutfile = mutfile.rename(columns={'SNPEFF_GENE_NAME': 'gene_symbol',
                                      'SNPEFF_CDS_CHANGE': 'mutation'})
    # A list-valued subset is accepted by every pandas version (a bare string is not).
    # Missing gene symbols are removed too: pandas.merge pairs NaN keys with
    # NaN keys, which would attach an arbitrary entrez_id to those rows.
    mutfile = mutfile.dropna(subset=['mutation', 'gene_symbol'])

    # Effects without an entry in mutmap are dropped below; report them so
    # the gap in the lookup table is visible instead of silent.
    unmapped = sorted({str(e) for e in mutfile['SNPEFF_EFFECT'].dropna()} - set(mutmap))
    if unmapped:
        print('Unmapped SNPEFF_EFFECT values dropped: ' + ', '.join(unmapped))

    # Direct dictionary lookup; unmapped effects become NaN and are removed
    # by the final dropna, matching the previous inner-join behaviour.
    mutfile = mutfile.assign(variant_classification=mutfile['SNPEFF_EFFECT'].map(mutmap))

    # Only the symbol -> entrez_id pairs are needed; de-duplicating first
    # keeps the join from multiplying rows when genes lists a symbol repeatedly.
    gene_ids = genes[['gene_symbol', 'entrez_id']].dropna().drop_duplicates()
    fullfile = pd.merge(mutfile, gene_ids, on='gene_symbol')

    fullfile = fullfile.assign(improve_sample_id=sampid,
                               source='TiriacEtAl',
                               study='pancpdo')
    return fullfile[out_cols].dropna().drop_duplicates()
4686

4787
def main():
4888
parser = argparse.ArgumentParser(description = 'Script that collects WES and CNV data from Synapse for Coderdata')
@@ -83,12 +123,24 @@ def main():
83123
newcnv.to_csv('/tmp/pancpdo_copy_number.csv.gz',compression='gzip',index=False)
84124

85125
if args.mutation:
86-
wes = sc.tableQuery('select * from syn64608378 where parentId==syn64608263').asDataFrame()
126+
wes = sc.tableQuery("select * from syn64608378 where parentId='syn64608263'").asDataFrame()
87127
alldats = []
88128
##go through and get every mutation file
89129
for index,row in wes.iterrows():
130+
sname = row['name'].split('--')[0]
90131
sid = row.id
91-
sname = row['name']
92-
132+
print(sid,sname)
133+
if sname in set(samps.other_id):
134+
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
135+
else:
136+
print('Missing sample id for '+sname)
137+
continue
138+
path = sc.get(sid).path
139+
sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
140+
res = parseMutFile(path,sampid, genes)
141+
alldats.append(res)
142+
newmut = pd.concat(alldats)
143+
newmut.to_csv("/tmp/pancpdo_mutations.csv.gz",compression='gzip',index=False)
144+
93145
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)