Skip to content

Commit 97d1371

Browse files
committed
added basic copy number alteration data from synapse
1 parent dcb23fa commit 97d1371

File tree

2 files changed

+100
-0
lines changed

2 files changed

+100
-0
lines changed

build/pancpdo/02-getPancPDOData.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,12 @@ def main():
683683
final_data = align_to_schema(combined_data,args.type,7500,args.samples)
684684
gc.collect()
685685

686+
##what if we shrink samples to only include the values that have transcriptional data
687+
#this fails
688+
#newsamps = pd.read_csv(args.samples)
689+
#newsamps = newsamps[newsamps.improve_sample_id.isin(final_data.improve_sample_id)]
690+
#newsamps.to_csv(args.samples)
691+
686692
combined_data = None
687693

688694
print(f"final data:\n{final_data}")
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import pandas as pd
2+
import synapseclient
3+
import argparse
4+
import math
5+
6+
7+
def get_copy_call(a):
    """
    Helper Function - Determine copy call for a value.

    Parameters
    ----------
    a : float or None
        Copy number value for one gene. It is compared against the
        thresholds below exactly as given; no transform is applied.

    Returns
    -------
    str or float
        'deep del', 'het loss', 'diploid', 'gain' or 'amp', or NaN when
        `a` is None or NaN.
    """
    if a is None or math.isnan(a):
        return float('nan')

    # NOTE(review): an earlier revision applied math.log2(float(a)+0.000001)
    # before thresholding and used 0.5210507 as the first cut-off. The value
    # is now used untransformed with a 0.0 cut-off -- confirm the scale of
    # the incoming data matches these thresholds.
    upper_bounds = (
        (0.0, 'deep del'),
        (0.7311832, 'het loss'),
        (1.214125, 'diploid'),
        (1.731183, 'gain'),
    )
    for upper, call in upper_bounds:
        if a < upper:
            return call
    return 'amp'
31+
32+
def parseCNVFile(fpath, sampid, genes):
    """
    Read one headerless, tab-separated copy number file and return it as a
    long-format table for a single sample.

    Parameters
    ----------
    fpath : path to the copy number file (five columns, no header row).
    sampid : value written to the `improve_sample_id` column of every row.
    genes : DataFrame merged with the file on their shared columns; it must
        provide `entrez_id`.
        # presumably the shared column is `gene_symbol` -- verify against the genes file

    Returns
    -------
    DataFrame with columns improve_sample_id, entrez_id, copy_number,
    source, study and copy_call.
    """
    raw = pd.read_csv(fpath, sep='\t', header=None)
    raw.columns = ['gene_symbol', 'copy_number', 'Region', 'Type', 'Pos']
    raw['improve_sample_id'] = sampid

    # Map gene symbols to entrez ids, keeping one row per sample/gene/value.
    merged = pd.merge(raw, genes)
    table = merged[['improve_sample_id', 'entrez_id', 'copy_number']].drop_duplicates()

    table['study'] = 'pancpdo'
    table['source'] = 'TiriacEtal'
    table = table[['improve_sample_id', 'entrez_id', 'copy_number', 'source', 'study']]

    # Categorical call derived from each copy number value.
    table['copy_call'] = list(map(get_copy_call, table['copy_number']))
    return table
42+
43+
44+
def parseMutFile(fpath, sampid, genes):
    """
    Placeholder for mutation-file parsing.

    Reads the tab-separated file at `fpath` and discards the result.
    `sampid` and `genes` are accepted but not used yet, and nothing is
    returned.
    """
    pd.read_csv(fpath, sep='\t')
    return None
46+
47+
def main():
    """
    Command-line entry point: collect pancpdo copy number (and, eventually,
    mutation) files from Synapse.

    Arguments (see the parser below): a samples file and a genes file are
    required; `--copy` and `--mutation` select which data to fetch and
    `--token` is the Synapse auth token.

    With `--copy`, every file listed in the Synapse view is matched to a
    sample through the `other_id` column of the samples file, parsed with
    `parseCNVFile`, and the combined table is written to
    /tmp/pancpdo_copy_number.csv.gz.

    Raises
    ------
    ValueError
        From `pd.concat` when `--copy` is given but no file matched a sample.
    """
    parser = argparse.ArgumentParser(description = 'Script that collects WES and CNV data from Synapse for Coderdata')
    parser.add_argument('-s', '--samples', help='Path to sample file',default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default = None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    if args.samples is None or args.genes is None:
        print('We need at least a genes and samples file to continue')
        # NOTE(review): this exits with status 0 even though nothing was
        # produced -- confirm whether callers expect a non-zero status here.
        exit()
    samps = pd.read_csv(args.samples)
    genes = pd.read_csv(args.genes)

    # The token must be passed by keyword: the first positional parameter of
    # login() is the account email, not the auth token.
    sc = synapseclient.login(authToken=args.token)

    if args.copy:
        ##query synapse view for files
        cnvs = sc.tableQuery("select * from syn64608378 where parentId='syn64608163'").asDataFrame()
        # Build the lookup set once instead of on every iteration.
        known_ids = set(samps.other_id)
        alldats = []
        ##go through table and get every file
        for _, row in cnvs.iterrows():
            sid = row.id
            sname = row['name'].split('--')[0]
            print(sid,sname)
            # Check the sample first so files for unknown samples are not
            # downloaded at all.
            if sname not in known_ids:
                print('Missing sample id for '+sname)
                continue
            sampid = samps.loc[samps.other_id==sname]['improve_sample_id'].values[0]
            path = sc.get(sid).path
            alldats.append(parseCNVFile(path, sampid, genes))
        newcnv = pd.concat(alldats)
        newcnv.to_csv('/tmp/pancpdo_copy_number.csv.gz',compression='gzip',index=False)

    if args.mutation:
        # Same query form as the copy number branch: single '=' and a quoted
        # parent id.
        wes = sc.tableQuery("select * from syn64608378 where parentId='syn64608263'").asDataFrame()
        alldats = []
        ##go through and get every mutation file
        # TODO: mutation parsing is not implemented yet; the loop only walks
        # the file listing and produces no output.
        for _, row in wes.iterrows():
            sid = row.id
            sname = row['name']
93+
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)