Skip to content

Commit c39a21a

Browse files
authored
Merge pull request #302 from PNNL-CompBio/panc-pdo-check
updated panc pdo files for proper DMSO control and added in genomic data
2 parents 5675103 + cffa309 commit c39a21a

File tree

8 files changed

+232
-23
lines changed

8 files changed

+232
-23
lines changed

build/build_all.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def main():
3939
parser.add_argument('--figshare', action='store_true', help="Upload all local data to Figshare. FIGSHARE_TOKEN must be set in local environment.")
4040
parser.add_argument('--all',dest='all',default=False,action='store_true', help="Run all data build commands. This includes docker, samples, omics, drugs, exp arguments. This does not run the validate or figshare commands")
4141
parser.add_argument('--high_mem',dest='high_mem',default=False,action='store_true',help = "If you have 32 or more CPUs, this option is recommended. It will run many code portions in parallel. If you don't have enough memory, this will cause a run failure.")
42-
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx',help='Datasets to process. Defaults to all available.')
42+
parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,cptac,mpnst,mpnstpdx,pancpdo',help='Datasets to process. Defaults to all available.')
4343
parser.add_argument('--version', type=str, required=False, help='Version number for the Figshare upload title (e.g., "0.1.29"). This is required for Figshare upload. This must be a higher version than previously published versions.')
4444
parser.add_argument('--github-username', type=str, required=False, help='GitHub username for the repository.')
4545
parser.add_argument('--github-email', type=str, required=False, help='GitHub email for the repository.')
@@ -119,6 +119,7 @@ def process_docker(datasets):
119119
'beataml': ['beataml'],
120120
'mpnst': ['mpnst'],
121121
'mpnstpdx': ['mpnstpdx'],
122+
'pancpdo': ['pancpdo'],
122123
'cptac': ['cptac'],
123124
'genes': ['genes'],
124125
'upload': ['upload']

build/docker/Dockerfile.pancpdo

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ WORKDIR /usr/src/app
44

55
COPY build/pancpdo/01-createPancPDOSamplesFile.py .
66
COPY build/pancpdo/02-getPancPDOData.py .
7+
COPY build/pancpdo/02a-getPancPDODataFromSynapse.py .
78
COPY build/pancpdo/03-getPancPDODrugs.py .
89
COPY build/pancpdo/04-getPancPDOExperiments.py .
910
COPY build/pancpdo/05-addPrecalcAUC.py .
@@ -18,4 +19,5 @@ ENV MPLCONFIGDIR=/app/tmp/matplotlib
1819
RUN mkdir -p /app/tmp/matplotlib
1920

2021
RUN pip install --no-cache-dir -r requirements.txt
22+
2123
VOLUME ['/tmp']

build/genes/00-buildGeneFile.R

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ library(dplyr)
1010
##get entrez ids to symbol
1111
entrez<-as.data.frame(org.Hs.egALIAS2EG)
1212

13+
sym <- as.data.frame(org.Hs.egSYMBOL)
14+
1315
##get entrez ids to ensembl
1416
ens<-as.data.frame(org.Hs.egENSEMBL2EG)
1517

@@ -22,25 +24,34 @@ ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
2224
tab <- getBM(attributes=c('ensembl_gene_id'),filters='biotype', values=c('protein_coding'),mart=ensembl)
2325

2426

25-
joined.df<-entrez%>%full_join(ens)%>%
26-
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='ensembl_id')%>%
27-
mutate(other_id_source='ensembl_gene')|>
28-
mutate(is_protein=other_id%in%tab$ensembl_gene_id)|>
29-
subset(is_protein)|>
30-
dplyr::select(-is_protein)
27+
joined.df<-entrez|>
28+
left_join(sym)|>
29+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='alias_symbol',gene_symbol='symbol')%>%
30+
mutate(other_id_source='entrez_alias')
31+
32+
##now get aliases from ensembl
33+
edf <- sym|>
34+
inner_join(ens)|>
35+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='ensembl_id')%>%
36+
mutate(other_id_source='ensembl_gene')
37+
3138

32-
tdf<-entrez|>
33-
full_join(enst)|>
34-
dplyr::rename(entrez_id='gene_id',gene_symbol='alias_symbol',other_id='trans_id')|>
35-
subset(entrez_id%in%joined.df$entrez_id)|>
36-
subset(gene_symbol%in%joined.df$gene_symbol)|>
39+
tdf<-sym|>
40+
inner_join(enst)|>
41+
dplyr::rename(entrez_id='gene_id',gene_symbol='symbol',other_id='trans_id')|>
42+
subset(entrez_id%in%edf$entrez_id)|>
43+
# subset(gene_symbol%in%ed.df$gene_symbol)|>
3744
dplyr::mutate(other_id_source='ensembl_transcript')
3845

39-
joined.df<-rbind(joined.df,tdf)|>
46+
47+
prots<-subset(edf,other_id%in%tab$ensembl_gene_id)
48+
49+
full.df<-rbind(joined.df,edf,tdf)|>
50+
subset(entrez_id%in%prots$entrez_id)|>
4051
distinct()
4152

4253
#save to file and version
43-
write.table(joined.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
54+
write.table(full.df,'/tmp/genes.csv',sep=',',row.names=F,quote=T)
4455

4556
##store this file somewhere!
4657

build/pancpdo/01-createPancPDOSamplesFile.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -270,18 +270,26 @@ def filter_and_subset_data(df, maxval, mapfile):
270270
# Convert 'other_names' to string to ensure consistency
271271
longtab['other_names'] = longtab['other_names'].astype(str)
272272

273+
#print(longtab)
273274
# Reassign 'improve_sample_id's at the end
274275
unique_other_names = longtab['other_names'].unique()
275276
print("Number of unique 'other_names' after filtering:", len(unique_other_names))
276277

278+
##UPDATE: assign them to common_names instead!
279+
unique_common_names = longtab['common_name'].unique()
280+
print("Number of unique 'common_names' after filtering:", len(unique_common_names))
277281
# Create a new mapping
282+
#mapping = pd.DataFrame({
283+
# 'other_names': unique_other_names,
284+
# 'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
285+
#})
278286
mapping = pd.DataFrame({
279-
'other_names': unique_other_names,
280-
'improve_sample_id': range(int(maxval) + 1, int(maxval) + len(unique_other_names) + 1)
281-
})
287+
'common_name':unique_common_names,
288+
'improve_sample_id': range(int(maxval) +1, int(maxval) + len(unique_common_names)+1)
289+
})
282290

283291
# Merge the mapping back into 'longtab'
284-
longtab = pd.merge(longtab, mapping, on='other_names', how='left')
292+
longtab = pd.merge(longtab, mapping, on='common_name', how='left')
285293

286294
# Debugging: Check longtab after reassigning IDs
287295
print("\nlongtab columns after reassigning 'improve_sample_id':", longtab.columns)

build/pancpdo/02-getPancPDOData.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def use_gdc_tool(manifest_data, data_type, download_data):
183183

184184
# Initialize retry variables
185185
retries = 0
186-
max_retries = 5
186+
max_retries = 1
187187

188188
# Function to get downloaded file IDs
189189
def get_downloaded_ids(manifest_loc):
@@ -683,6 +683,12 @@ def main():
683683
final_data = align_to_schema(combined_data,args.type,7500,args.samples)
684684
gc.collect()
685685

686+
##what if we shrink samples to only include the values that have transcriptional data
687+
#this fails
688+
#newsamps = pd.read_csv(args.samples)
689+
#newsamps = newsamps[newsamps.improve_sample_id.isin(final_data.improve_sample_id)]
690+
#newsamps.to_csv(args.samples)
691+
686692
combined_data = None
687693

688694
print(f"final data:\n{final_data}")
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import pandas as pd
2+
import synapseclient
3+
import argparse
4+
import math
5+
6+
7+
def get_copy_call(a):
    """
    Helper Function - Determine copy call for a value.

    Maps a copy-number value to a discrete call using fixed thresholds.
    Returns NaN for missing input (None or NaN).

    NOTE(review): the value is used as-is; a commented-out expression in
    the history suggests a log2 transform was once applied here — the
    thresholds (0.731..., 1.214..., 1.731...) look linear-scale, confirm.
    """
    if a is None:
        return float('nan')

    if math.isnan(a):
        return float('nan')

    a_val = a  ##math.log2(float(a)+0.000001) ###this should not be exponent, should be log!!! 2**float(a)
    if a_val < 0.0:  # 0.5210507:
        return 'deep del'
    elif a_val < 0.7311832:
        return 'het loss'
    elif a_val < 1.214125:
        return 'diploid'
    elif a_val < 1.731183:
        return 'gain'
    else:
        return 'amp'
    # FIX: removed unreachable leftover `return pl.Series(...)` that
    # referenced undefined names `pl` and `arr` (polars vestige).
31+
32+
def parseCNVFile(fpath, sampid, genes):
    """
    Parse one Synapse CNV file into the Coderdata copy-number schema.

    Parameters
    ----------
    fpath : str
        Path to a tab-delimited, headerless CNV file.
    sampid : int
        improve_sample_id assigned to every row.
    genes : pd.DataFrame
        Gene table used to translate gene_symbol to entrez_id.

    Returns
    -------
    pd.DataFrame with columns improve_sample_id, entrez_id, copy_number,
    source, study, copy_call.
    """
    log2data = pd.read_csv(fpath, sep='\t', header=None)
    log2data.columns = ['gene_symbol','copy_number','Region','Type','Pos']
    log2data['improve_sample_id'] = sampid
    # join on gene_symbol to attach entrez ids; drop rows without a mapping
    newdat = pd.merge(log2data, genes)[['improve_sample_id','entrez_id','copy_number']].drop_duplicates()
    newdat['study'] = 'pancpdo'
    # FIX: was 'TiriacEtal' — normalized to 'TiriacEtAl' to match
    # parseMutFile so downstream merges see a single source label.
    newdat['source'] = 'TiriacEtAl'
    newdat = newdat[['improve_sample_id','entrez_id','copy_number','source','study']]
    newdat['copy_call'] = [get_copy_call(a) for a in newdat['copy_number']]
    return newdat
42+
43+
44+
# Map SnpEff effect annotations to MAF-style variant classifications.
# Several mappings are acknowledged approximations (see inline notes).
mutmap = {'CODON_CHANGE_PLUS_CODON_DELETION':'In_Frame_Del', ##this isn't a great mapping
          'CODON_CHANGE_PLUS_CODON_INSERTION':'In_Frame_Ins', ##this isn't a great mapping
          'CODON_DELETION':'In_Frame_Del',
          'CODON_INSERTION':'In_Frame_Ins',
          'DOWNSTREAM':"3'Flank",
          'FRAME_SHIFT':'Frameshift_Variant',
          'FRAME_SHIFT+SPLICE_SITE_ACCEPTOR+SPLICE_SITE_REGION+INTRON':'Frameshift_Variant',
          'FRAME_SHIFT+SPLICE_SITE_REGION':'Frameshift_Variant',
          'INTERGENIC':'IGR',
          'INTRON':'Intron',
          'NON_SYNONYMOUS_CODING':'Missense_Mutation',
          'NON_SYNONYMOUS_CODING+SPLICE_SITE_REGION':'Missense_Mutation',
          'SPLICE_SITE_ACCEPTOR+INTRON':'Splice_Site',
          'SPLICE_SITE_DONOR+INTRON':'Splice_Site',
          'SPLICE_SITE_REGION+INTRON':'Splice_Site',
          'SPLICE_SITE_REGION+NON_CODING_EXON_VARIANT':'Splice_Site',
          'SPLICE_SITE_REGION+SYNONYMOUS_CODING':'Silent',
          'START_GAINED+UTR_5_PRIME':'Start_Codon_Ins',
          'STOP_GAINED':'Stop_Codon_Ins', ## NOTE(review): MAF convention usually maps STOP_GAINED to 'Nonsense_Mutation' — confirm intent
          'STOP_GAINED+CODON_CHANGE_PLUS_CODON_INSERTION':'Stop_Codon_Ins',
          'SYNONYMOUS_CODING':'Silent',
          'UPSTREAM':"5'Flank",
          'UTR_3_PRIME':"3'UTR",
          'UTR_5_PRIME':"5'UTR"
          }
69+
70+
def parseMutFile(fpath, sampid,genes):
    '''
    Reshape a Synapse WES mutation file into the Coderdata schema:
    entrez_id, improve_sample_id, source, study, mutation, variant_classification
    '''
    # pull just the three columns we need, dropping rows with no CDS change
    raw = pd.read_csv(fpath, sep='\t')[['SNPEFF_GENE_NAME','SNPEFF_EFFECT','SNPEFF_CDS_CHANGE']]
    raw = raw.dropna(subset='SNPEFF_CDS_CHANGE')
    raw.columns = ['gene_symbol','SNPEFF_EFFECT','mutation']
    # translate SnpEff effect terms to MAF-style classifications via mutmap
    effect_tab = pd.DataFrame({'SNPEFF_EFFECT':mutmap.keys(),'variant_classification':mutmap.values()})
    merged = pd.merge(raw, effect_tab)
    # attach entrez ids from the gene table
    merged = pd.merge(merged, genes)
    merged['improve_sample_id'] = sampid
    merged['source'] = 'TiriacEtAl'
    merged['study'] = 'pancpdo'
    merged = merged[['improve_sample_id','entrez_id','source','study','mutation','variant_classification']]
    return merged.dropna().drop_duplicates()
86+
87+
def _resolve_sample_id(sname, tsamps, samps):
    """
    Map a Synapse file-name prefix to an improve_sample_id.

    Prefers samples that have transcriptomics data (tsamps), then falls
    back to the full sample table. Returns (improve_sample_id, status)
    where status is 'trans', 'notrans', or 'missed'; the id is None when
    the name could not be resolved.
    """
    if sname in set(tsamps.other_id):
        return tsamps.loc[tsamps.other_id == sname]['improve_sample_id'].values[0], 'trans'
    if sname in set(samps.other_id):
        return samps.loc[samps.other_id == sname]['improve_sample_id'].values[0], 'notrans'
    return None, 'missed'


def main():
    """
    Collect WES (mutation) and CNV data for pancpdo from Synapse and write
    gzipped CSVs in the Coderdata schema to /tmp.
    """
    parser = argparse.ArgumentParser(description = 'Script that collects WES and CNV data from Synapse for Coderdata')
    parser.add_argument('-s', '--samples', help='Path to sample file',default=None)
    parser.add_argument('-g', '--genes', help='Path to genes file', default = None)
    parser.add_argument('-c', '--copy', help='Flag to capture copy number data', action='store_true', default=False)
    parser.add_argument('-m', '--mutation', help='Flag to capture mutation data', action='store_true', default=False)
    parser.add_argument('-t', '--token', help='Synapse token')

    args = parser.parse_args()
    if args.samples is None or args.genes is None:
        print('We need at least a genes and samples file to continue')
        exit()
    samps = pd.read_csv(args.samples)
    genes = pd.read_csv(args.genes)

    print("Logging into synapse")
    sc = synapseclient.Synapse()
    sc.login(authToken=args.token)

    ##to double check identifiers, we use transcriptomics data since that determines what samples were sequenced
    # NOTE(review): assumes 02-getPancPDOData.py already wrote this file — confirm ordering in build_omics.sh
    trans = pd.read_csv('/tmp/pancpdo_transcriptomics.csv.gz')
    tsamps = samps[samps.improve_sample_id.isin(trans.improve_sample_id)]
    print(samps.shape)
    print(tsamps.shape)

    missingsamples = []
    if args.copy:
        ##query synapse view for files
        cnvs = sc.tableQuery("select * from syn64608378 where parentId='syn64608163'").asDataFrame()
        alldats = []
        ##go through table and get every file
        for index, row in cnvs.iterrows():
            sid = row.id
            sname = row['name'].split('--')[0]
            print(sid, sname)
            sampid, status = _resolve_sample_id(sname, tsamps, samps)
            missingsamples.append('copy,' + status + ',' + sname)
            if sampid is None:
                print('Missing sample id for ' + sname, ' skipping for now')
                continue
            # FIX: download only after the sample resolves (was fetched
            # unconditionally, wasting downloads on skipped files), and
            # dropped the redundant sampid reassignment that overrode the
            # transcriptomics-preferred id.
            path = sc.get(sid).path
            alldats.append(parseCNVFile(path, sampid, genes))
        if alldats:  # guard: pd.concat raises on an empty list
            newcnv = pd.concat(alldats)
            newcnv.to_csv('/tmp/pancpdo_copy_number.csv.gz', compression='gzip', index=False)

    if args.mutation:
        wes = sc.tableQuery("select * from syn64608378 where parentId='syn64608263'").asDataFrame()
        alldats = []
        ##go through and get every mutation file
        for index, row in wes.iterrows():
            sname = row['name'].split('--')[0]
            sid = row.id
            print(sid, sname)
            sampid, status = _resolve_sample_id(sname, tsamps, samps)
            missingsamples.append('mutation,' + status + ',' + sname)
            if sampid is None:
                print('Missing sample id for ' + sname)
                continue
            path = sc.get(sid).path
            alldats.append(parseMutFile(path, sampid, genes))
        if alldats:
            newmut = pd.concat(alldats)
            newmut.to_csv("/tmp/pancpdo_mutations.csv.gz", compression='gzip', index=False)
    #pd.DataFrame(missingsamples).to_csv('missing.csv',index=False,quoting=None,header=False)
170+
# Script entry point: run the Synapse data collection when executed directly.
if __name__ == '__main__':
    main()

build/pancpdo/04-getPancPDOExperiments.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,15 +95,22 @@ def get_data(token):
9595

9696
##renormalize values to max
9797
##IMPORTANT: this is how we normalize without DMSO. We need to consider how we're doing this for EACH ORGANOID
98-
##currently we take the max value of each orgnaoid/replicate.
99-
rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
100-
rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00
98+
##currently we take the max value of each organoid/replicate.
99+
##UPDATE: see below
100+
# rtab["MaxRep"] = rtab.groupby(['Drug','Organoid','Rep']).Response.transform('max')
101+
# rtab['PercResponse'] = (rtab.Response/rtab.MaxRep)*100.00
101102

102103

103104
##dosenum is a dummy value to use for merging since we need to repeat the concentrations over and over
104105
dosenum = [a for a in range(15)]
105106
rtab['Dosenum']=dosenum*int(rtab.shape[0]/15)
106107

108+
##The last dose (dosenum ==14) is the control value per Herve. we now must normalize to that
109+
110+
dmso_vals = rtab[rtab.Dosenum==14][['Organoid','Drug','Rep','Response']].rename({'Response':'DMSO'},axis=1)
111+
full_res = rtab.merge(dmso_vals,on=['Organoid','Drug','Rep'])
112+
full_res['PercResponse'] = 100*(full_res.Response/full_res.DMSO)
113+
107114
#print(set(rtab.Drug))
108115
##merge the concentrations
109116
concs = concs.dropna().melt(value_vars=concs.columns,var_name='Drug',value_name='Dose')
@@ -114,7 +121,7 @@ def get_data(token):
114121
concs['Dosenum'] = dosenum*int(concs.shape[0]/15)##creating dosenum here to merge
115122
#print(set(concs.Drug))
116123

117-
return rtab.merge(concs)
124+
return full_res.merge(concs)
118125

119126
if __name__=='__main__':
120127
main()

build/pancpdo/build_omics.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ trap 'echo "Error on or near line $LINENO while executing: $BASH_COMMAND"; exit
66
echo "Running 02-getPancPDOData.py for transcriptomics."
77
python 02-getPancPDOData.py -m full_manifest.txt -t transcriptomics -o /tmp/pancpdo_transcriptomics.csv.gz -g $1 -s $2
88

9+
echo 'Running 02a-getPancPDODataFromSynapse.py for copy number and mutations'
10+
python 02a-getPancPDODataFromSynapse.py -g $1 -s $2 -t $SYNAPSE_AUTH_TOKEN -c -m
11+
912
#echo "Running 02-getPancPDOData.py for copy_number."
1013
#python 02-getPancPDOData.py -m full_manifest.txt -t copy_number -o /tmp/pancpdo_copy_number.csv.gz -g $1 -s $2
1114

0 commit comments

Comments
 (0)