Skip to content

Commit 95d55f5

Browse files
authored
Merge pull request #173 from PNNL-CompBio/nci60-add
Nci60 and other fixes
2 parents 13f5748 + e1dcd45 commit 95d55f5

21 files changed

+481
-42283
lines changed

build/beatAML/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ requests
44
synapseclient
55
argparse
66
numpy
7+
openpyxl
8+
matplotlib
9+
scikit-learn

build/broad_sanger/01-broadSangerSamples.R

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
library(curl)
99
library(dplyr)
1010
library(readr)
11+
library(XML)
1112

1213
##the only thing that Priyanka has here is TRP identifiers, so collecting those
1314
# tab<-read.table('DepMap_Argonne_Mapping.csv',sep=',',header=T)%>%
@@ -31,34 +32,37 @@ print(paste("Downloaded",nrow(depmap_models),'dep map identifiers and',nrow(sang
3132

3233
##query for cellosaurus automagically to get latest version
3334
url='https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml'
34-
if(!file.exists('cell.xml'))
35-
curl_download(url,'cell.xml',quiet=TRUE)#curl(url, "r", h)
36-
cello<-XML::xmlParse('cell.xml')
35+
if(!file.exists('/tmp/cell.xml'))
36+
curl_download(url,'/tmp/cell.xml',quiet=TRUE)#curl(url, "r", h)
37+
cello<-XML::xmlParse('/tmp/cell.xml')
3738
cdf<-XML::xmlToList(cello)
3839

39-
40+
print('Got all cellosaurus ids')
4041
### next we parse through cellosaurus to get as many samples as we deem relevant
4142
##ok, this command seems to have gotten file in appropriate state
4243
cell.lines<-lapply(cdf$`cell-line-list`, function(x) unlist(x))
4344

4445
##now we need to extract columns
4546
options(show.error.messages=TRUE)
4647
full.res<-do.call(rbind,lapply(cell.lines,function(x){
47-
##create a data frame for each cell lines
48-
x<-unlist(x)
49-
#should only be one accession
50-
acc<-x[grep('accession.text',names(x),fixed=T)]%>%unlist()
51-
52-
cn<-x[grep('name.text',names(x),fixed=T)]%>%unlist()
53-
#these will fail if no key found
54-
spec<-x[grep("species-list.cv-term.text",names(x),fixed=T)]%>%unlist()
55-
#dis<-x[grep("disease-list.cv-term.text",names(x),fixed=T)]%>%unlist()
56-
data.frame(accession=cn,
57-
RRID=rep(acc,length(cn)),
58-
species=rep(spec,length(cn)))
59-
# disease=rep(dis,length(cn)))
48+
##create a data frame for each cell line
49+
x<-unlist(x)
50+
#should only be one accession
51+
acc<-x[grep('accession.text',names(x),fixed=T)]%>%unlist()
52+
53+
cn<-x[grep('name-list.name.text',names(x),fixed=T)]%>%unlist()
54+
#these will fail if no key found
55+
spec<-x[grep("species-list.xref.label",names(x),fixed=T)]%>%unlist()
56+
#print(acc)
57+
#print(cn)
58+
#print(spec)
59+
#dis<-x[grep("disease-list.cv-term.text",names(x),fixed=T)]%>%unlist()
60+
data.frame(accession=cn,
61+
RRID=rep(acc,length(cn)),
62+
species=rep(spec,length(cn)))
63+
# disease=rep(dis,length(cn)))
6064
}))%>%
61-
subset(species=='Homo sapiens (Human)')
65+
subset(species=='Homo sapiens (Human)')
6266

6367

6468
print(paste('Got',nrow(full.res),'human cellosaurus samples'))

build/broad_sanger/03-createDrugFile.R

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@ all.dsets<-PharmacoGx::availablePSets()
1313

1414

1515
#' getCellLineData - gets cell line dose response data
16-
getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','NCI60','CCLE')){
17-
16+
getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','CCLE'),efile=''){
1817

18+
if(efile!=''){
19+
existing_ids=readr::read_tsv(efile)
20+
}else{
21+
existing_ids=NULL
22+
}
1923
for(cel in cell.lines){
2024

2125
files<-subset(all.dsets,`Dataset Name`==cel)%>%
@@ -55,6 +59,12 @@ getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','
5559
# dplyr::select(common_drug_name='chem_name',improve_drug_id)%>%
5660
# distinct()
5761
chem_list <- unique(mapping$treatmentid)
62+
print(paste('Found',length(chem_list),'chemicals for dataset',cel))
63+
# if(!is.null(existing_ids)){
64+
# chem_list=setdiff(chem_list,existing_ids$chem_name)
65+
# print(paste('Reducing to',length(chem_list),'after accounting for existing ids'))
66+
# }
67+
5868
output_file_path <- '/tmp/broad_sanger_drugs.tsv'
5969
ignore_file_path <- '/tmp/ignore_chems.txt'
6070
update_dataframe_and_write_tsv(unique_names=chem_list,output_filename=output_file_path,ignore_chems=ignore_file_path)
@@ -70,15 +80,17 @@ getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','
7080

7181
main<-function(){
7282
args = commandArgs(trailingOnly=TRUE)
73-
if(length(args)!=1){
74-
print('Usage: Rscript 03-createDrugFile.R [datasets]')
83+
if(length(args)<2){
84+
print('Usage: Rscript 03-createDrugFile.R [datasets] [existing file]')
7585
# exit()
7686
}
7787
# sfile = args[1]
7888
dsets<-unlist(strsplit(args[1],split=','))
79-
80-
81-
dl1<-getDepMapDrugData(dsets)
89+
if(length(args)==2)
90+
efile=args[2]
91+
else
92+
efile=''
93+
dl1<-getDepMapDrugData(dsets,efile)
8294

8395

8496
}

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
2+
3+
'''
4+
gets nci60 drug information
5+
'''
6+
7+
import polars as pl
8+
import os
9+
import argparse
10+
import pubchem_retrieval as pr
11+
import random as rand
12+
from urllib import request
13+
14+
##drug files
15+
smi_strings='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=1&modificationDate=1710381820000&api=v2&download=true'
16+
pc_ids='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=2&modificationDate=1712766341112&api=v2&download=true'
17+
chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemcal_name.csv?version=1&modificationDate=1710382716000&api=v2&download=true'
18+
cas='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=1&modificationDate=1710381783000&api=v2&download=true'
19+
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
20+
21+
22+
def main():
23+
parser = argparse.ArgumentParser()
24+
parser.add_argument('--test',action='store_true',default=False,help='Test script by sampling 100 chemicals')
25+
parser.add_argument('--output',default='/tmp/broad_sanger_drugs.tsv')
26+
opts = parser.parse_args()
27+
28+
###primary DF
29+
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
30+
'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}
31+
32+
print('Downloading NSC identifiers for nci60 data')
33+
names = pl.read_csv(chemnames,ignore_errors=True)
34+
#castab = pd.read_csv(cas)
35+
pubchems = pl.read_csv(pc_ids)
36+
smiles = pl.read_csv(smi_strings)
37+
38+
print('Getting experimental data to filter drugs')
39+
if not os.path.exists('DOSERESP.csv'):
40+
resp = request.urlretrieve(conc_data,'doseresp.zip')
41+
os.system('unzip doseresp.zip')
42+
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
43+
pubchems = pubchems.filter(pl.col('NSC').is_in(dose_resp['NSC']))
44+
##first retrieve pubchem data
45+
if opts.test:
46+
arr = rand.sample(list(pubchems['CID']),100)
47+
else:
48+
arr = set(pubchems['CID'])
49+
50+
print("Querying pubchem from CIDs")
51+
pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False,time_limit=10*60*60)
52+
53+
##then make sure to paste `nsc` in front of all nsc ids
54+
res = pl.read_csv(opts.output,separator='\t')
55+
56+
57+
nsc = list(pubchems.filter(pl.col('CID').is_in(list(res['pubchem_id'])))['NSC'])
58+
59+
print('Checking NSCs to see what we missed')
60+
missing = [n for n in nsc if 'nsc'+str(n) not in res['chem_name'] and 'nsc-'+str(n) not in res['chem_name']]
61+
62+
##check ignore_chems.txt
63+
print('missing '+str(len(missing))+' nsc ids')
64+
65+
msmi = smiles.filter(pl.col('NSC').is_in(missing))
66+
print('Found SMILE strings for '+str(msmi.shape[1])+' NSCs')
67+
68+
##add in improve ids, nsc name and structure for all.
69+
mdf = msmi.join(names,on='NSC',how='left').join(pubchems,on='NSC',how='left')
70+
71+
max_imp = max(int(a.split('_')[1]) for a in res['improve_drug_id'])
72+
73+
smicount=len(set(mdf['SMILES'])) ## unique smiles in our missing data frame
74+
newdf = pl.DataFrame(
75+
{
76+
"improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
77+
'canSMILES': [a for a in set(mdf['SMILES'])],
78+
'isoSMILES': [a for a in set(mdf['SMILES'])],
79+
'InChIKey': [None for a in range(smicount)],
80+
'formula': [None for a in range(smicount)],
81+
'weight': [None for a in range(smicount)]
82+
}
83+
)
84+
85+
#create updated nsc ids and names
86+
namedf = pl.DataFrame(
87+
{
88+
"nscid": ['nsc-'+str(a) for a in mdf['NSC']],
89+
'lower_name': [a if a is None else str(a).lower() for a in mdf['NAME']],
90+
'canSMILES': list(mdf['SMILES']),
91+
'pubchem_id': list(mdf['CID'])
92+
}
93+
)
94+
#merge and melt
95+
merged = pl.concat([mdf,namedf],how='horizontal').select(['SMILES','pubchem_id','nscid','lower_name'])
96+
melted = merged.melt(id_vars=['SMILES','pubchem_id'],value_vars=['nscid','lower_name']).select(['SMILES','pubchem_id','value']).unique()
97+
melted.columns = ['canSMILES','pubchem_id','chem_name']
98+
if newdf.shape[0]>0:
99+
newdf = newdf.join(melted,on='canSMILES',how='inner').select(res.columns)
100+
res = pl.concat([res,newdf],how='vertical')
101+
res.write_csv(opts.output,separator='\t')
102+
103+
if __name__=='__main__':
104+
main()

build/broad_sanger/04-drug_dosage_and_curves.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
import os
1414
import argparse
15+
import pandas as pd
16+
import subprocess
1517

1618
parser = argparse.ArgumentParser()
1719
parser.add_argument('--curSampleFile',dest='samplefile',default=None,help='DepMap sample file')
@@ -23,27 +25,37 @@
2325
samplefile = opts.samplefile
2426
drugfile = opts.dfile
2527

26-
####step 4a - get dose response data
27-
cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' CTRPv2,FIMM,GDSC'
28+
cmd = ['/opt/venv/bin/python','04b-nci60-updated.py','--sampleFile='+samplefile,'--drugFile='+drugfile]
2829
print(cmd)
29-
os.system(cmd)
30+
subprocess.run(cmd)
3031

31-
cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' gCSI,PRISM,CCLE'
32+
####step 4a - get dose response data
33+
cmd = ['Rscript','04a-drugResponseData.R',samplefile,drugfile,'CTRPv2,FIMM,GDSC']
3234
print(cmd)
33-
os.system(cmd)
35+
subprocess.run(cmd)
3436

35-
cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' NCI60'
37+
cmd = ['Rscript','04a-drugResponseData.R',samplefile,drugfile,'gCSI,PRISM,CCLE']
3638
print(cmd)
37-
os.system(cmd)
39+
subprocess.run(cmd)
40+
41+
42+
#cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' NCI60'
43+
#print(cmd)
44+
#os.system(cmd)
3845

3946
########Step 4b fit curves
4047
allfiles=[a for a in os.listdir('./') if 'DoseResponse' in a]
4148
print(allfiles)
4249
for a in allfiles:
43-
os.system('/opt/venv/bin/python fit_curve.py --input '+a+' --output '+a)
50+
subprocess.run(['/opt/venv/bin/python','fit_curve.py','--input='+a,'--output='+a])
4451

4552
###step 4c concatenate all files
53+
outfiles = [a for a in os.listdir("./") if ".0" in a]
54+
final_file = []
55+
for of in outfiles:
56+
final_file.append(pd.read_csv(of,sep='\t'))
4657

47-
os.system('cat *.0 > /tmp/broad_sanger_experiments.tsv')
58+
pd.concat(final_file).to_csv('/tmp/broad_sanger_experiments.tsv',index=False,sep='\t')
59+
#os.system('cat *.0 > /tmp/broad_sanger_experiments.tsv')
4860
#os.system('gzip -f /tmp/experiments.tsv')
4961

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
'''
2+
gets nci60 data from 10/2023 release
3+
4+
'''
5+
6+
import polars as pl
7+
import argparse
8+
#from zipfile import ZipFile
9+
import os
10+
#from io import BytesIO
11+
import re
12+
from urllib import request
13+
14+
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
15+
cancelled = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP_Cancelled.csv?version=1&modificationDate=1660871847000&api=v2&download=true'
16+
17+
def main():
18+
19+
parser = argparse.ArgumentParser()
20+
parser.add_argument('--sampleFile',dest='samplefile',default=None,help='DepMap sample file')
21+
parser.add_argument('--drugFile',dest='dfile',default=None,help='Drug database')
22+
23+
opts = parser.parse_args()
24+
25+
samplefile = opts.samplefile
26+
drugfile = opts.dfile
27+
if not os.path.exists('DOSERESP.csv'):
28+
resp = request.urlretrieve(conc_data,'doseresp.zip')
29+
os.system('unzip doseresp.zip')
30+
31+
samples = pl.read_csv(samplefile,quote_char='"')
32+
drugs = pl.read_csv(drugfile,separator='\t',quote_char='"')
33+
34+
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
35+
36+
##update drug mapping
37+
drugmapping = pl.DataFrame(
38+
{
39+
'chem_name' : ['nsc-'+str(nsc) for nsc in set(dose_resp['NSC'])],
40+
'NSC' : [a for a in set(dose_resp['NSC'])]
41+
}
42+
)
43+
44+
drugmapping = drugmapping.join(drugs,on='chem_name')[['NSC','improve_drug_id']]
45+
drugmapping = drugmapping.unique()
46+
47+
###update sample mapping
48+
on = samples[['other_names','improve_sample_id']]
49+
on.columns=['common_name','improve_sample_id']
50+
51+
sampmapping = pl.concat([on[['common_name','improve_sample_id']],samples[['common_name','improve_sample_id']]])
52+
53+
sampmapping = sampmapping.unique()
54+
sampmapping.columns = ['CELL_NAME','improve_sample_id']
55+
56+
###create a time mapping table
57+
timemapping = pl.DataFrame(
58+
{
59+
'EXPID':dose_resp['EXPID'],
60+
'time':[72 if int(a[0:2])>22 and int(a[0:2])<50 and int(a[2:4])>0 else 48 for a in dose_resp['EXPID']],
61+
'time_unit':['hours' for a in dose_resp['EXPID']]
62+
}
63+
).unique()
64+
65+
66+
##now we can merge all the data into the dose response data frame
67+
merged = dose_resp[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
68+
merged = merged.join(timemapping,on='EXPID',how='left')
69+
70+
##clean up missing samples
71+
nonulls = merged.filter(pl.col('improve_sample_id').is_not_null())
72+
73+
nulls = merged.filter(pl.col('improve_sample_id').is_null())
74+
75+
newnames = pl.DataFrame(
76+
{
77+
'new_name':[re.split(' |\(|\/',a)[0] for a in nulls['CELL_NAME']],
78+
'CELL_NAME':nulls['CELL_NAME']
79+
}
80+
)
81+
newnames = newnames.unique()
82+
83+
fixed = nulls[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
84+
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
85+
fixed = fixed.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
86+
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
87+
fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
88+
89+
merged = pl.concat([nonulls,fixed])
90+
91+
###we get a few more results added, but still missing a bunch
92+
merged = merged.join(drugmapping,on='NSC',how='left')
93+
nulldrugs = merged.filter(pl.col('improve_drug_id').is_null())
94+
nonulls = merged.filter(pl.col('improve_drug_id').is_not_null())
95+
finaldf = pl.DataFrame(
96+
{
97+
'source':['NCI60_24' for a in nonulls['improve_drug_id']], ##2024 build
98+
'improve_sample_id':nonulls['improve_sample_id'],
99+
'Drug':nonulls['improve_drug_id'],
100+
'study':['NCI60' for a in nonulls['improve_drug_id']],
101+
'time':nonulls['time'],
102+
'time_unit':nonulls['time_unit'],
103+
'DOSE': [10**a for a in nonulls['CONCENTRATION']],
104+
'GROWTH':nonulls['AVERAGE_PTC']
105+
}
106+
)
107+
##write to file
108+
finaldf.write_csv('nci60DoseResponse',separator='\t')
109+
110+
111+
if __name__=='__main__':
112+
main()

0 commit comments

Comments
 (0)