Skip to content

Commit 95d55f5

Browse files
authored
Merge pull request #173 from PNNL-CompBio/nci60-add
Nci60 and other fixes
2 parents 13f5748 + e1dcd45 commit 95d55f5

21 files changed

+481
-42283
lines changed

build/beatAML/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ requests
44
synapseclient
55
argparse
66
numpy
7+
openpyxl
8+
matplotlib
9+
scikit-learn

build/broad_sanger/01-broadSangerSamples.R

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
library(curl)
99
library(dplyr)
1010
library(readr)
11+
library(XML)
1112

1213
##the only thing that Priyanka has here is TRP identifiers, so collecting those
1314
# tab<-read.table('DepMap_Argonne_Mapping.csv',sep=',',header=T)%>%
@@ -31,34 +32,37 @@ print(paste("Downloaded",nrow(depmap_models),'dep map identifiers and',nrow(sang
3132

3233
##query for cellosaurus automagically to get latest version
3334
url='https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml'
34-
if(!file.exists('cell.xml'))
35-
curl_download(url,'cell.xml',quiet=TRUE)#curl(url, "r", h)
36-
cello<-XML::xmlParse('cell.xml')
35+
if(!file.exists('/tmp/cell.xml'))
36+
curl_download(url,'/tmp/cell.xml',quiet=TRUE)#curl(url, "r", h)
37+
cello<-XML::xmlParse('/tmp/cell.xml')
3738
cdf<-XML::xmlToList(cello)
3839

39-
40+
print('Got all cellosaurus ids')
4041
### next we parse through cellosaurus to get as many samples as we deem relevant
4142
##ok, this command seems to have gotten file in appropriate state
4243
cell.lines<-lapply(cdf$`cell-line-list`, function(x) unlist(x))
4344

4445
##now we need to extract columns
4546
options(show.error.messages=TRUE)
4647
full.res<-do.call(rbind,lapply(cell.lines,function(x){
47-
##create a data frame for each cell lines
48-
x<-unlist(x)
49-
#should only be one accession
50-
acc<-x[grep('accession.text',names(x),fixed=T)]%>%unlist()
51-
52-
cn<-x[grep('name.text',names(x),fixed=T)]%>%unlist()
53-
#these will fail if no key found
54-
spec<-x[grep("species-list.cv-term.text",names(x),fixed=T)]%>%unlist()
55-
#dis<-x[grep("disease-list.cv-term.text",names(x),fixed=T)]%>%unlist()
56-
data.frame(accession=cn,
57-
RRID=rep(acc,length(cn)),
58-
species=rep(spec,length(cn)))
59-
# disease=rep(dis,length(cn)))
48+
##create a data frame for each cell line
49+
x<-unlist(x)
50+
#should only be one accession
51+
acc<-x[grep('accession.text',names(x),fixed=T)]%>%unlist()
52+
53+
cn<-x[grep('name-list.name.text',names(x),fixed=T)]%>%unlist()
54+
#these will fail if no key found
55+
spec<-x[grep("species-list.xref.label",names(x),fixed=T)]%>%unlist()
56+
#print(acc)
57+
#print(cn)
58+
#print(spec)
59+
#dis<-x[grep("disease-list.cv-term.text",names(x),fixed=T)]%>%unlist()
60+
data.frame(accession=cn,
61+
RRID=rep(acc,length(cn)),
62+
species=rep(spec,length(cn)))
63+
# disease=rep(dis,length(cn)))
6064
}))%>%
61-
subset(species=='Homo sapiens (Human)')
65+
subset(species=='Homo sapiens (Human)')
6266

6367

6468
print(paste('Got',nrow(full.res),'human cellosaurus samples'))

build/broad_sanger/03-createDrugFile.R

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,13 @@ all.dsets<-PharmacoGx::availablePSets()
1313

1414

1515
#' getCellLineData - gets cell line dose response data
16-
getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','NCI60','CCLE')){
17-
16+
getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','CCLE'),efile=''){
1817

18+
if(efile!=''){
19+
existing_ids=readr::read_tsv(efile)
20+
}else{
21+
existing_ids=NULL
22+
}
1923
for(cel in cell.lines){
2024

2125
files<-subset(all.dsets,`Dataset Name`==cel)%>%
@@ -55,6 +59,12 @@ getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','
5559
# dplyr::select(common_drug_name='chem_name',improve_drug_id)%>%
5660
# distinct()
5761
chem_list <- unique(mapping$treatmentid)
62+
print(paste('Found',length(chem_list),'chemicals for dataset',cel))
63+
# if(!is.null(existing_ids)){
64+
# chem_list=setdiff(chem_list,existing_ids$chem_name)
65+
# print(paste('Reducing to',length(chem_list),'after accounting for existing ids'))
66+
# }
67+
5868
output_file_path <- '/tmp/broad_sanger_drugs.tsv'
5969
ignore_file_path <- '/tmp/ignore_chems.txt'
6070
update_dataframe_and_write_tsv(unique_names=chem_list,output_filename=output_file_path,ignore_chems=ignore_file_path)
@@ -70,15 +80,17 @@ getDepMapDrugData<-function(cell.lines=c('CTRPv2','FIMM','gCSI','PRISM','GDSC','
7080

7181
main<-function(){
7282
args = commandArgs(trailingOnly=TRUE)
73-
if(length(args)!=1){
74-
print('Usage: Rscript 03-createDrugFile.R [datasets]')
83+
if(length(args)<2){
84+
print('Usage: Rscript 03-createDrugFile.R [datasets] [existing file]')
7585
# exit()
7686
}
7787
# sfile = args[1]
7888
dsets<-unlist(strsplit(args[1],split=','))
79-
80-
81-
dl1<-getDepMapDrugData(dsets)
89+
if(length(args)==2)
90+
efile=args[2]
91+
else
92+
efile=''
93+
dl1<-getDepMapDrugData(dsets,efile)
8294

8395

8496
}

build/broad_sanger/03a-nci60Drugs.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
2+
3+
'''
4+
gets nci60 drug information
5+
'''
6+
7+
import polars as pl
8+
import os
9+
import argparse
10+
import pubchem_retrieval as pr
11+
import random as rand
12+
from urllib import request
13+
14+
##drug files
15+
smi_strings='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_smiles.csv?version=1&modificationDate=1710381820000&api=v2&download=true'
16+
pc_ids='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_sid_cid.csv?version=2&modificationDate=1712766341112&api=v2&download=true'
17+
chemnames='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_chemcal_name.csv?version=1&modificationDate=1710382716000&api=v2&download=true'
18+
cas='https://wiki.nci.nih.gov/download/attachments/155844992/nsc_cas.csv?version=1&modificationDate=1710381783000&api=v2&download=true'
19+
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
20+
21+
22+
def main():
23+
parser = argparse.ArgumentParser()
24+
parser.add_argument('--test',action='store_true',default=False,help='Test script by sampling 100 chemicals')
25+
parser.add_argument('--output',default='/tmp/broad_sanger_drugs.tsv')
26+
opts = parser.parse_args()
27+
28+
###primary DF
29+
df = {'improve_drug_id':[],'chem_name':[],'canSMILES':[],'isoSMILES':[],\
30+
'InChIKey':[],'formula':[],'weight':[],'pubchem_id':[]}
31+
32+
print('Downloading NSC identifiers for nci60 data')
33+
names = pl.read_csv(chemnames,ignore_errors=True)
34+
#castab = pd.read_csv(cas)
35+
pubchems = pl.read_csv(pc_ids)
36+
smiles = pl.read_csv(smi_strings)
37+
38+
print('Getting experimental data to filter drugs')
39+
if not os.path.exists('DOSERESP.csv'):
40+
resp = request.urlretrieve(conc_data,'doseresp.zip')
41+
os.system('unzip doseresp.zip')
42+
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
43+
pubchems = pubchems.filter(pl.col('NSC').is_in(dose_resp['NSC']))
44+
##first retrieve pubchem data
45+
if opts.test:
46+
arr = rand.sample(list(pubchems['CID']),100)
47+
else:
48+
arr = set(pubchems['CID'])
49+
50+
print("Querying pubchem from CIDs")
51+
pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False,time_limit=10*60*60)
52+
53+
##then make sure to paste `nsc` in front of all nsc ids
54+
res = pl.read_csv(opts.output,separator='\t')
55+
56+
57+
nsc = list(pubchems.filter(pl.col('CID').is_in(list(res['pubchem_id'])))['NSC'])
58+
59+
print('Checking NSCs to see what we missed')
60+
missing = [n for n in nsc if 'nsc'+str(n) not in res['chem_name'] and 'nsc-'+str(n) not in res['chem_name']]
61+
62+
##check ignore_chems.txt
63+
print('missing '+str(len(missing))+' nsc ids')
64+
65+
msmi = smiles.filter(pl.col('NSC').is_in(missing))
66+
print('Found SMILE strings for '+str(msmi.shape[1])+' NSCs')
67+
68+
##add in improve ids, nsc name and structure for all.
69+
mdf = msmi.join(names,on='NSC',how='left').join(pubchems,on='NSC',how='left')
70+
71+
max_imp = max(int(a.split('_')[1]) for a in res['improve_drug_id'])
72+
73+
smicount=len(set(mdf['SMILES'])) ## unique smiles in our missing data frame
74+
newdf = pl.DataFrame(
75+
{
76+
"improve_drug_id": ["SMI_"+str(a) for a in range(max_imp+1,max_imp+1+smicount,1)],
77+
'canSMILES': [a for a in set(mdf['SMILES'])],
78+
'isoSMILES': [a for a in set(mdf['SMILES'])],
79+
'InChIKey': [None for a in range(smicount)],
80+
'formula': [None for a in range(smicount)],
81+
'weight': [None for a in range(smicount)]
82+
}
83+
)
84+
85+
#create updated nsc ids and names
86+
namedf = pl.DataFrame(
87+
{
88+
"nscid": ['nsc-'+str(a) for a in mdf['NSC']],
89+
'lower_name': [a if a is None else str(a).lower() for a in mdf['NAME']],
90+
'canSMILES': list(mdf['SMILES']),
91+
'pubchem_id': list(mdf['CID'])
92+
}
93+
)
94+
#merge and melt
95+
merged = pl.concat([mdf,namedf],how='horizontal').select(['SMILES','pubchem_id','nscid','lower_name'])
96+
melted = merged.melt(id_vars=['SMILES','pubchem_id'],value_vars=['nscid','lower_name']).select(['SMILES','pubchem_id','value']).unique()
97+
melted.columns = ['canSMILES','pubchem_id','chem_name']
98+
if newdf.shape[0]>0:
99+
newdf = newdf.join(melted,on='canSMILES',how='inner').select(res.columns)
100+
res = pl.concat([res,newdf],how='vertical')
101+
res.write_csv(opts.output,separator='\t')
102+
103+
if __name__=='__main__':
104+
main()

build/broad_sanger/04-drug_dosage_and_curves.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
import os
1414
import argparse
15+
import pandas as pd
16+
import subprocess
1517

1618
parser = argparse.ArgumentParser()
1719
parser.add_argument('--curSampleFile',dest='samplefile',default=None,help='DepMap sample file')
@@ -23,27 +25,37 @@
2325
samplefile = opts.samplefile
2426
drugfile = opts.dfile
2527

26-
####step 4a - get dose response data
27-
cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' CTRPv2,FIMM,GDSC'
28+
cmd = ['/opt/venv/bin/python','04b-nci60-updated.py','--sampleFile='+samplefile,'--drugFile='+drugfile]
2829
print(cmd)
29-
os.system(cmd)
30+
subprocess.run(cmd)
3031

31-
cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' gCSI,PRISM,CCLE'
32+
####step 4a - get dose response data
33+
cmd = ['Rscript','04a-drugResponseData.R',samplefile,drugfile,'CTRPv2,FIMM,GDSC']
3234
print(cmd)
33-
os.system(cmd)
35+
subprocess.run(cmd)
3436

35-
cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' NCI60'
37+
cmd = ['Rscript','04a-drugResponseData.R',samplefile,drugfile,'gCSI,PRISM,CCLE']
3638
print(cmd)
37-
os.system(cmd)
39+
subprocess.run(cmd)
40+
41+
42+
#cmd = 'Rscript 04a-drugResponseData.R '+samplefile+' '+drugfile+' NCI60'
43+
#print(cmd)
44+
#os.system(cmd)
3845

3946
########Step 4b fit curves
4047
allfiles=[a for a in os.listdir('./') if 'DoseResponse' in a]
4148
print(allfiles)
4249
for a in allfiles:
43-
os.system('/opt/venv/bin/python fit_curve.py --input '+a+' --output '+a)
50+
subprocess.run(['/opt/venv/bin/python','fit_curve.py','--input='+a,'--output='+a])
4451

4552
###step 4c concatenate all files
53+
outfiles = [a for a in os.listdir("./") if ".0" in a]
54+
final_file = []
55+
for of in outfiles:
56+
final_file.append(pd.read_csv(of,sep='\t'))
4657

47-
os.system('cat *.0 > /tmp/broad_sanger_experiments.tsv')
58+
pd.concat(final_file).to_csv('/tmp/broad_sanger_experiments.tsv',index=False,sep='\t')
59+
#os.system('cat *.0 > /tmp/broad_sanger_experiments.tsv')
4860
#os.system('gzip -f /tmp/experiments.tsv')
4961

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
'''
2+
gets nci60 data from 10/2023 release
3+
4+
'''
5+
6+
import polars as pl
7+
import argparse
8+
#from zipfile import ZipFile
9+
import os
10+
#from io import BytesIO
11+
import re
12+
from urllib import request
13+
14+
conc_data = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP.zip?version=11&modificationDate=1712351454136&api=v2'
15+
cancelled = 'https://wiki.nci.nih.gov/download/attachments/147193864/DOSERESP_Cancelled.csv?version=1&modificationDate=1660871847000&api=v2&download=true'
16+
17+
def main():
18+
19+
parser = argparse.ArgumentParser()
20+
parser.add_argument('--sampleFile',dest='samplefile',default=None,help='DepMap sample file')
21+
parser.add_argument('--drugFile',dest='dfile',default=None,help='Drug database')
22+
23+
opts = parser.parse_args()
24+
25+
samplefile = opts.samplefile
26+
drugfile = opts.dfile
27+
if not os.path.exists('DOSERESP.csv'):
28+
resp = request.urlretrieve(conc_data,'doseresp.zip')
29+
os.system('unzip doseresp.zip')
30+
31+
samples = pl.read_csv(samplefile,quote_char='"')
32+
drugs = pl.read_csv(drugfile,separator='\t',quote_char='"')
33+
34+
dose_resp = pl.read_csv("DOSERESP.csv",quote_char='"',infer_schema_length=10000000)
35+
36+
##update drug mapping
37+
drugmapping = pl.DataFrame(
38+
{
39+
'chem_name' : ['nsc-'+str(nsc) for nsc in set(dose_resp['NSC'])],
40+
'NSC' : [a for a in set(dose_resp['NSC'])]
41+
}
42+
)
43+
44+
drugmapping = drugmapping.join(drugs,on='chem_name')[['NSC','improve_drug_id']]
45+
drugmapping = drugmapping.unique()
46+
47+
###update sample mapping
48+
on = samples[['other_names','improve_sample_id']]
49+
on.columns=['common_name','improve_sample_id']
50+
51+
sampmapping = pl.concat([on[['common_name','improve_sample_id']],samples[['common_name','improve_sample_id']]])
52+
53+
sampmapping = sampmapping.unique()
54+
sampmapping.columns = ['CELL_NAME','improve_sample_id']
55+
56+
###create a time mapping table
57+
timemapping = pl.DataFrame(
58+
{
59+
'EXPID':dose_resp['EXPID'],
60+
'time':[72 if int(a[0:2])>22 and int(a[0:2])<50 and int(a[2:4])>0 else 48 for a in dose_resp['EXPID']],
61+
'time_unit':['hours' for a in dose_resp['EXPID']]
62+
}
63+
).unique()
64+
65+
66+
##now we can merge all the data into the dose response data frame
67+
merged = dose_resp[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC']].join(sampmapping,on='CELL_NAME',how='left')
68+
merged = merged.join(timemapping,on='EXPID',how='left')
69+
70+
##clean up missing samples
71+
nonulls = merged.filter(pl.col('improve_sample_id').is_not_null())
72+
73+
nulls = merged.filter(pl.col('improve_sample_id').is_null())
74+
75+
newnames = pl.DataFrame(
76+
{
77+
'new_name':[re.split(' |\(|\/',a)[0] for a in nulls['CELL_NAME']],
78+
'CELL_NAME':nulls['CELL_NAME']
79+
}
80+
)
81+
newnames = newnames.unique()
82+
83+
fixed = nulls[['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','time','time_unit']].join(newnames,on='CELL_NAME',how='left')
84+
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','time','time_unit','CELL_NAME']
85+
fixed = fixed.join(sampmapping,on='CELL_NAME',how='left')[['AVERAGE_PTC','CONCENTRATION','old_CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']]
86+
fixed.columns = ['AVERAGE_PTC','CONCENTRATION','CELL_NAME','EXPID','NSC','improve_sample_id','time','time_unit']
87+
fixed = fixed.filter(pl.col('improve_sample_id').is_not_null())
88+
89+
merged = pl.concat([nonulls,fixed])
90+
91+
###we get a few more results added, but still missing a bunch
92+
merged = merged.join(drugmapping,on='NSC',how='left')
93+
nulldrugs = merged.filter(pl.col('improve_drug_id').is_null())
94+
nonulls = merged.filter(pl.col('improve_drug_id').is_not_null())
95+
finaldf = pl.DataFrame(
96+
{
97+
'source':['NCI60_24' for a in nonulls['improve_drug_id']], ##2024 build
98+
'improve_sample_id':nonulls['improve_sample_id'],
99+
'Drug':nonulls['improve_drug_id'],
100+
'study':['NCI60' for a in nonulls['improve_drug_id']],
101+
'time':nonulls['time'],
102+
'time_unit':nonulls['time_unit'],
103+
'DOSE': [10**a for a in nonulls['CONCENTRATION']],
104+
'GROWTH':nonulls['AVERAGE_PTC']
105+
}
106+
)
107+
##write to file
108+
finaldf.write_csv('nci60DoseResponse',separator='\t')
109+
110+
111+
if __name__=='__main__':
112+
main()

0 commit comments

Comments
 (0)