updated with more testing updates

sgosline · sgosline · commit 8abb5010f9f2 · 2024-05-07T06:00:34.000-07:00
how did other things break?
diff --git a/build/beatAML/requirements.txt b/build/beatAML/requirements.txt
@@ -4,3 +4,4 @@ requests
 synapseclient
 argparse
 numpy
+openpyxl
diff --git a/build/broad_sanger/01-broadSangerSamples.R b/build/broad_sanger/01-broadSangerSamples.R
@@ -8,6 +8,7 @@
 library(curl)
 library(dplyr)
 library(readr)
+library(XML)
 
 ##the only thing that Priyanka has here is TRP identifiers, so collecting those
 # tab<-read.table('DepMap_Argonne_Mapping.csv',sep=',',header=T)%>%
@@ -31,34 +32,37 @@ print(paste("Downloaded",nrow(depmap_models),'dep map identifiers and',nrow(sang
 
 ##query for cellosaurus automagically to get loadest version
 url='https://ftp.expasy.org/databases/cellosaurus/cellosaurus.xml'
-if(!file.exists('cell.xml'))
-  curl_download(url,'cell.xml',quiet=TRUE)#curl(url, "r", h)
-cello<-XML::xmlParse('cell.xml')
+if(!file.exists('/tmp/cell.xml'))
+  curl_download(url,'/tmp/cell.xml',quiet=TRUE)#curl(url, "r", h)
+cello<-XML::xmlParse('/tmp/cell.xml')
 cdf<-XML::xmlToList(cello)
 
-
+print('Got all cellosaurus ids')
 ### next we parse through cellosaurus to get as many samples as we deem relevant
 ##ok, this command seems to have gotten file in appropriate state
 cell.lines<-lapply(cdf$`cell-line-list`, function(x) unlist(x))
 
 ##now we need toe xtract columns
 options(show.error.messages=TRUE)
 full.res<-do.call(rbind,lapply(cell.lines,function(x){
-  ##create a data frame for each cell lines
-  x<-unlist(x)
-  #should only be one acession
-  acc<-x[grep('accession.text',names(x),fixed=T)]%>%unlist()
-
-  cn<-x[grep('name.text',names(x),fixed=T)]%>%unlist()
-  #these will fail if no key found
-  spec<-x[grep("species-list.cv-term.text",names(x),fixed=T)]%>%unlist()
-  #dis<-x[grep("disease-list.cv-term.text",names(x),fixed=T)]%>%unlist()
-  data.frame(accession=cn,
-             RRID=rep(acc,length(cn)),
-             species=rep(spec,length(cn)))
-   #          disease=rep(dis,length(cn)))
+    ##create a data frame for each cell lines
+    x<-unlist(x)
+                                        #should only be one acession
+    acc<-x[grep('accession.text',names(x),fixed=T)]%>%unlist()
+
+    cn<-x[grep('name-list.name.text',names(x),fixed=T)]%>%unlist()
+                                        #these will fail if no key found
+    spec<-x[grep("species-list.xref.label",names(x),fixed=T)]%>%unlist()
+    #print(acc)
+    #print(cn)
+    #print(spec)
+                                        #dis<-x[grep("disease-list.cv-term.text",names(x),fixed=T)]%>%unlist()
+    data.frame(accession=cn,
+               RRID=rep(acc,length(cn)),
+               species=rep(spec,length(cn)))
+                                        #          disease=rep(dis,length(cn)))
 }))%>%
-  subset(species=='Homo sapiens (Human)')
+    subset(species=='Homo sapiens (Human)')
 
 
 print(paste('Got',nrow(full.res),'human cellosaurus samples'))
diff --git a/build/broad_sanger/03a-nci60Drugs.py b/build/broad_sanger/03a-nci60Drugs.py
@@ -51,10 +51,10 @@ def main():
         arr = set(pubchems['CID'])
         
     print("Querying pubchem from CIDs")
-    pr.update_dataframe_and_write_tsv(arr,batch_size=400,isname=False)
+    pr.update_dataframe_and_write_tsv(arr,opts.output,'/tmp/ignore_chems.txt',batch_size=400,isname=False)
     
     ##then make sure to paste `nsc` in front of all nsc idds
-    res = pl.read_csv('drugs.tsv',separator='\t')
+    res = pl.read_csv(opts.output,separator='\t')
 
 
     nsc = list(pubchems.filter(pl.col('CID').is_in(list(res['pubchem_id'])))['NSC'])
diff --git a/build/broad_sanger/build_drugs.sh b/build/broad_sanger/build_drugs.sh
@@ -1,2 +1,2 @@
-/opt/env/bin/python 03a-nci60Drugs.py 
+/opt/venv/bin/python 03a-nci60Drugs.py 
 Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM
diff --git a/build/broad_sanger/requirements.txt b/build/broad_sanger/requirements.txt
@@ -7,3 +7,4 @@ scikit-learn
 scipy
 requests
 openpyxl
+polars
diff --git a/build/build_all.py b/build/build_all.py
@@ -36,7 +36,7 @@ def main():
     parser.add_argument('--drugs',dest='drugs',default=False,action='store_true')
     parser.add_argument('--exp',dest='exp',default=False,action='store_true')
     parser.add_argument('--all',dest='all',default=False,action='store_true')
-    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,cptac,hcmi,beataml,mpnst',help='Datasets to process. Defaults to all available, but if there are synapse issues, please remove beataml and mpnst')
+    parser.add_argument('--dataset',dest='datasets',default='broad_sanger,hcmi,beataml,mpnst,cptac',help='Datasets to process. Defaults to all available, but if there are synapse issues, please remove beataml and mpnst')
 
     args = parser.parse_args()
                     
@@ -72,7 +72,8 @@ def main():
     ## can be run independently but first before omics/experiemnts
     if args.samples or args.all:
         ### build gene file
-        run_cmd(['genes','sh','build_genes.sh'],'gene file')
+        if not os.path.exists('/tmp/genes.csv'):
+            run_cmd(['genes','sh','build_genes.sh'],'gene file')
         
         ###build sample files
         sf=''
@@ -101,7 +102,7 @@ def main():
                 
             if not os.path.exists('local/'+da+'_drugs.tsv'):
                 run_cmd([di,'sh','build_drugs.sh',df],da+' drugs')
-            df = '/tmp/'+di+'_drugs.tsv'
+            df = '/tmp/'+da+'_drugs.tsv'
 
     #### Any new omics files are created here.
     ## depends on samples!
diff --git a/build/docker/Dockerfile.broad_sanger_exp b/build/docker/Dockerfile.broad_sanger_exp
@@ -13,7 +13,7 @@ WORKDIR /app
 
 ADD build/broad_sanger/03-createDrugFile.R ./
 ADD build/broad_sanger/04a-drugResponseData.R ./
-ADD build/broad_sanger/04-drug_dosage_and_curves.py ./
+ADD build/broad_sanger/*py ./
 ADD build/broad_sanger/build_drugs.sh ./
 ADD build/broad_sanger/build_exp.sh ./
 ADD build/utils/* ./
diff --git a/build/utils/pubchem_retrieval.py b/build/utils/pubchem_retrieval.py
@@ -150,7 +150,7 @@ def timeout_handler(signum, frame):
     should_continue = False
 
 # Call this function from other scripts. 
-def update_dataframe_and_write_tsv(unique_names, output_filename="drugs.tsv",ignore_chems="ignore_chems.txt", batch_size=1,isname=False):
+def update_dataframe_and_write_tsv(unique_names, output_filename="drugs.tsv",ignore_chems="ignore_chems.txt", batch_size=1,isname=True):
     global should_continue, existing_synonyms, existing_pubchemids
     time_limit=5*60*60 # 5 hours
     signal.signal(signal.SIGALRM, timeout_handler)

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-/opt/env/bin/python 03a-nci60Drugs.py`
	`1`	`+/opt/venv/bin/python 03a-nci60Drugs.py`
`2`	`2`	`Rscript 03-createDrugFile.R CTRPv2,GDSC,gCSI,PRISM,CCLE,FIMM`