Merge branch 'main' into curve_fit_multithread

jjacobson95 · jjacobson95 · commit 87ec92e77e0e · 2024-04-26T16:39:28.000-07:00
diff --git a/.dockerignore b/.dockerignore
@@ -4,4 +4,5 @@ coderdata/
 dataSummary/
 docs/
 candle_bmd/
-schema/
+schema/
+build/local/
diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
@@ -7,7 +7,7 @@
 import numpy as np
 import subprocess
 import argparse
-
+import time
 
 def download_from_github(raw_url, save_path):
     """
@@ -159,11 +159,14 @@ def retrieve_drug_info(compound_name):
     """
     if pd.isna(compound_name):
         return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
+
+    ## PubChem rate limit is 5 requests per second; callers sleep 0.2 s between calls.
     
     url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON"
     response = requests.get(url)
 
     if response.status_code != 200:
+        print(response.text)
         return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
     
     data = response.json()
@@ -206,16 +209,20 @@ def update_dataframe_with_pubchem(d_df):
     for name in chem_names:
         print("Attempting to call pubchem API for chem_name: ", name)
         chem_data_dict[name] = retrieve_drug_info(name)
+        time.sleep(0.2)
     failed_chem_names = {k for k, v in chem_data_dict.items() if all(pd.isna(val) for val in v)}
     other_names = d_df[d_df['chem_name'].isin(failed_chem_names)]['other_name'].dropna().unique()
     other_data_dict = {}
     for name in other_names:
         print("Attempting to call pubchem API for other_name: ", name)
         other_data_dict[name] = retrieve_drug_info(name)
+        time.sleep(0.2)
 
     # Combine both dictionaries for easy lookup
     data_dict = {**chem_data_dict, **other_data_dict}
 
+    #print(data_dict)
+#    print(data_dict['isoSMILES'])
     # Update the DataFrame using the data dictionary
     for idx, row in d_df.iterrows():
         if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]):
@@ -248,6 +255,9 @@ def merge_drug_info(d_df,drug_map):
     pd.DataFrame
         The merged dataframe containing combined drug information.
     """
+    #print(drug_map)
+    #print(d_df.columns)
+    #print(d_df)
     result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
     return result_df
 
@@ -292,7 +302,7 @@ def format_drug_df(drug_path):
     """
     d_df = pd.read_csv(drug_path, index_col=None,sep="\t")
     d_df[['chem_name', 'other_name']] = d_df['inhibitor'].str.extract(r'^(.*?)\s*(?:\((.+)\))?$')
-    d_df["chem_name"] = d_df["chem_name"].str.replace('\s-\s', ':')
+    d_df["chem_name"] = d_df["chem_name"].str.replace('\s-\s', ':',regex=True)
     d_df['chem_name'] = [a.lower() for a in d_df['chem_name']]
     return d_df
 
diff --git a/build/beatAML/requirements.txt b/build/beatAML/requirements.txt
@@ -0,0 +1,6 @@
+pandas
+wget==3.2
+requests
+synapseclient
+argparse
+numpy
diff --git a/build/broad_sanger/02-broadSangerOmics.R b/build/broad_sanger/02-broadSangerOmics.R
@@ -31,7 +31,7 @@ variant_schema =list(`3'UTR`=c("3'UTR",'THREE_PRIME_UTR','3prime_UTR_variant','3
                      IGR=c('IGR','nc_variant'),
                      In_Frame_Del=c('IN_FRAME_DEL','In_Frame_Del','inframe'),
                      In_Frame_Ins=c('IN_FRAME_INS','In_Frame_Ins'),
-                     Intron=c('INTRON','Intron','intronic'),
+                     Intron=c('INTRON','Intron','intronic','intron'),
                      Missense_Mutation=c('Missense_Mutation','MISSENSE','missense'),
                      Nonsense_Mutation=c('Nonsense_Mutation','NONSENSE','nonsense'),
                      Nonstop_Mutation=c('Nonstop_Mutation','NONSTOP'),
@@ -160,8 +160,17 @@ sanger_files<-function(fi,value){
         left_join(smap)|>
           mutate(study='Sanger')|>
           dplyr::select(-c(other_id,gene_symbol))|>
-          left_join(as.data.frame(sanger_vtab))|>
-          dplyr::select(-effect)|>
+          left_join(as.data.frame(sanger_vtab))
+
+      ## How many variants have no variant_classification after the join?
+      missing<-res|>
+          select(effect,variant_classification)|>
+          distinct()|>
+          subset(is.na(variant_classification))
+      print(missing)
+
+### TODO: double-check whether any variants are missing
+      res<-res|>dplyr::select(-effect)|>
           subset(!is.na(improve_sample_id))|>
           distinct()
 
@@ -387,7 +396,16 @@ depmap_files<-function(fi,value){
 
         res<-exp_file|>
           mutate(entrez_id=as.numeric(EntrezGeneID))|>
-            left_join(as.data.frame(depmap_vtab))|>
+            left_join(as.data.frame(depmap_vtab))
+
+              ## How many variants have no variant_classification after the join?
+        missing<-res|>
+            select(VariantInfo,variant_classification)|>
+            distinct()|>
+            subset(is.na(variant_classification))
+        print(missing)
+
+        res<-res|>
             dplyr::select(-c(EntrezGeneID,VariantInfo))|>
             distinct()|>
           subset(!is.na(entrez_id)) ##removes thos with unknonw entrez
@@ -538,13 +556,12 @@ main<-function(){
 
     lapply(alltypes,function(dt){
         print(dt)
-        temps<-sanger_files(sanger_filenames[[dt]],dt)
-        tempd<-depmap_files(depmap_filenames[[dt]],dt)
+        temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
+        tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
         readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
         rm(tempd)
         rm(temps)
     })
-    system(paste0('/opt/venv/bin/python 02a-broad_sanger_proteomics.py --gene ',gfile,' --sample ',sfile))
 
 }
 
diff --git a/build/broad_sanger/build_omics.sh b/build/broad_sanger/build_omics.sh
@@ -1,2 +1,3 @@
+/opt/venv/bin/python 02a-broad_sanger_proteomics.py --gene $1 --sample $2
 Rscript 02-broadSangerOmics.R $1 $2
 #python 02a-broad/sanger_proteomics.py $1 $2
diff --git a/build/build_all.py b/build/build_all.py
@@ -115,7 +115,7 @@ def main():
                 df='broad_sanger'
             else:
                 df = di
-            if not os.path.exists('/local/'+di+'_experiments.tsv'):
+            if not os.path.exists('local/'+di+'_experiments.tsv'):
                 run_cmd([di,'sh','build_exp.sh','/tmp/'+df+'_samples.csv','/tmp/'+df+'_drugs.tsv'],di+' experiments')
     
 
diff --git a/build/docker/Dockerfile.beataml b/build/docker/Dockerfile.beataml
@@ -6,8 +6,8 @@ WORKDIR /usr/src/app
 COPY build/beatAML/GetBeatAML.py . 
 COPY build/utils/fit_curve.py .
 COPY build/beatAML/*sh ./
+COPY build/beatAML/requirements.txt .
 
-COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 VOLUME ['/tmp']
 # CMD python GetBeatAML.py --token ${SYNAPSE_TOKEN}

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
	`1`	`+/opt/venv/bin/python 02a-broad_sanger_proteomics.py --gene $1 --sample $2`
`1`	`2`	`Rscript 02-broadSangerOmics.R $1 $2`
`2`	`3`	`#python 02a-broad/sanger_proteomics.py $1 $2`