Skip to content

Commit 87ec92e

Browse files
committed
Merge branch 'main' into curve_fit_multithread
2 parents eb9c314 + 06bf381 commit 87ec92e

File tree

7 files changed

+47
-12
lines changed

7 files changed

+47
-12
lines changed

.dockerignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@ coderdata/
44
dataSummary/
55
docs/
66
candle_bmd/
7-
schema/
7+
schema/
8+
build/local/

build/beatAML/GetBeatAML.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy as np
88
import subprocess
99
import argparse
10-
10+
import time
1111

1212
def download_from_github(raw_url, save_path):
1313
"""
@@ -159,11 +159,14 @@ def retrieve_drug_info(compound_name):
159159
"""
160160
if pd.isna(compound_name):
161161
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
162+
163+
##limit is 5 calls per second. add in wait call.
162164

163165
url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{compound_name}/property/CanonicalSMILES,IsomericSMILES,InChIKey,MolecularFormula,MolecularWeight/JSON"
164166
response = requests.get(url)
165167

166168
if response.status_code != 200:
169+
print(response.text)
167170
return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
168171

169172
data = response.json()
@@ -206,16 +209,20 @@ def update_dataframe_with_pubchem(d_df):
206209
for name in chem_names:
207210
print("Attempting to call pubchem API for chem_name: ", name)
208211
chem_data_dict[name] = retrieve_drug_info(name)
212+
time.sleep(0.2)
209213
failed_chem_names = {k for k, v in chem_data_dict.items() if all(pd.isna(val) for val in v)}
210214
other_names = d_df[d_df['chem_name'].isin(failed_chem_names)]['other_name'].dropna().unique()
211215
other_data_dict = {}
212216
for name in other_names:
213217
print("Attempting to call pubchem API for other_name: ", name)
214218
other_data_dict[name] = retrieve_drug_info(name)
219+
time.sleep(0.2)
215220

216221
# Combine both dictionaries for easy lookup
217222
data_dict = {**chem_data_dict, **other_data_dict}
218223

224+
#print(data_dict)
225+
# print(data_dict['isoSMILES'])
219226
# Update the DataFrame using the data dictionary
220227
for idx, row in d_df.iterrows():
221228
if row['chem_name'] in data_dict and not all(pd.isna(val) for val in data_dict[row['chem_name']]):
@@ -248,6 +255,9 @@ def merge_drug_info(d_df,drug_map):
248255
pd.DataFrame
249256
The merged dataframe containing combined drug information.
250257
"""
258+
#print(drug_map)
259+
#print(d_df.columns)
260+
#print(d_df)
251261
result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
252262
return result_df
253263

@@ -292,7 +302,7 @@ def format_drug_df(drug_path):
292302
"""
293303
d_df = pd.read_csv(drug_path, index_col=None,sep="\t")
294304
d_df[['chem_name', 'other_name']] = d_df['inhibitor'].str.extract(r'^(.*?)\s*(?:\((.+)\))?$')
295-
d_df["chem_name"] = d_df["chem_name"].str.replace('\s-\s', ':')
305+
d_df["chem_name"] = d_df["chem_name"].str.replace('\s-\s', ':',regex=True)
296306
d_df['chem_name'] = [a.lower() for a in d_df['chem_name']]
297307
return d_df
298308

build/beatAML/requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pandas
2+
wget==3.2
3+
requests
4+
synapseclient
5+
argparse
6+
numpy

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ variant_schema =list(`3'UTR`=c("3'UTR",'THREE_PRIME_UTR','3prime_UTR_variant','3
3131
IGR=c('IGR','nc_variant'),
3232
In_Frame_Del=c('IN_FRAME_DEL','In_Frame_Del','inframe'),
3333
In_Frame_Ins=c('IN_FRAME_INS','In_Frame_Ins'),
34-
Intron=c('INTRON','Intron','intronic'),
34+
Intron=c('INTRON','Intron','intronic','intron'),
3535
Missense_Mutation=c('Missense_Mutation','MISSENSE','missense'),
3636
Nonsense_Mutation=c('Nonsense_Mutation','NONSENSE','nonsense'),
3737
Nonstop_Mutation=c('Nonstop_Mutation','NONSTOP'),
@@ -160,8 +160,17 @@ sanger_files<-function(fi,value){
160160
left_join(smap)|>
161161
mutate(study='Sanger')|>
162162
dplyr::select(-c(other_id,gene_symbol))|>
163-
left_join(as.data.frame(sanger_vtab))|>
164-
dplyr::select(-effect)|>
163+
left_join(as.data.frame(sanger_vtab))
164+
165+
##now many variants are missing???
166+
missing<-res|>
167+
select(effect,variant_classification)|>
168+
distinct()|>
169+
subset(is.na(variant_classification))
170+
print(missing)
171+
172+
###TODO double check to see if any variants are missing
173+
res<-res|>dplyr::select(-effect)|>
165174
subset(!is.na(improve_sample_id))|>
166175
distinct()
167176

@@ -387,7 +396,16 @@ depmap_files<-function(fi,value){
387396

388397
res<-exp_file|>
389398
mutate(entrez_id=as.numeric(EntrezGeneID))|>
390-
left_join(as.data.frame(depmap_vtab))|>
399+
left_join(as.data.frame(depmap_vtab))
400+
401+
##now many variants are missing???
402+
missing<-res|>
403+
select(VariantInfo,variant_classification)|>
404+
distinct()|>
405+
subset(is.na(variant_classification))
406+
print(missing)
407+
408+
res<-res|>
391409
dplyr::select(-c(EntrezGeneID,VariantInfo))|>
392410
distinct()|>
393411
subset(!is.na(entrez_id)) ##removes those with unknown entrez
@@ -538,13 +556,12 @@ main<-function(){
538556

539557
lapply(alltypes,function(dt){
540558
print(dt)
541-
temps<-sanger_files(sanger_filenames[[dt]],dt)
542-
tempd<-depmap_files(depmap_filenames[[dt]],dt)
559+
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
560+
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
543561
readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
544562
rm(tempd)
545563
rm(temps)
546564
})
547-
system(paste0('/opt/venv/bin/python 02a-broad_sanger_proteomics.py --gene ',gfile,' --sample ',sfile))
548565

549566
}
550567

build/broad_sanger/build_omics.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1+
/opt/venv/bin/python 02a-broad_sanger_proteomics.py --gene $1 --sample $2
12
Rscript 02-broadSangerOmics.R $1 $2
23
#python 02a-broad/sanger_proteomics.py $1 $2

build/build_all.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def main():
115115
df='broad_sanger'
116116
else:
117117
df = di
118-
if not os.path.exists('/local/'+di+'_experiments.tsv'):
118+
if not os.path.exists('local/'+di+'_experiments.tsv'):
119119
run_cmd([di,'sh','build_exp.sh','/tmp/'+df+'_samples.csv','/tmp/'+df+'_drugs.tsv'],di+' experiments')
120120

121121

build/docker/Dockerfile.beataml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ WORKDIR /usr/src/app
66
COPY build/beatAML/GetBeatAML.py .
77
COPY build/utils/fit_curve.py .
88
COPY build/beatAML/*sh ./
9+
COPY build/beatAML/requirements.txt .
910

10-
COPY requirements.txt .
1111
RUN pip install --no-cache-dir -r requirements.txt
1212
VOLUME ['/tmp']
1313
# CMD python GetBeatAML.py --token ${SYNAPSE_TOKEN}

0 commit comments

Comments
 (0)