Skip to content

Commit c13d2d0

Browse files
committed
fixed dropNA issues in broad_sanger
Closes #135, closes #136, closes #137
1 parent 1379139 commit c13d2d0

File tree

2 files changed

+27
-9
lines changed

2 files changed

+27
-9
lines changed

build/beatAML/GetBeatAML.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ def update_dataframe_with_pubchem(d_df):
221221
# Combine both dictionaries for easy lookup
222222
data_dict = {**chem_data_dict, **other_data_dict}
223223

224-
print(data_dict)
224+
#print(data_dict)
225225
# print(data_dict['isoSMILES'])
226226
# Update the DataFrame using the data dictionary
227227
for idx, row in d_df.iterrows():
@@ -256,8 +256,8 @@ def merge_drug_info(d_df,drug_map):
256256
The merged dataframe containing combined drug information.
257257
"""
258258
#print(drug_map)
259-
print(d_df.columns)
260-
print(d_df)
259+
#print(d_df.columns)
260+
#print(d_df)
261261
result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
262262
return result_df
263263

build/broad_sanger/02-broadSangerOmics.R

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ variant_schema =list(`3'UTR`=c("3'UTR",'THREE_PRIME_UTR','3prime_UTR_variant','3
3131
IGR=c('IGR','nc_variant'),
3232
In_Frame_Del=c('IN_FRAME_DEL','In_Frame_Del','inframe'),
3333
In_Frame_Ins=c('IN_FRAME_INS','In_Frame_Ins'),
34-
Intron=c('INTRON','Intron','intronic'),
34+
Intron=c('INTRON','Intron','intronic','intron'),
3535
Missense_Mutation=c('Missense_Mutation','MISSENSE','missense'),
3636
Nonsense_Mutation=c('Nonsense_Mutation','NONSENSE','nonsense'),
3737
Nonstop_Mutation=c('Nonstop_Mutation','NONSTOP'),
@@ -160,8 +160,17 @@ sanger_files<-function(fi,value){
160160
left_join(smap)|>
161161
mutate(study='Sanger')|>
162162
dplyr::select(-c(other_id,gene_symbol))|>
163-
left_join(as.data.frame(sanger_vtab))|>
164-
dplyr::select(-effect)|>
163+
left_join(as.data.frame(sanger_vtab))
164+
165+
##now many variants are missing???
166+
missing<-res|>
167+
select(effect,variant_classification)|>
168+
distinct()|>
169+
subset(is.na(variant_classification))
170+
print(missing)
171+
172+
###TODO double check to see if any variants are missing
173+
res<-res|>dplyr::select(-effect)|>
165174
subset(!is.na(improve_sample_id))|>
166175
distinct()
167176

@@ -387,7 +396,16 @@ depmap_files<-function(fi,value){
387396

388397
res<-exp_file|>
389398
mutate(entrez_id=as.numeric(EntrezGeneID))|>
390-
left_join(as.data.frame(depmap_vtab))|>
399+
left_join(as.data.frame(depmap_vtab))
400+
401+
##now many variants are missing???
402+
missing<-res|>
403+
select(VariantInfo,variant_classification)|>
404+
distinct()|>
405+
subset(is.na(variant_classification))
406+
print(missing)
407+
408+
res<-res|>
391409
dplyr::select(-c(EntrezGeneID,VariantInfo))|>
392410
distinct()|>
393411
subset(!is.na(entrez_id)) ##removes those with unknown entrez
@@ -538,8 +556,8 @@ main<-function(){
538556

539557
lapply(alltypes,function(dt){
540558
print(dt)
541-
temps<-sanger_files(sanger_filenames[[dt]],dt)
542-
tempd<-depmap_files(depmap_filenames[[dt]],dt)
559+
temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
560+
tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
543561
readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
544562
rm(tempd)
545563
rm(temps)

0 commit comments

Comments
 (0)