fixed dropNA issues in broad_sanger

sgosline · sgosline · commit c13d2d060b7a · 2024-04-26T09:43:39.000-07:00
Closes #135, closes #136, closes #137
diff --git a/build/beatAML/GetBeatAML.py b/build/beatAML/GetBeatAML.py
@@ -221,7 +221,7 @@ def update_dataframe_with_pubchem(d_df):
     # Combine both dictionaries for easy lookup
     data_dict = {**chem_data_dict, **other_data_dict}
 
-    print(data_dict)
+    #print(data_dict)
 #    print(data_dict['isoSMILES'])
     # Update the DataFrame using the data dictionary
     for idx, row in d_df.iterrows():
@@ -256,8 +256,8 @@ def merge_drug_info(d_df,drug_map):
         The merged dataframe containing combined drug information.
     """
     #print(drug_map)
-    print(d_df.columns)
-    print(d_df)
+    #print(d_df.columns)
+    #print(d_df)
     result_df = d_df.merge(drug_map[['isoSMILES', 'improve_drug_id']], on='isoSMILES', how='left')
     return result_df
 
diff --git a/build/broad_sanger/02-broadSangerOmics.R b/build/broad_sanger/02-broadSangerOmics.R
@@ -31,7 +31,7 @@ variant_schema =list(`3'UTR`=c("3'UTR",'THREE_PRIME_UTR','3prime_UTR_variant','3
                      IGR=c('IGR','nc_variant'),
                      In_Frame_Del=c('IN_FRAME_DEL','In_Frame_Del','inframe'),
                      In_Frame_Ins=c('IN_FRAME_INS','In_Frame_Ins'),
-                     Intron=c('INTRON','Intron','intronic'),
+                     Intron=c('INTRON','Intron','intronic','intron'),
                      Missense_Mutation=c('Missense_Mutation','MISSENSE','missense'),
                      Nonsense_Mutation=c('Nonsense_Mutation','NONSENSE','nonsense'),
                      Nonstop_Mutation=c('Nonstop_Mutation','NONSTOP'),
@@ -160,8 +160,17 @@ sanger_files<-function(fi,value){
         left_join(smap)|>
           mutate(study='Sanger')|>
           dplyr::select(-c(other_id,gene_symbol))|>
-          left_join(as.data.frame(sanger_vtab))|>
-          dplyr::select(-effect)|>
+          left_join(as.data.frame(sanger_vtab))
+
+      ## how many variants are missing? print the distinct effect values whose variant_classification is NA after the join
+      missing<-res|>
+          select(effect,variant_classification)|>
+          distinct()|>
+          subset(is.na(variant_classification))
+      print(missing)
+
+###TODO double check to see if any variants are missing
+      res<-res|>dplyr::select(-effect)|>
           subset(!is.na(improve_sample_id))|>
           distinct()
 
@@ -387,7 +396,16 @@ depmap_files<-function(fi,value){
 
         res<-exp_file|>
           mutate(entrez_id=as.numeric(EntrezGeneID))|>
-            left_join(as.data.frame(depmap_vtab))|>
+            left_join(as.data.frame(depmap_vtab))
+
+              ## how many variants are missing? print the distinct VariantInfo values whose variant_classification is NA after the join
+        missing<-res|>
+            select(VariantInfo,variant_classification)|>
+            distinct()|>
+            subset(is.na(variant_classification))
+        print(missing)
+
+        res<-res|>
             dplyr::select(-c(EntrezGeneID,VariantInfo))|>
             distinct()|>
           subset(!is.na(entrez_id)) ##removes thos with unknonw entrez
@@ -538,8 +556,8 @@ main<-function(){
 
     lapply(alltypes,function(dt){
         print(dt)
-        temps<-sanger_files(sanger_filenames[[dt]],dt)
-        tempd<-depmap_files(depmap_filenames[[dt]],dt)
+        temps<-sanger_files(sanger_filenames[[dt]],dt)|>tidyr::drop_na()
+        tempd<-depmap_files(depmap_filenames[[dt]],dt)|>tidyr::drop_na()
         readr::write_csv(rbind(tempd,temps),file=paste0('/tmp/broad_sanger_',dt,'.csv.gz'))
         rm(tempd)
         rm(temps)