Reduce multiprocessing load in curve fitting to a quarter of the CPU count, and update MPNST copy-number processing

sgosline · sgosline · commit be3ccc758cb2 · 2024-05-08T11:02:31.000-07:00
diff --git a/build/broad_sanger/04-drug_dosage_and_curves.py b/build/broad_sanger/04-drug_dosage_and_curves.py
@@ -24,7 +24,7 @@
 samplefile = opts.samplefile
 drugfile = opts.dfile
 
-cmd = 'python 04b-nci60-updated.py --sampleFile '+samplefile+' --drugFile '+drugfile
+cmd = '/opt/venv/bin/python 04b-nci60-updated.py --sampleFile '+samplefile+' --drugFile '+drugfile
 print(cmd)
 os.system(cmd)
 
diff --git a/build/mpnst/01_mpnst_get_omics.R b/build/mpnst/01_mpnst_get_omics.R
@@ -149,26 +149,44 @@ cnv<-do.call(rbind,lapply(setdiff(combined$CopyNumber,NA),function(x){
     sample<-subset(combined,CopyNumber==x)
     print(sample$improve_sample_id)
     res<-fread(synGet(x2)$path)
-    long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)]
-    filtered_df <- long_df |>
-        subset(is.finite(log2))|>
-        filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols
-    filtered_df <- filtered_df[, .(gene_symbol = V1,
-                           improve_sample_id = sample$improve_sample_id[1],
-                           copy_number = 2^log2,
-                           source = "NF Data Portal",
-                           study = "MPNST PDX MT")]
-    res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
-        dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
-                                       ifelse(copy_number<0.7311832,'het loss',
-                                              ifelse(copy_number<1.214125,'diploid',
-                                              ifelse(copy_number<1.422233,'gain','amp')))))|>
-        left_join(genes_df)|>
-        dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|>
-        subset(!is.na(entrez_id))|>
-        distinct()
-    res|>group_by(copy_call)|>summarize(n_distinct(entrez_id))
-    return(distinct(res))
+
+    long_df<- res|>
+      tidyr::separate_rows(gene,sep=',')|>
+      dplyr::rename(gene_symbol='gene')|>
+      dplyr::left_join(genes_df)|>
+      subset(!is.na(entrez_id))|>
+      dplyr::select(entrez_id,log2)|>
+      dplyr::distinct()|>
+      dplyr::mutate(copy_number=2^log2)
+
+  res<-long_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
+      dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
+                                     ifelse(copy_number<0.7311832,'het loss',
+                                            ifelse(copy_number<1.214125,'diploid',
+                                                   ifelse(copy_number<1.422233,'gain','amp')))))|>
+    mutate(study='MPNST PDX MT',source='NF Data Portal',improve_sample_id=sample$improve_sample_id[1])|>
+    dplyr::distinct()
+
+    # long_df <- res[, strsplit(as.character(gene), ","), by = .(chromosome, start, end, depth, log2)]
+    # filtered_df <- long_df |>
+    #     subset(is.finite(log2))|>
+    #     filter(V1 %in% genes_df$gene) # get only protein coding genes and remove empty gene symbols
+    # filtered_df <- filtered_df[, .(gene_symbol = V1,
+    #                        improve_sample_id = sample$improve_sample_id[1],
+    #                        copy_number = 2^log2,
+    #                        source = "NF Data Portal",
+    #                        study = "MPNST PDX MT")]
+    # res<-filtered_df|> ##deep del < 0.5210507 < het loss < 0.7311832 < diploid < 1.214125 < gain < 1.422233 < amp
+    #     dplyr::mutate(copy_call=ifelse(copy_number<0.5210507,'deep del',
+    #                                    ifelse(copy_number<0.7311832,'het loss',
+    #                                           ifelse(copy_number<1.214125,'diploid',
+    #                                           ifelse(copy_number<1.422233,'gain','amp')))))|>
+    #     left_join(genes_df)|>
+    #     dplyr::select(entrez_id,improve_sample_id,copy_number,copy_call,study,source)|>
+    #     subset(!is.na(entrez_id))|>
+    #     distinct()
+    # res|>group_by(copy_call)|>summarize(n_distinct(entrez_id))
+    return(res)
                                         # }
 }))
 
diff --git a/build/utils/fit_curve.py b/build/utils/fit_curve.py
@@ -163,7 +163,7 @@ def process_df_part(df, fname, beataml=False, sep='\t', start=0, count=None):
     count = count or (4484081 - start)
     groups = islice(groups, start, start+count)
     
-    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
+    with multiprocessing.Pool(processes=multiprocessing.cpu_count()/4) as pool:
         results = pool.map(process_single_drug, groups)
 
     with open(f'{fname}.{start}', 'w') as f: