PNNL-CompBio
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
Lines changed: 1377 additions & 1374 deletions
diff --git a/coderdata/download/downloader.py b/coderdata/download/downloader.py
Lines changed: 0 additions & 4 deletions
diff --git a/dataSummary/visualization01.R b/dataSummary/visualization01.R
Lines changed: 59 additions & 42 deletions
diff --git a/dataSummary/visualization02.R b/dataSummary/visualization02.R
Lines changed: 25 additions & 16 deletions
diff --git a/docs/_data/beataml_table.csv b/docs/_data/beataml_table.csv
Lines changed: 2 additions & 2 deletions
diff --git a/docs/_data/broad_sanger_table.csv b/docs/_data/broad_sanger_table.csv
Lines changed: 5 additions & 0 deletions
diff --git a/docs/_data/cptac_table.csv b/docs/_data/cptac_table.csv
Lines changed: 3 additions & 3 deletions
diff --git a/docs/_data/depmap_table.csv b/docs/_data/depmap_table.csv
Lines changed: 0 additions & 5 deletions
diff --git a/docs/_data/figshare_latest.yml b/docs/_data/figshare_latest.yml
Lines changed: 1 addition & 1 deletion
diff --git a/docs/_data/hcmi_table.csv b/docs/_data/hcmi_table.csv
Lines changed: 3 additions & 3 deletions
@@ -1,10 +1,6 @@
 # coderdata/download/downloader.py
 
 import requests
-import os
-import json
-import yaml
-
 
 def download_data_by_prefix(dataset_prefix=None):
     """
 
@@ -26,32 +26,32 @@ mergeSamples<-function(){
     distinct()
 
   ###########################
-  ## CCLE Cell Line data
+  ## Broad Sanger data
   ## We have many more cancer types here, so we try to map what we have to CPTAC names and then adjust the rest
   ##
   ###########################
-  depmap<-readr::read_csv('depmap_samples.csv')|>
+  broad_sanger<-readr::read_csv('broad_sanger_samples.csv')|>
     dplyr::mutate(`Cell line cancer type`=cancer_type)|>
     mutate(sampleSource='CCLE')
 
-  allec<-grep('Endometrial',depmap$`Cell line cancer type`)
-  depmap$`Cell line cancer type`[allec]<-'Uterine corpus endometrial carcinoma'
+  allec<-grep('Endometrial',broad_sanger$`Cell line cancer type`)
+  broad_sanger$`Cell line cancer type`[allec]<-'Uterine corpus endometrial carcinoma'
 
-  depmap<-depmap|>
+  broad_sanger<-broad_sanger|>
     left_join(cmaps)
 
   ##first we collect the names of the cancers that are NOT in CPTAC
-  other_cans<-which(is.na(depmap$`CPTAC Cancer type`))
-  depmap$`CPTAC Cancer type`[other_cans]<-depmap$`Cell line cancer type`[other_cans]
-  depmap<-depmap|>
+  other_cans<-which(is.na(broad_sanger$`CPTAC Cancer type`))
+  broad_sanger$`CPTAC Cancer type`[other_cans]<-broad_sanger$`Cell line cancer type`[other_cans]
+  broad_sanger<-broad_sanger|>
     dplyr::select(improve_sample_id,`CPTAC Cancer type`,model_type,species,sampleSource)|>
     distinct()
 
   #then we rename the NA values to 'Other' if we want
-  #other_cans<-which(is.na(depmap$`CPTAC Cancer type`))
-  #depmap$`CPTAC Cancer type`[other_cans]<-'Other'
+  #other_cans<-which(is.na(broad_sanger$`CPTAC Cancer type`))
+  #broad_sanger$`CPTAC Cancer type`[other_cans]<-'Other'
   # or just remove them
-  depmap<-depmap|>
+  broad_sanger<-broad_sanger|>
     subset(!is.na(`CPTAC Cancer type`))
 
   ###########################
@@ -61,10 +61,8 @@ mergeSamples<-function(){
   hcmi<-readr::read_csv('hcmi_samples.csv')|>
     dplyr::rename(id_source='other_id_source')|>
     mutate(species='human')|>
-    subset(model_type%in%c('3D Organoid','Solid Tissue','Adherent Cell Line'))|>
+    subset(model_type%in%c('organoid','tumor','cell line','Patient derived xenograft'))|>
     dplyr::mutate(`HCMI Cancer type`=cancer_type,`HCMI Common name`=common_name)|>
-    dplyr::mutate(model_type=stringr::str_replace_all(model_type,'Solid Tissue','Tumor'))|>
-    dplyr::mutate(model_type=stringr::str_replace_all(model_type,'Adherent Cell Line','cell line'))|>
     left_join(cmaps)|>
     mutate(sampleSource='HCMI')|>
     dplyr::select(improve_sample_id,`CPTAC Cancer type`,model_type,species,sampleSource)|>
@@ -80,18 +78,23 @@ mergeSamples<-function(){
   baml<-readr::read_csv("beataml_samples.csv")|>
     mutate(cancer_type='Acute myeloid leukemia')|>
     mutate(species='Human')|>
-    mutate(model_type='Tumor')|>
+    mutate(model_type='tumor')|>
     mutate(sampleSource='BeatAML')|>
     dplyr::select(improve_sample_id,species,cancer_type,sampleSource,model_type)|>
     distinct()
   ###########################
-  ## TCGA SAMPLE DATA
+  ## MPNST SAMPLE DATA
   ## Every sample is labeled Neurofibromatosis / Human, with sampleSource MPNST
   ###########################
-  
+  mpnst<-readr::read_csv("mpnst_samples.csv")|>
+    mutate(cancer_type='Neurofibromatosis')|>
+    mutate(species='Human')|>
+    mutate(sampleSource='MPNST')|>
+    dplyr::select(improve_sample_id,species,cancer_type,sampleSource,model_type)|>
+    distinct()
 
   ##now we join them into a single table, with cancer type
-  fulldat<<-rbind(cptac,depmap,hcmi)|>
+  fulldat<<-rbind(cptac,broad_sanger,hcmi)|>
     dplyr::rename(cancer_type=`CPTAC Cancer type`)|>
     subset()
 
@@ -100,6 +103,11 @@ mergeSamples<-function(){
     distinct()|>
     rbind(baml)
 
+  fulldat<-fulldat|>
+    dplyr::select(improve_sample_id,species,cancer_type,sampleSource,model_type)|>
+    distinct()|>
+    rbind(mpnst)
+  
   models<-fulldat|>
     group_by(cancer_type)|>
     summarize(num_models=n_distinct(model_type))|>
@@ -126,67 +134,76 @@ stats<-fulldat|>
   subset(model_type!='Not Reported')|>
   subset(numSamps>1)
 
-color_palette <- brewer.pal(n = 3, name = "Set2")
+color_palette <- brewer.pal(n = 4, name = "Set2")
 
 # Assign colors to the model types
-names(color_palette) <- c("Tumor", "cell line", "3D Organoid")
-background_color <- "#E0F2F1"
+names(color_palette) <- c("tumor", "cell line", "organoid",'Patient derived xenograft')
+
 fig0<-ggplot(stats,aes(x=cancer_type,y=numSamps,fill=model_type))+
   geom_bar(stat='identity',position='dodge')+
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
-    plot.background = element_rect(fill = background_color, color = background_color),
-    legend.background = element_rect(fill = background_color, color = background_color))+
+  plot.background = element_rect(fill = background_color, color = background_color),
+  legend.background = element_rect(fill = background_color, color = background_color))+
   scale_y_log10()+scale_fill_manual(values=color_palette)+
   ggtitle('Samples by tumor type')
 
 print(fig0)
-ggsave('Fig0_Overview.png',fig0,height=9,width=12,bg = background_color)
+ggsave('Fig0_Overview.png',fig0,height=8,width=10)
 
 # Subset data for each type
 data_type1 <- subset(stats, sampleSource == 'HCMI')
 data_type2 <- subset(stats, sampleSource == 'BeatAML')
 data_type3 <- subset(stats, sampleSource == 'CPTAC')
 data_type4 <- subset(stats, sampleSource == 'CCLE')
+data_type5 <- subset(stats, sampleSource == 'MPNST')
 
-# Create separate plots for each type
+# Create separate plots for each type, with colorblind-friendly colors
 background_color <- "#E0F2F1"
-
 fig1 <- ggplot(data_type1, aes(x=cancer_type, y=numSamps, fill=model_type)) +
   geom_bar(stat='identity', position='dodge') +
   scale_fill_manual(values=color_palette) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
-        plot.background = element_rect(fill = background_color, color = background_color),
-        legend.background = element_rect(fill = background_color, color = background_color)) +
+  plot.background = element_rect(fill = background_color, color = background_color),
+  legend.background = element_rect(fill = background_color, color = background_color)) +
   ggtitle('Cancer and Tissue Types - HCMI')
 
-fig2 <- ggplot(data_type2, aes(x=cancer_type, y=numSamps, fill=model_type),
-legend.background = element_rect(fill = background_color, color = background_color)) +
+fig2 <- ggplot(data_type2, aes(x=cancer_type, y=numSamps, fill=model_type)) +
   geom_bar(stat='identity', position='dodge') +
   scale_fill_manual(values=color_palette) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
-        plot.background = element_rect(fill = background_color, color = background_color),
-        legend.background = element_rect(fill = background_color, color = background_color)) +
+  plot.background = element_rect(fill = background_color, color = background_color),
+  legend.background = element_rect(fill = background_color, color = background_color)) +
   ggtitle('Cancer and Tissue Types - BeatAML')
 
 fig3 <- ggplot(data_type3, aes(x=cancer_type, y=numSamps, fill=model_type)) +
   geom_bar(stat='identity', position='dodge') +
   scale_fill_manual(values=color_palette) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
-        plot.background = element_rect(fill = background_color, color = background_color),
-        legend.background = element_rect(fill = background_color, color = background_color)) +
+  plot.background = element_rect(fill = background_color, color = background_color),
+  legend.background = element_rect(fill = background_color, color = background_color))+
   ggtitle('Cancer and Tissue Types - CPTAC')
 
 fig4 <- ggplot(data_type4, aes(x=cancer_type, y=numSamps, fill=model_type)) +
   geom_bar(stat='identity', position='dodge') +
   scale_fill_manual(values=color_palette) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1),
-        plot.background = element_rect(fill = background_color, color = background_color),
-        legend.background = element_rect(fill = background_color, color = background_color)) +
-  ggtitle('Cancer and Tissue Types - CCLE')
-
-ggsave('Fig1_HCMI.png', fig1, height=9, width=12, bg = background_color)
-ggsave('Fig2_BeatAML.png', fig2, height=9, width=12, bg = background_color)
-ggsave('Fig3_CPTAC.png', fig3, height=9, width=12, bg = background_color)
-ggsave('Fig4_CCLE.png', fig4, height=9, width=12, bg = background_color)
+  plot.background = element_rect(fill = background_color, color = background_color),
+  legend.background = element_rect(fill = background_color, color = background_color)) +
+  scale_y_log10() +
+  ggtitle('Cancer and Tissue Types - Broad Sanger')
+
+fig5 <- ggplot(data_type5, aes(x=cancer_type, y=numSamps, fill=model_type)) +
+  geom_bar(stat='identity', position='dodge') +
+  scale_fill_manual(values=color_palette) +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1),
+  plot.background = element_rect(fill = background_color, color = background_color),
+  legend.background = element_rect(fill = background_color, color = background_color)) +
+  ggtitle('Cancer and Tissue Types - MPNST')
+
+ggsave('Fig1_HCMI.png', fig1, height=8, width=10)
+ggsave('Fig2_BeatAML.png', fig2, height=8, width=10)
+ggsave('Fig3_CPTAC.png', fig3, height=8, width=10)
+ggsave('Fig4_Broad_Sanger.png', fig4, height=8, width=10)
+ggsave('Fig5_MPNST.png', fig5, height=8, width=10)
 
 
@@ -1,6 +1,5 @@
 # Required libraries
 library(circlize)
-# library("tidyverse")
 library(dplyr)
 library(tools)
 library(ggplot2)
@@ -136,7 +135,7 @@ generate_circos_plot <- function(processed_data,prefix) {
         }
       }
     })
-
+    
     # Dots for the outer track
     circos.track(track.index = 2, ylim = c(1, num_datasets+1), bg.border = NA, panel.fun = function(x, y) {
       xlim = get.cell.meta.data("xlim")
@@ -283,10 +282,11 @@ generate_group_summary_plot <- function(all_file_names) {
     labs(title = "Number of Samples by Data Type and Source",
          x = "Data Type",
          y = "Number of Samples") +
-    scale_fill_manual(values = c("beataml" = "#fc8d62", "hcmi" = "#8da0cb", "depmap" = "#66c2a5", "cptac" = "#8511c1")) +
+    scale_fill_manual(values = c("beataml" = "#fc8d62", "hcmi" = "#8da0cb", "broad_sanger" = "#66c2a5", "cptac" = "#8511c1", "mpnst" = "#FFD700")) +
+    scale_y_log10(breaks = c(1, 10, 100, 1000), labels = c("1", "10", "100", "1,000")) +
     theme(plot.background = element_rect(fill = background_color, color = background_color),
-    legend.background = element_rect(fill = background_color, color = background_color))
-  ggsave('Fig5_Sample_Summary.png', p, height = 9, width = 12, bg = background_color)
+          legend.background = element_rect(fill = background_color, color = background_color))
+  ggsave('Fig6_Sample_Summary.png', p, height = 9, width = 12)
 }
 
 # Data file names for each group
@@ -300,27 +300,35 @@ hcmi_names <- list(
   mutations = "hcmi_mutations.csv.gz",
   copy_number = "hcmi_copy_number.csv.gz"
 )
-depmap_names <- list(
-  transcriptomics = "depmap_transcriptomics.csv.gz",
-  proteomics = "depmap_proteomics.csv.gz",
-  copy_number = "depmap_copy_number.csv.gz",
-  mutations = "depmap_mutations.csv.gz"
+broad_sanger_names <- list(
+  transcriptomics = "broad_sanger_transcriptomics.csv.gz",
+  proteomics = "broad_sanger_proteomics.csv.gz",
+  copy_number = "broad_sanger_copy_number.csv.gz",
+  mutations = "broad_sanger_mutations.csv.gz"
 )
 cptac_names <- list(
   transcriptomics = "cptac_transcriptomics.csv.gz",
   proteomics = "cptac_proteomics.csv.gz",
   copy_number = "cptac_copy_number.csv.gz",
   mutations = "cptac_mutations.csv.gz"
 )
+mpnst_names <- list(
+  transcriptomics = "mpnst_transcriptomics.csv.gz",
+  copy_number = "mpnst_copy_number.csv.gz",
+  mutations = "mpnst_mutations.csv.gz"
+)
+
+
 
 # Combine all file names into one list
 all_file_names <- list(
   beataml = beataml_names,
   hcmi = hcmi_names,
-  depmap = depmap_names,
-  cptac = cptac_names
+  broad_sanger = broad_sanger_names,
+  cptac = cptac_names,
+  mpnst = mpnst_names
 )
-
+# 
 save_summary <- function(file_group, individ_summary) {
   file_name <- paste0(file_group,"_table", ".csv")
   write.csv(individ_summary, file = file_name, row.names = FALSE)
@@ -342,12 +350,13 @@ for (file_group_name in names(all_file_names)) {
 samples_names <- list(
   HCMI = "hcmi_samples.csv",
   BEATAML = "beataml_samples.csv",
-  DepMap = "depmap_samples.csv",
-  CPTAC = "cptac_samples.csv"
+  Broad_Sanger = "broad_sanger_samples.csv",
+  CPTAC = "cptac_samples.csv",
+  MPNST = "mpnst_samples.csv"
 )
 
 # Generate and print the summary
-# group_summary <- generate_group_summary_stats(samples_names)
+group_summary <- generate_group_summary_stats(samples_names)
 
 # Generate group summary plot
 generate_group_summary_plot(all_file_names)
@@ -1,4 +1,4 @@
 "Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
 "Transcriptomics",18306,707
-"Proteomics",7357,210
-"Mutations",3455,871
+"Proteomics",7356,210
+"Mutations",3454,871
@@ -0,0 +1,5 @@
+"Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
+"Transcriptomics",37290,1697
+"Proteomics",12936,1008
+"Copy_number",24766,1790
+"Mutations",21658,1729
@@ -1,5 +1,5 @@
 "Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
-"Transcriptomics",38407,1113
-"Proteomics",15278,1026
-"Copy_number",38395,1024
+"Transcriptomics",38406,1113
+"Proteomics",15273,1086
+"Copy_number",38394,1024
 "Mutations",18866,833
@@ -1,4 +1,4 @@
-article_link: https://figshare.com/articles/dataset/CODERData0_1_21/25043531
+article_link: https://figshare.com/articles/dataset/CODERData0_0_22/25537288
 file_download:
   beataml_drugs.tsv.gz: https://ndownloader.figshare.com/files/44184881
   beataml_experiments.csv.gz: https://ndownloader.figshare.com/files/44184929
 
@@ -1,4 +1,4 @@
 "Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
-"Transcriptomics",19491,436
-"Mutations",17551,337
-"Copy_number",39095,385
+"Transcriptomics",19490,396
+"Mutations",16915,289
+"Copy_number",38879,282
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,4 @@` |
| `1` |  | `-article_link: https://figshare.com/articles/dataset/CODERData0_1_21/25043531` |
|  | `1` | `+article_link: https://figshare.com/articles/dataset/CODERData0_0_22/25537288` |
| `2` | `2` | `file_download:` |
| `3` | `3` | `beataml_drugs.tsv.gz: https://ndownloader.figshare.com/files/44184881` |
| `4` | `4` | `beataml_experiments.csv.gz: https://ndownloader.figshare.com/files/44184929` |