Skip to content

Commit 8c436c7

Browse files
authored
Merge branch 'main' into docker-cleanup
2 parents a8fbd56 + 2007152 commit 8c436c7

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

48 files changed

+75573
-1515
lines changed

.github/workflows/main.yml

Lines changed: 1377 additions & 1374 deletions
Large diffs are not rendered by default.

coderdata/download/downloader.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
# coderdata/download/downloader.py
22

33
import requests
4-
import os
5-
import json
6-
import yaml
7-
84

95
def download_data_by_prefix(dataset_prefix=None):
106
"""

dataSummary/visualization01.R

Lines changed: 59 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -26,32 +26,32 @@ mergeSamples<-function(){
2626
distinct()
2727

2828
###########################
29-
## CCLE Cell Line data
29+
## Broad Sanger data
3030
## We have many more cancer types here, so we try to map what we have to CPTAC names and then adjust the rest
3131
##
3232
###########################
33-
depmap<-readr::read_csv('depmap_samples.csv')|>
33+
broad_sanger<-readr::read_csv('broad_sanger_samples.csv')|>
3434
dplyr::mutate(`Cell line cancer type`=cancer_type)|>
3535
mutate(sampleSource='CCLE')
3636

37-
allec<-grep('Endometrial',depmap$`Cell line cancer type`)
38-
depmap$`Cell line cancer type`[allec]<-'Uterine corpus endometrial carcinoma'
37+
allec<-grep('Endometrial',broad_sanger$`Cell line cancer type`)
38+
broad_sanger$`Cell line cancer type`[allec]<-'Uterine corpus endometrial carcinoma'
3939

40-
depmap<-depmap|>
40+
broad_sanger<-broad_sanger|>
4141
left_join(cmaps)
4242

4343
##first we collect the names of the cancers that are NOT in CPTAC
44-
other_cans<-which(is.na(depmap$`CPTAC Cancer type`))
45-
depmap$`CPTAC Cancer type`[other_cans]<-depmap$`Cell line cancer type`[other_cans]
46-
depmap<-depmap|>
44+
other_cans<-which(is.na(broad_sanger$`CPTAC Cancer type`))
45+
broad_sanger$`CPTAC Cancer type`[other_cans]<-broad_sanger$`Cell line cancer type`[other_cans]
46+
broad_sanger<-broad_sanger|>
4747
dplyr::select(improve_sample_id,`CPTAC Cancer type`,model_type,species,sampleSource)|>
4848
distinct()
4949

5050
#then we rename the NA values to 'Other' if we want
51-
#other_cans<-which(is.na(depmap$`CPTAC Cancer type`))
52-
#depmap$`CPTAC Cancer type`[other_cans]<-'Other'
51+
#other_cans<-which(is.na(broad_sanger$`CPTAC Cancer type`))
52+
#broad_sanger$`CPTAC Cancer type`[other_cans]<-'Other'
5353
# or just remove them
54-
depmap<-depmap|>
54+
broad_sanger<-broad_sanger|>
5555
subset(!is.na(`CPTAC Cancer type`))
5656

5757
###########################
@@ -61,10 +61,8 @@ mergeSamples<-function(){
6161
hcmi<-readr::read_csv('hcmi_samples.csv')|>
6262
dplyr::rename(id_source='other_id_source')|>
6363
mutate(species='human')|>
64-
subset(model_type%in%c('3D Organoid','Solid Tissue','Adherent Cell Line'))|>
64+
subset(model_type%in%c('organoid','tumor','cell line','Patient derived xenograft'))|>
6565
dplyr::mutate(`HCMI Cancer type`=cancer_type,`HCMI Common name`=common_name)|>
66-
dplyr::mutate(model_type=stringr::str_replace_all(model_type,'Solid Tissue','Tumor'))|>
67-
dplyr::mutate(model_type=stringr::str_replace_all(model_type,'Adherent Cell Line','cell line'))|>
6866
left_join(cmaps)|>
6967
mutate(sampleSource='HCMI')|>
7068
dplyr::select(improve_sample_id,`CPTAC Cancer type`,model_type,species,sampleSource)|>
@@ -80,18 +78,23 @@ mergeSamples<-function(){
8078
baml<-readr::read_csv("beataml_samples.csv")|>
8179
mutate(cancer_type='Acute myeloid leukemia')|>
8280
mutate(species='Human')|>
83-
mutate(model_type='Tumor')|>
81+
mutate(model_type='tumor')|>
8482
mutate(sampleSource='BeatAML')|>
8583
dplyr::select(improve_sample_id,species,cancer_type,sampleSource,model_type)|>
8684
distinct()
8785
###########################
88-
## TCGA SAMPLE DATA
86+
## MPNST SAMPLE DATA
8987
## All MPNST samples are labeled with cancer type 'Neurofibromatosis'
9088
###########################
91-
89+
mpnst<-readr::read_csv("mpnst_samples.csv")|>
90+
mutate(cancer_type='Neurofibromatosis')|>
91+
mutate(species='Human')|>
92+
mutate(sampleSource='MPNST')|>
93+
dplyr::select(improve_sample_id,species,cancer_type,sampleSource,model_type)|>
94+
distinct()
9295

9396
##now we join them into a single table, with cancer type
94-
fulldat<<-rbind(cptac,depmap,hcmi)|>
97+
fulldat<<-rbind(cptac,broad_sanger,hcmi)|>
9598
dplyr::rename(cancer_type=`CPTAC Cancer type`)|>
9699
subset()
97100

@@ -100,6 +103,11 @@ mergeSamples<-function(){
100103
distinct()|>
101104
rbind(baml)
102105

106+
fulldat<-fulldat|>
107+
dplyr::select(improve_sample_id,species,cancer_type,sampleSource,model_type)|>
108+
distinct()|>
109+
rbind(mpnst)
110+
103111
models<-fulldat|>
104112
group_by(cancer_type)|>
105113
summarize(num_models=n_distinct(model_type))|>
@@ -126,67 +134,76 @@ stats<-fulldat|>
126134
subset(model_type!='Not Reported')|>
127135
subset(numSamps>1)
128136

129-
color_palette <- brewer.pal(n = 3, name = "Set2")
137+
color_palette <- brewer.pal(n = 4, name = "Set2")
130138

131139
# Assign colors to the model types
132-
names(color_palette) <- c("Tumor", "cell line", "3D Organoid")
133-
background_color <- "#E0F2F1"
140+
names(color_palette) <- c("tumor", "cell line", "organoid",'Patient derived xenograft')
141+
134142
fig0<-ggplot(stats,aes(x=cancer_type,y=numSamps,fill=model_type))+
135143
geom_bar(stat='identity',position='dodge')+
136144
theme(axis.text.x = element_text(angle = 45, hjust = 1),
137-
plot.background = element_rect(fill = background_color, color = background_color),
138-
legend.background = element_rect(fill = background_color, color = background_color))+
145+
plot.background = element_rect(fill = background_color, color = background_color),
146+
legend.background = element_rect(fill = background_color, color = background_color))+
139147
scale_y_log10()+scale_fill_manual(values=color_palette)+
140148
ggtitle('Samples by tumor type')
141149

142150
print(fig0)
143-
ggsave('Fig0_Overview.png',fig0,height=9,width=12,bg = background_color)
151+
ggsave('Fig0_Overview.png',fig0,height=8,width=10)
144152

145153
# Subset data for each type
146154
data_type1 <- subset(stats, sampleSource == 'HCMI')
147155
data_type2 <- subset(stats, sampleSource == 'BeatAML')
148156
data_type3 <- subset(stats, sampleSource == 'CPTAC')
149157
data_type4 <- subset(stats, sampleSource == 'CCLE')
158+
data_type5 <- subset(stats, sampleSource == 'MPNST')
150159

151-
# Create separate plots for each type
160+
# Create separate plots for each type, with colorblind-friendly colors
152161
background_color <- "#E0F2F1"
153-
154162
fig1 <- ggplot(data_type1, aes(x=cancer_type, y=numSamps, fill=model_type)) +
155163
geom_bar(stat='identity', position='dodge') +
156164
scale_fill_manual(values=color_palette) +
157165
theme(axis.text.x = element_text(angle = 45, hjust = 1),
158-
plot.background = element_rect(fill = background_color, color = background_color),
159-
legend.background = element_rect(fill = background_color, color = background_color)) +
166+
plot.background = element_rect(fill = background_color, color = background_color),
167+
legend.background = element_rect(fill = background_color, color = background_color)) +
160168
ggtitle('Cancer and Tissue Types - HCMI')
161169

162-
fig2 <- ggplot(data_type2, aes(x=cancer_type, y=numSamps, fill=model_type),
163-
legend.background = element_rect(fill = background_color, color = background_color)) +
170+
fig2 <- ggplot(data_type2, aes(x=cancer_type, y=numSamps, fill=model_type)) +
164171
geom_bar(stat='identity', position='dodge') +
165172
scale_fill_manual(values=color_palette) +
166173
theme(axis.text.x = element_text(angle = 45, hjust = 1),
167-
plot.background = element_rect(fill = background_color, color = background_color),
168-
legend.background = element_rect(fill = background_color, color = background_color)) +
174+
plot.background = element_rect(fill = background_color, color = background_color),
175+
legend.background = element_rect(fill = background_color, color = background_color)) +
169176
ggtitle('Cancer and Tissue Types - BeatAML')
170177

171178
fig3 <- ggplot(data_type3, aes(x=cancer_type, y=numSamps, fill=model_type)) +
172179
geom_bar(stat='identity', position='dodge') +
173180
scale_fill_manual(values=color_palette) +
174181
theme(axis.text.x = element_text(angle = 45, hjust = 1),
175-
plot.background = element_rect(fill = background_color, color = background_color),
176-
legend.background = element_rect(fill = background_color, color = background_color)) +
182+
plot.background = element_rect(fill = background_color, color = background_color),
183+
legend.background = element_rect(fill = background_color, color = background_color))+
177184
ggtitle('Cancer and Tissue Types - CPTAC')
178185

179186
fig4 <- ggplot(data_type4, aes(x=cancer_type, y=numSamps, fill=model_type)) +
180187
geom_bar(stat='identity', position='dodge') +
181188
scale_fill_manual(values=color_palette) +
182189
theme(axis.text.x = element_text(angle = 45, hjust = 1),
183-
plot.background = element_rect(fill = background_color, color = background_color),
184-
legend.background = element_rect(fill = background_color, color = background_color)) +
185-
ggtitle('Cancer and Tissue Types - CCLE')
186-
187-
ggsave('Fig1_HCMI.png', fig1, height=9, width=12, bg = background_color)
188-
ggsave('Fig2_BeatAML.png', fig2, height=9, width=12, bg = background_color)
189-
ggsave('Fig3_CPTAC.png', fig3, height=9, width=12, bg = background_color)
190-
ggsave('Fig4_CCLE.png', fig4, height=9, width=12, bg = background_color)
190+
plot.background = element_rect(fill = background_color, color = background_color),
191+
legend.background = element_rect(fill = background_color, color = background_color)) +
192+
scale_y_log10() +
193+
ggtitle('Cancer and Tissue Types - Broad Sanger')
194+
195+
fig5 <- ggplot(data_type5, aes(x=cancer_type, y=numSamps, fill=model_type)) +
196+
geom_bar(stat='identity', position='dodge') +
197+
scale_fill_manual(values=color_palette) +
198+
theme(axis.text.x = element_text(angle = 45, hjust = 1),
199+
plot.background = element_rect(fill = background_color, color = background_color),
200+
legend.background = element_rect(fill = background_color, color = background_color)) +
201+
ggtitle('Cancer and Tissue Types - MPNST')
202+
203+
ggsave('Fig1_HCMI.png', fig1, height=8, width=10)
204+
ggsave('Fig2_BeatAML.png', fig2, height=8, width=10)
205+
ggsave('Fig3_CPTAC.png', fig3, height=8, width=10)
206+
ggsave('Fig4_Broad_Sanger.png', fig4, height=8, width=10)
207+
ggsave('Fig5_MPNST.png', fig5, height=8, width=10)
191208

192209

dataSummary/visualization02.R

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Required libraries
22
library(circlize)
3-
# library("tidyverse")
43
library(dplyr)
54
library(tools)
65
library(ggplot2)
@@ -136,7 +135,7 @@ generate_circos_plot <- function(processed_data,prefix) {
136135
}
137136
}
138137
})
139-
138+
140139
# Dots for the outer track
141140
circos.track(track.index = 2, ylim = c(1, num_datasets+1), bg.border = NA, panel.fun = function(x, y) {
142141
xlim = get.cell.meta.data("xlim")
@@ -283,10 +282,11 @@ generate_group_summary_plot <- function(all_file_names) {
283282
labs(title = "Number of Samples by Data Type and Source",
284283
x = "Data Type",
285284
y = "Number of Samples") +
286-
scale_fill_manual(values = c("beataml" = "#fc8d62", "hcmi" = "#8da0cb", "depmap" = "#66c2a5", "cptac" = "#8511c1")) +
285+
scale_fill_manual(values = c("beataml" = "#fc8d62", "hcmi" = "#8da0cb", "broad_sanger" = "#66c2a5", "cptac" = "#8511c1", "mpnst" = "#FFD700")) +
286+
scale_y_log10(breaks = c(1, 10, 100, 1000), labels = c("1", "10", "100", "1,000")) +
287287
theme(plot.background = element_rect(fill = background_color, color = background_color),
288-
legend.background = element_rect(fill = background_color, color = background_color))
289-
ggsave('Fig5_Sample_Summary.png', p, height = 9, width = 12, bg = background_color)
288+
legend.background = element_rect(fill = background_color, color = background_color))
289+
ggsave('Fig6_Sample_Summary.png', p, height = 9, width = 12)
290290
}
291291

292292
# Data file names for each group
@@ -300,27 +300,35 @@ hcmi_names <- list(
300300
mutations = "hcmi_mutations.csv.gz",
301301
copy_number = "hcmi_copy_number.csv.gz"
302302
)
303-
depmap_names <- list(
304-
transcriptomics = "depmap_transcriptomics.csv.gz",
305-
proteomics = "depmap_proteomics.csv.gz",
306-
copy_number = "depmap_copy_number.csv.gz",
307-
mutations = "depmap_mutations.csv.gz"
303+
broad_sanger_names <- list(
304+
transcriptomics = "broad_sanger_transcriptomics.csv.gz",
305+
proteomics = "broad_sanger_proteomics.csv.gz",
306+
copy_number = "broad_sanger_copy_number.csv.gz",
307+
mutations = "broad_sanger_mutations.csv.gz"
308308
)
309309
cptac_names <- list(
310310
transcriptomics = "cptac_transcriptomics.csv.gz",
311311
proteomics = "cptac_proteomics.csv.gz",
312312
copy_number = "cptac_copy_number.csv.gz",
313313
mutations = "cptac_mutations.csv.gz"
314314
)
315+
mpnst_names <- list(
316+
transcriptomics = "mpnst_transcriptomics.csv.gz",
317+
copy_number = "mpnst_copy_number.csv.gz",
318+
mutations = "mpnst_mutations.csv.gz"
319+
)
320+
321+
315322

316323
# Combine all file names into one list
317324
all_file_names <- list(
318325
beataml = beataml_names,
319326
hcmi = hcmi_names,
320-
depmap = depmap_names,
321-
cptac = cptac_names
327+
broad_sanger = broad_sanger_names,
328+
cptac = cptac_names,
329+
mpnst = mpnst_names
322330
)
323-
331+
#
324332
save_summary <- function(file_group, individ_summary) {
325333
file_name <- paste0(file_group,"_table", ".csv")
326334
write.csv(individ_summary, file = file_name, row.names = FALSE)
@@ -342,12 +350,13 @@ for (file_group_name in names(all_file_names)) {
342350
samples_names <- list(
343351
HCMI = "hcmi_samples.csv",
344352
BEATAML = "beataml_samples.csv",
345-
DepMap = "depmap_samples.csv",
346-
CPTAC = "cptac_samples.csv"
353+
Broad_Sanger = "broad_sanger_samples.csv",
354+
CPTAC = "cptac_samples.csv",
355+
MPNST = "mpnst_samples.csv"
347356
)
348357

349358
# Generate and print the summary
350-
# group_summary <- generate_group_summary_stats(samples_names)
359+
group_summary <- generate_group_summary_stats(samples_names)
351360

352361
# Generate group summary plot
353362
generate_group_summary_plot(all_file_names)

docs/_data/beataml_table.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
22
"Transcriptomics",18306,707
3-
"Proteomics",7357,210
4-
"Mutations",3455,871
3+
"Proteomics",7356,210
4+
"Mutations",3454,871

docs/_data/broad_sanger_table.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
2+
"Transcriptomics",37290,1697
3+
"Proteomics",12936,1008
4+
"Copy_number",24766,1790
5+
"Mutations",21658,1729

docs/_data/cptac_table.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
2-
"Transcriptomics",38407,1113
3-
"Proteomics",15278,1026
4-
"Copy_number",38395,1024
2+
"Transcriptomics",38406,1113
3+
"Proteomics",15273,1086
4+
"Copy_number",38394,1024
55
"Mutations",18866,833

docs/_data/depmap_table.csv

Lines changed: 0 additions & 5 deletions
This file was deleted.

docs/_data/figshare_latest.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
article_link: https://figshare.com/articles/dataset/CODERData0_1_21/25043531
1+
article_link: https://figshare.com/articles/dataset/CODERData0_0_22/25537288
22
file_download:
33
beataml_drugs.tsv.gz: https://ndownloader.figshare.com/files/44184881
44
beataml_experiments.csv.gz: https://ndownloader.figshare.com/files/44184929

docs/_data/hcmi_table.csv

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
"Dataset","Unique_Entrez_IDs","Unique_Sample_IDs"
2-
"Transcriptomics",19491,436
3-
"Mutations",17551,337
4-
"Copy_number",39095,385
2+
"Transcriptomics",19490,396
3+
"Mutations",16915,289
4+
"Copy_number",38879,282

0 commit comments

Comments
 (0)