-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTCGA_matched_pair_combined_analysis.R
369 lines (346 loc) · 17.8 KB
/
TCGA_matched_pair_combined_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
library(here)
library(RColorBrewer)
qual_col_pals = brewer.pal.info[brewer.pal.info$category == "qual" &
brewer.pal.info$colorblind == T,]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
col_vector <- col_vector[c(1:3,9,11,19)]
rm(qual_col_pals)
## Import data ####
folder <- "Results/TCGA paired data results May 2020"
# Fisher's test results for detection of cancer-related genes using threshold
for (i in c("fisher.tests.weak_known.genes",
"fisher.tests.strong_known.genes",
"fisher.tests.weak_pathway.genes",
"fisher.tests.strong_pathway.genes",
"fisher.tests_pathway.genes.minus.known.genes")) {
assign(
i, read.csv(
here(folder, paste0(i, ".csv")), row.names=1
)
)
}
# Wilcoxon rank-sum test results for ranking of cancer-related genes
for (i in c("ranksum.tests.weak_known.genes",
"ranksum.tests.strong_known.genes",
"ranksum.tests.weak_pathway.genes",
"ranksum.tests.strong_pathway.genes",
"ranksum.tests_pathway.genes.minus.known.genes")) {
assign(
i, read.csv(
here(folder, paste0(i, ".csv")), row.names=1
)
)
}
# Overlap and correlation results
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
for (j in c("HM", "HM_voom", "HMM_voom", "MDSeq")) {
for (k in c("known.genes", "pathway.genes", "pathway.genes.minus.known.genes")) {
assign(
paste0(i, "_", j, "_mean_v_disp_", k),
read.csv(
here(folder, paste0(i, "_", j, "_mean_v_disp_", k, ".csv")),
)
)
}
}
}
rm(folder,i,j,k)
## Ranking test results ####
# Report lnHM log mean and disp, HMM, HM/HM, voom/HM, voom, maybe edgeR QL, MDSeq ZI disp
# rownames(ranksum.tests.weak_known.genes)
ranksum.tests.weak_known.genes[
c("p.edgeR.ql", "p.voom", "p.mean.lnHM.log",
"p.disp.MDSeq.zi", "p.disp.lnHM.log", "prob.lnHMM",
"voom_lnHM.log", "lnHM.log_lnHM.log"),
c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")
]
# Known cancer genes ranked significantly higher than other genes for all cancers for DE
# (with edgeR and voom, but not for lusc with HM). For DD, cancer genes ranked significantly
# higher than other genes for brca, thca, luad, lihc, lusc with MDSeq, and for thca, luad
# and lusc with HM. For DEDD, cancer genes ranked significantly higher than other genes for
# all cancers with voom/HM, for brca, thca, luac, lihc, lusc and prad with HM/HM, and for
# luad, lihc and lusc with HMM.
ranksum.tests_pathway.genes.minus.known.genes[
c("p.edgeR.ql", "p.voom", "p.mean.lnHM.log",
"p.disp.MDSeq.zi", "p.disp.lnHM.log", "prob.lnHMM",
"voom_lnHM.log", "lnHM.log_lnHM.log"),
c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")
]
# Pathway genes with known genes excluded ranked significantly higher than other genes for
# all but luad and prad with edgeR, all but prad with voom, and for kirc, thca and lihc with
# HM. For DD, pathway genes ranked significantly higher than other genes for thca, lihc and
# lusc with MDSeq, and for thca, luad and lusc with HM. For DEDD, pathway genes ranked
# significantly higher than other genes for all but prad with voom/HM, for thca, luad, lihc
# and lusc with HM/HM, and for thca and lusc with HMM.
## DE v DD correlation results ####
# Look at all possible plots first, with brca as an example
par(mfrow=c(2,2), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
plot(brca_HM_mean_v_disp_known.genes$pvals,
brca_HM_mean_v_disp_known.genes$cor,
type="l",
# xlim=c(0,0.01),
ylim=c(-1,1),
col="red")
lines(brca_MDSeq_mean_v_disp_known.genes$pvals,
brca_MDSeq_mean_v_disp_known.genes$cor,
col="blue")
plot(brca_HM_mean_v_disp_known.genes$pvals,
brca_HM_mean_v_disp_known.genes$p.cor,
type="l",
# xlim=c(0,0.01),
ylim=c(0,1),
col="red")
lines(brca_MDSeq_mean_v_disp_known.genes$pvals,
brca_MDSeq_mean_v_disp_known.genes$p.cor,
col="blue")
plot(brca_HM_mean_v_disp_known.genes$pvals,
brca_HM_mean_v_disp_known.genes$overlap / brca_HM_mean_v_disp_known.genes$union,
type="l",
# xlim=c(0,0.01),
ylim=c(0,1),
col="red")
lines(brca_MDSeq_mean_v_disp_known.genes$pvals,
brca_MDSeq_mean_v_disp_known.genes$overlap / brca_MDSeq_mean_v_disp_known.genes$union,
col="blue")
plot(brca_HM_mean_v_disp_known.genes$pvals,
brca_HM_mean_v_disp_known.genes$p.hyper,
type="l",
# xlim=c(0,0.01),
ylim=c(0,1),
col="red")
lines(brca_MDSeq_mean_v_disp_known.genes$pvals,
brca_MDSeq_mean_v_disp_known.genes$p.hyper,
col="blue")
# Overlap or overlap/union is least convincing. Hypergeometric tests clearly show difference
# between MDSeq and HM, but not really convincing in showing that DD identifies different
# sets of genes from DE, because p-value is never very low, and is close to 1 for threshold
# p-values up to nearly 0.1.
# See what each plot looks like for each cancer
par(mfrow=c(4,8), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
# par(mfrow=c(2,4), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
dat_HM <- get(paste0(i, "_HM_mean_v_disp_known.genes"))
dat_MD <- get(paste0(i, "_MDSeq_mean_v_disp_known.genes"))
plot(dat_HM$pvals,
dat_HM$cor,
type="l",
# xlim=c(0,0.01),
ylim=c(-1,1),
col="red", main=i)
lines(dat_MD$pvals,
dat_MD$cor,
col="blue")
}
# Correlation plots look pretty similar for all.
# par(mfrow=c(2,4), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
dat_HM <- get(paste0(i, "_HM_mean_v_disp_known.genes"))
dat_MD <- get(paste0(i, "_MDSeq_mean_v_disp_known.genes"))
plot(dat_HM$pvals,
dat_HM$p.cor,
type="l",
# xlim=c(0,0.01),
ylim=c(0,1),
col="red", main=i)
lines(dat_MD$pvals,
dat_MD$p.cor,
col="blue")
}
# Same pattern for correlation tests for all, but point at which HM p-val increases changes,
# and for kirc and coad it never does.
# par(mfrow=c(2,4), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
dat_HM <- get(paste0(i, "_HM_mean_v_disp_known.genes"))
dat_MD <- get(paste0(i, "_MDSeq_mean_v_disp_known.genes"))
plot(dat_HM$pvals,
dat_HM$overlap / dat_HM$union,
type="l",
# xlim=c(0,0.01),
ylim=c(0,1),
col="red", main=i)
lines(dat_MD$pvals,
dat_MD$overlap / dat_MD$union,
col="blue")
}
# Basically same pattern for overlap/union for all. Clearer difference for prad, and very
# little difference for luad, lihc, lusc, coad.
# par(mfrow=c(2,4), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
dat_HM <- get(paste0(i, "_HM_mean_v_disp_known.genes"))
dat_MD <- get(paste0(i, "_MDSeq_mean_v_disp_known.genes"))
plot(dat_HM$pvals,
dat_HM$p.hyper,
type="l",
# xlim=c(0,0.01),
ylim=c(0,1),
col="red", main=i)
lines(dat_MD$pvals,
dat_MD$p.hyper,
col="blue")
}
# Some big differences in hypergeometric tests between cancers, but p-values always lower for
# HM than MD. Both stay near(ish) 1 the whole time for thca, luad, lusc and prad. HM very
# close to 0 up to about 0.3 for kirc and coad, and MD gets close to 0 below about 0.05 for
# MD.
# Difficult to make general inferences about differences between cancers in different
# analyses. Probably best to stick to making a general case but using one cancer, and have
# others in supplementary material. In that case, most general argument is using correlation
# and correlation tests, and brca is as good as any to show plots for since it's chosen
# without bias as the biggest sample size and the first that I looked at.
par(mfrow=c(2,1), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
plot(brca_HM_mean_v_disp_known.genes$pvals, brca_HM_mean_v_disp_known.genes$cor,
type="l", lwd=2, xlim=c(0.02,0.98), ylim=c(-1,0.8),
col=col_vector[2], main=paste0("Spearman correlation between differential expression and ",
"differential dispersion gene lists"))
lines(brca_MDSeq_mean_v_disp_known.genes$pvals,
brca_MDSeq_mean_v_disp_known.genes$cor,
lwd=2, col=col_vector[1])
legend("bottomright", fill=col_vector[1:2], bty='n', cex=1.5, ncol=1,
legend=c("MDSeq", "HM"))
plot(brca_HM_mean_v_disp_known.genes$pvals, brca_HM_mean_v_disp_known.genes$p.cor,
type="l", lwd=2, xlim=c(0.02,0.98), ylim=c(0,1),
col=col_vector[2], main="p-value for hypothesis test for negative correlation")
lines(brca_MDSeq_mean_v_disp_known.genes$pvals,
brca_MDSeq_mean_v_disp_known.genes$p.cor,
lwd=2, col=col_vector[1])
## Cancer-related genes identified by DE, DD, DEDD ####
# Plot for all cancers, for known genes and pathway genes excluding known genes, for lnHM log DD
# v voom, lnHM log DD v DE, lnHMM v voom, and MDSeq DD v DE
par(mfrow=c(8,8), mar=c(0.2,1,0.2,0), mgp=c(2,0,0))
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
for (j in c("HM", "HM_voom", "HMM_voom", "MDSeq")) {
for (k in c("known.genes", "pathway.genes.minus.known.genes")) {
dat <- get(paste0(i, "_", j, "_mean_v_disp_", k))
plot(dat$pvals, dat$TPA, type="l", col="red", xaxt="n", ann=F, tcl=0, xlim=c(0,0.01))
lines(dat$pvals, dat$TPB, col='blue')
lines(dat$pvals, dat$TPboth, col='grey')
lines(dat$pvals, dat$TPAonly, col='red', lty=2)
lines(dat$pvals, dat$TPBonly, col='blue', lty=2)
legend('top', legend=paste0(i, " ", j), bty="n")
}
}
}
# Don't think there's any benefit in showing DE v DD for HM since it's clear that voom is better
# at identifying DE. Also not really any need to show MDSeq, since results are similar and I've
# already argued that MDSeq DD doesn't identify different sets of genes from DE as well as HM.
# Would like to include HMM to keep up the discussion about differential distribution -
# hopefully to show that differential distribution identifies more cancer-related genes than
# either DE or DD alone, except that that isn't the case generally - in most cases it identifies
# nearly exactly the same number of genes as DE, but for thca and prad HMM identifies far fewer
# than DE, and in all cases there are still genes that are identified by DE but not by HMM (as
# well as genes that are identified by HMM but not DE). For now just include lnHM log DD v voom,
# and think about adding HMM later.
# Include luad and prad in main paper, rest in supplementary material.
# Want to show that DD identifies known and potential cancer-related genes, but maybe only
# include known genes in main paper since there's not much difference between known and pathway
# genes for luad and prad, and only include pathway genes in supplementary material.
par(mfcol=c(2,2), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
plot(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPA,
type="l", lwd=2, col="red", xlim=c(0.03,0.97))
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPB,
col='blue', lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPboth,
col='grey', lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPAonly,
col='red', lty=2, lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPBonly,
col='blue', lty=2, lwd=2)
plot(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPA,
type="l", lwd=2, col="red", xlim=c(0.03,0.97))
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPB,
col='blue', lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPboth,
col='grey', lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPAonly,
col='red', lty=2, lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPBonly,
col='blue', lty=2, lwd=2)
plot(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPA,
type="l", lwd=2, col="red", xlim=c(0.0002,0.01), ylim=c(0,230))
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPB,
col='blue', lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPboth,
col='grey', lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPAonly,
col='red', lty=2, lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPBonly,
col='blue', lty=2, lwd=2)
plot(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPA,
type="l", lwd=2, col="red", xlim=c(0.0002,0.01), ylim=c(0,550))
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPB,
col='blue', lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPboth,
col='grey', lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPAonly,
col='red', lty=2, lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPBonly,
col='blue', lty=2, lwd=2)
# Not sure whether there's any benefit to zooming in on lower p-values. Trend is clearly
# discernible from looking at whole range. Think I should still show p-values up to 1, rather
# than only zoomed in, to give full view of number of cancer-related genes present. If zoom in
# as far as p-values of 0 to 0.01, clearly shows that voom identifies more cancer-related genes
# than HM DD at start, but that is just because of minimum p-value (and actually would not be
# the case if I made a different arbitrary decision and called the minimal value p=0 instead of
# 0.0001 as I have done), so probably not really a disadvantage to show that level of detail.
# However, still don't really get any more information from zoomed-in version than showing full
# range, so think I'll stick with just showing full range.
par(mfcol=c(2,1), mar=c(2,2,2,0.5), mgp=c(3,0.7,0))
plot(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPA,
type="l", lwd=2, col="red", xlim=c(0.03,0.97),
main="Lung adenocarcinoma")
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPB,
col='blue', lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPboth,
col='grey', lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPAonly,
col='red', lty=2, lwd=2)
lines(luad_HM_voom_mean_v_disp_known.genes$pvals, luad_HM_voom_mean_v_disp_known.genes$TPBonly,
col='blue', lty=2, lwd=2)
legend("right",
col=c("red", "blue", "grey", "red", "blue"), lty=c(1,1,1,2,2),
lwd=2, bty='n', cex=1.2, ncol=1,
legend=c("Differential expression",
"Differential dispersion",
"Both",
"Differential expression only",
"Differential dispersion only"))
plot(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPA,
type="l", lwd=2, col="red", xlim=c(0.03,0.97),
main="Prostate adenocarcinoma")
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPB,
col='blue', lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPboth,
col='grey', lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPAonly,
col='red', lty=2, lwd=2)
lines(prad_HM_voom_mean_v_disp_known.genes$pvals, prad_HM_voom_mean_v_disp_known.genes$TPBonly,
col='blue', lty=2, lwd=2)
# Would be best to also (or instead) plot by number of discoveries, and could then plot FDR
# or specificity (or something like them since we don't know true positives, just some sort of
# proxy) rather than just number of cancer-related genes identified.
par(mfrow=c(4,8), mar=c(0.2,1,0.2,0), mgp=c(2,0,0))
for (i in c("brca", "kirc", "thca", "luad", "lihc", "lusc", "prad", "coad")) {
for (j in c("HM_voom", "HMM_voom")) {
for (k in c("known.genes", "pathway.genes.minus.known.genes")) {
if (k == "known.genes") {
l <- "known"
} else {
l <- "pathway"
}
dat <- get(paste0(i, "_", j, "_mean_v_disp_", k))
plot(dat$listA, dat$TPA, type="l", col="red", xaxt="n", ann=F, tcl=0)
lines(dat$listB, dat$TPB, col='blue')
# lines(dat$pvals, dat$TPboth, col='grey')
# find a way of combining results from two methods for given number of disc
# lines(dat$pvals, dat$TPAonly, col='red', lty=2)
# lines(dat$pvals, dat$TPBonly, col='blue', lty=2)
legend('top', legend=paste0(i, " ", j, " ", l), bty="n")
}
}
}
rm(i,j,k,l)
# This isn't really any use until/unless I can get HM(M) to return finer p-values (or use
# smaller samples). But it does show that over the range where HMs go from 0 to some large
# number of positives, voom generally isn't doing much better than picking randomly from among
# the genes with minimal p-values from HM. This might end up being a good thing to include once
# I have finer p-values from HM(M).