-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathstatistical_testing_ratios.R
332 lines (282 loc) · 13.3 KB
/
statistical_testing_ratios.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
# Analysis of IOVS mouse lens data (Supplemental Table S01):
# Khan, Shahid Y., et al. "Proteome Profiling of Developing Murine Lens Through Mass Spectrometry."
# Investigative Ophthalmology & Visual Science 59.1 (2018): 100-107.
# load libraries
library(tidyverse) # modern R packages for big data analysis
library(limma) # edgeR will load this if we do not
library(edgeR)
# read the Supplemental 01 file (saved as a CSV export from XLSX file)
data_start <- read_csv("iovs-58-13-55_s01.csv")
# filter out proteins not seen in all three runs
data_no_na <- na.omit(data_start)
# fix the column headers
col_headers <- colnames(data_no_na)
col_headers <- str_replace(col_headers, " {2,3}", " ")
col_headers <- str_replace(col_headers, "Reporter ion intensities ", "")
colnames(data_no_na) <- col_headers
# save the annotation columns (gene symbol and protein accession) for later and remove from data frame
annotate_df <- data_no_na[1:2]
data_raw <- data_no_na[3:20]
row.names(data_raw) <- annotate_df$`Protein Accession No.`
# separate the TMT data by experiment
exp1_raw <- data_raw[c(1:6)]
exp2_raw <- data_raw[c(7:12)]
exp3_raw <- data_raw[c(13:18)]
# figure out the global target scaling value
target <- mean(c(colSums(exp1_raw), colSums(exp2_raw), colSums(exp3_raw)))
# do the sample loading normalizations (scale to the target value)
norm_facs <- target / colSums(exp1_raw)
exp1_sl <- sweep(exp1_raw, 2, norm_facs, FUN = "*")
norm_facs <- target / colSums(exp2_raw)
exp2_sl <- sweep(exp2_raw, 2, norm_facs, FUN = "*")
norm_facs <- target / colSums(exp3_raw)
exp3_sl <- sweep(exp3_raw, 2, norm_facs, FUN = "*")
data_sl <- cbind(exp1_sl, exp2_sl, exp3_sl)
# make data frame with row means from each TMT experiment
refs <- tibble(rowMeans(exp1_sl), rowMeans(exp2_sl), rowMeans(exp3_sl))
colnames(refs) <- c("ave1", "ave2", "ave3")
# make new data frame with ratio values (time points divided by the "reference")
data_ratio <- exp1_sl / refs$ave1
data_ratio <- cbind(data_ratio, exp2_sl / refs$ave2)
data_ratio <- cbind(data_ratio, exp3_sl / refs$ave3)
row.names(data_ratio) <- annotate_df$`Protein Accession No.`
# let's see what box plots look like
boxplot(data_ratio, col = rep(c("red", "green", "blue"), each = 6), main = "Ratio to 'average' standard")
boxplot(log2(data_ratio), col = rep(c("red", "green", "blue"), each = 6),
main = "Log2 ratio to 'average' standard")
plotDensities(log2(data_ratio), col = rep(c("red", "green", "blue"), 6), main = "Ratios")
# function computes CVs per time point
make_CVs <- function(df) {
# separate by time points
E15 <- df[c(1, 7, 13)]
E18 <- df[c(2, 8, 14)]
P0 <- df[c(3, 9, 15)]
P3 <- df[c(4, 10, 16)]
P6 <- df[c(5, 11, 17)]
P9 <- df[c(6, 12, 18)]
E15$ave <- rowMeans(E15)
E15$sd <- apply(E15[1:3], 1, sd)
E15$cv <- 100 * E15$sd / E15$ave
E18$ave <- rowMeans(E18)
E18$sd <- apply(E18[1:3], 1, sd)
E18$cv <- 100 * E18$sd / E18$ave
P0$ave <- rowMeans(P0)
P0$sd <- apply(P0[1:3], 1, sd)
P0$cv <- 100 * P0$sd / P0$ave
P3$ave <- rowMeans(P3)
P3$sd <- apply(P3[1:3], 1, sd)
P3$cv <- 100 * P3$sd / P3$ave
P6$ave <- rowMeans(P6)
P6$sd <- apply(P6[1:3], 1, sd)
P6$cv <- 100 * P6$sd / P6$ave
P9$ave <- rowMeans(P9)
P9$sd <- apply(P9[1:3], 1, sd)
P9$cv <- 100 * P9$sd / P9$ave
ave_df <- data.frame(E15$ave, E18$ave, P0$ave, P3$ave, P6$ave, P9$ave)
sd_df <- data.frame(E15$sd, E18$sd, P0$sd, P3$sd, P6$sd, P9$sd)
cv_df <- data.frame(E15$cv, E18$cv, P0$cv, P3$cv, P6$cv, P9$cv)
return(list(ave_df, sd_df, cv_df))
}
list_ratio <- make_CVs(data_ratio)
# compare CV distributions
boxplot(list_ratio[[3]], notch = TRUE, main = "Ratios CVs (14.3% ave)", ylim = c(0, 150))
round(mean(apply(list_ratio[[3]], 2, median)), 2)
plotMDS(log2(data_ratio), col = rep(c("red", "green", "blue"), each = 6),
main = "log2 of ratios to 'standard'")
plotMDS(log2(data_ratio), col = c("red", "orange", "black", "green", "blue", "violet"),
main = "log2 of ratios to 'standard'")
# collect the 6 early versus the 6 late columns
early <- data_ratio[c(1, 2, 7, 8, 13, 14)]
late <- data_ratio[c(5, 6, 11, 12, 17, 18)]
# get the data into a separate data frame so we can apply the t-test to each row
# to be safe, take the log2 of the ratios (maybe this helps for ratios less than one?)
pair <- data.frame(log2(early), log2(late))
# add average ratio columns (non-logged ratios), fold-change column, and row names
pair$ave_early <- rowMeans(early)
pair$ave_late <- rowMeans(late)
pair$logFC <- log2(pair$ave_late / pair$ave_early)
row.names(pair) <- annotate_df$`Protein Accession No.`
# apply the t-test (from:https://stackoverflow.com/questions/25496693/
# how-to-do-t-test-two-samples-to-rows-of-a-dataframe-using-apply-family-functio)
# data is set up to do the two-sample t.test (a built-in) on each row
t.result <- apply(pair, 1, function(x) t.test(x[1:6], x[7:12]))
# extract the p-value column from the t-test thingy
pair$p_value <- unlist(lapply(t.result, function(x) x$p.value))
# do a Benjamini-Hochberg multiple testing correction
pair$fdr <- p.adjust(pair$p_value, method = "BH")
head(pair)
# add a DE candidate status column
pair$candidate <- "no"
pair[which(pair$fdr <= 0.10 & pair$fdr > 0.05), dim(pair)[2]] <- "low"
pair[which(pair$fdr <= 0.05 & pair$fdr > 0.01), dim(pair)[2]] <- "med"
pair[which(pair$fdr <= 0.01), dim(pair)[2]] <- "high"
pair$candidate <- as.factor(pair$candidate)
# count up, down and the rest (FDR less than 0.05)
all <- dim(pair)[1]
up <- dim(pair[(pair$fdr <= 0.05) & (pair$logFC > 0.0), ])[1]
down <- dim(pair[(pair$fdr <= 0.05) & (pair$logFC <= 0.0), ])[1]
up
all - up - down
down
# what does the test p-value distribution look like?
ggplot(pair, aes(p_value)) +
geom_histogram(bins = 100, fill = "white", color = "black") +
geom_hline(yintercept = mean(hist(pair$p_value, breaks = 100, plot = FALSE)$counts[26:100])) +
ggtitle("log2 ratios t-test: p-value distribution")
# make the combined candidate corelation plot
de_ratio <- data.frame(pair$ave_early, pair$ave_late, pair$candidate)
colnames(de_ratio) <- c("early", "late", "candidate")
ggplot(de_ratio, aes(x = early, y = late)) +
geom_point(aes(color = candidate, shape = candidate)) +
ggtitle("t-test ratios: early vs late")
# make separate corelation plots
ggplot(de_ratio, aes(x = early, y = late)) +
geom_point(aes(color = candidate, shape = candidate)) +
facet_wrap(~ candidate) +
ggtitle("t-test ratios: separated by candidate")
# make a volcano plot
vc <- data.frame(-1*pair$logFC, -1*log10(pair$fdr), pair$candidate)
colnames(vc) <- c("logFC", "fdr", "candidate")
ggplot(vc, aes(x = logFC, y = fdr)) +
geom_point(aes(color = candidate, shape = candidate)) +
xlab("Fold-Change (Log2)") +
ylab("-Log10 FDR") +
ggtitle("t-test ratios: Volcano Plot")
# create a basic design matrix
group = as.factor(rep(c("early", "early", "middle", "middle", "late", "late"), 3))
group <- factor(group, levels(group)[c(1, 3, 2)]) # set the factor order
design <- model.matrix(~ 0 + group)
colnames(design) <- c("early", "middle", "late")
design
# the contrast is where the early versus late sub-comparison is selected
contrast <- makeContrasts(late-early, levels = design)
contrast
# do the linear model fitting
data_limma <- log2(data_ratio)
fit <- lmFit(data_limma, design)
# get the fit for the contrast of interest
fit2 <- contrasts.fit(fit, contrast)
# do the empirical Bayes moderation of the test statistic
fit2 <- eBayes(fit2, trend = TRUE)
# grab the information in topTable so we can get the data to plot candidates
# the coef parameter has to do with the contrast of interest
# specify no sorting of results and a number that is longer than the data table
tt_limma <- topTable(fit2, coef = 1, sort.by = "none", number = 10000)
# let's see what columns we have
head(tt_limma)
summary(decideTests(fit2))
# add average ratio columns (not logged), fold-change, and row names
pair <- data_limma
pair$ave_early <- rowMeans(early)
pair$ave_late <- rowMeans(late)
pair$logFC <- log2(pair$ave_late / pair$ave_early)
row.names(pair) <- annotate_df$`Protein Accession No.`
# get the p-values and FDRs from the top tests
pair$p_value <- tt_limma$P.Value
pair$fdr <- tt_limma$adj.P.Val
# see if we have what we want
# head(pair)
# add a DE candidate status column
pair$candidate <- "no"
pair[which(pair$fdr <= 0.10 & pair$fdr > 0.05), dim(pair)[2]] <- "low"
pair[which(pair$fdr <= 0.05 & pair$fdr > 0.01), dim(pair)[2]] <- "med"
pair[which(pair$fdr <= 0.01), dim(pair)[2]] <- "high"
pair$candidate <- as.factor(pair$candidate)
# count up, down and the rest (FDR less than 0.05)
all <- dim(pair)[1]
up <- dim(pair[(pair$fdr <= 0.05) & (pair$logFC > 0.0), ])[1]
down <- dim(pair[(pair$fdr <= 0.05) & (pair$logFC <= 0.0), ])[1]
up
all - up - down
down
# what does the test p-value distribution look like?
ggplot(pair, aes(p_value)) +
geom_histogram(bins = 100, fill = "white", color = "black") +
geom_hline(yintercept = mean(hist(pair$p_value, breaks = 100, plot = FALSE)$counts[26:100])) +
ggtitle("log2 ratios t-test: p-value distribution")
# make the combined candidate corelation plot
de_ratio <- data.frame(pair$ave_early, pair$ave_late, pair$candidate)
colnames(de_ratio) <- c("early", "late", "candidate")
ggplot(de_ratio, aes(x = early, y = late)) +
geom_point(aes(color = candidate, shape = candidate)) +
ggtitle("limma log ratios: early vs late")
# make separate corelation plots
ggplot(de_ratio, aes(x = early, y = late)) +
geom_point(aes(color = candidate, shape = candidate)) +
facet_wrap(~ candidate) +
ggtitle("limma log ratios: separated by candidate")
# make a volcano plot
vc <- data.frame(-1*pair$logFC, -1*log10(pair$fdr), pair$candidate)
colnames(vc) <- c("logFC", "fdr", "candidate")
ggplot(vc, aes(x = logFC, y = fdr)) +
geom_point(aes(color = candidate, shape = candidate)) +
xlab("Fold-Change (Log2)") +
ylab("-Log10 FDR") +
ggtitle("limma log ratios: Volcano Plot")
# collect up some results to write out
final <- cbind(annotate_df, data_raw, data_sl, refs, data_ratio, pair)
write.csv(final, file = "final_part3.csv")
library(psych)
# lets compare the combination of SL and TMM normalizations to SL/IRS/TMM
# again using the idea that replicates of the same time point should similar
pairs.panels(log2(data_ratio[c(1, 7, 13)]), lm = TRUE, main = "SL/Ratio E15")
pairs.panels(log2(data_ratio[c(2, 8, 14)]), lm = TRUE, main = "SL/Ratio E18")
pairs.panels(log2(data_ratio[c(3, 9, 15)]), lm = TRUE, main = "SL/Ratio P0")
pairs.panels(log2(data_ratio[c(4, 10, 16)]), lm = TRUE, main = "SL/Ratio P3")
pairs.panels(log2(data_ratio[c(5, 11, 17)]), lm = TRUE, main = "SL/Ratio P6")
pairs.panels(log2(data_ratio[c(6, 12, 18)]), lm = TRUE, main = "SL/Ratio P9")
# create some functions
subset_proteins <- function(df, proteins, labels, value_col) {
df_subset <- subset(df, df$protein %in% proteins)
df_subset <- df_subset[match(proteins, df_subset$protein), ]
df_subset$proteins <- as.factor(labels)
keycol <- "time"
valuecol <- value_col
gathercols <- colnames(df)[1:6]
return(gather_(df_subset, keycol, valuecol, gathercols))
}
plot_proteins <- function(df_list, annotate_df) {
ave_df <- df_list[[1]]
sd_df <- df_list[[2]]
# lists of lens proteins to extract and plot
alphas <- c("P24622", "P23927")
alpha_labels <- c("alphaA", "alphaB")
betas <- c("P02525", "Q9JJV1", "Q9JJV0","Q9WVJ5", "P62696", "Q9JJU9", "O35486")
beta_labels <- c("betaA1", "betaA2", "betaA4", "betaB1", "betaB2", "betaB3", "betaS")
gammas <- c("P04345", "P04344", "Q61597", "P04342", "Q03740", "Q9CXV3", "Q8VHL5")
gamma_labels <- c("gammaA", "gammaB", "gammaC", "gammaD", "gammaE", "gammaF", "gammaN")
ave_df$protein <- annotate_df$`Protein Accession No.`
sd_df$protein <- annotate_df$`Protein Accession No.`
# plot alphas
ave_df_wide <- subset_proteins(ave_df, alphas, alpha_labels, "intensity")
sd_df_wide <- subset_proteins(sd_df, alphas, alpha_labels, "sd")
ave_df_wide$sd <- sd_df_wide$sd
alpha_plot <- ggplot(ave_df_wide, aes(time, intensity, fill=proteins)) +
geom_bar(stat="identity", position=position_dodge(), colour="black") +
geom_errorbar(aes(ymin = intensity-sd, ymax = intensity+sd), position = position_dodge(0.9), width = 0.4) +
xlab("Developmental Time") + ylab("Protein Intensity Ratio") +
ggtitle("Alpha crystallin expression")
print(alpha_plot)
# plot betas
ave_df_wide <- subset_proteins(ave_df, betas, beta_labels, "intensity")
sd_df_wide <- subset_proteins(sd_df, betas, beta_labels, "sd")
ave_df_wide$sd <- sd_df_wide$sd
beta_plot <- ggplot(ave_df_wide, aes(time, intensity, fill=proteins)) +
geom_bar(stat="identity", position=position_dodge(), colour="black") +
geom_errorbar(aes(ymin = intensity-sd, ymax = intensity+sd), position = position_dodge(0.9), width = 0.4) +
xlab("Developmental Time") + ylab("Protein Intensity Ratio") +
ggtitle("Beta crystallin expression")
print(beta_plot)
# plot gammas
ave_df_wide <- subset_proteins(ave_df, gammas, gamma_labels, "intensity")
sd_df_wide <- subset_proteins(sd_df, gammas, gamma_labels, "sd")
ave_df_wide$sd <- sd_df_wide$sd
gamma_plot <- ggplot(ave_df_wide, aes(time, intensity, fill=proteins)) +
geom_bar(stat="identity", position=position_dodge(), colour="black") +
geom_errorbar(aes(ymin = intensity-sd, ymax = intensity+sd), position = position_dodge(0.9), width = 0.4) +
xlab("Developmental Time") + ylab("Protein Intensity Ratio") +
ggtitle("Gamma crystallin expression")
print(gamma_plot)
return(NULL)
}
plot_proteins(list_ratio, annotate_df)