-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchaps_similerity_index.R
226 lines (184 loc) · 7.84 KB
/
chaps_similerity_index.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#---------------- chaps_similerity_index.R --------------
# use similarity indexes to find the differences between chaperons
# by comparing between the cancers for each chaperone
#------------------------------
#-------- includes --------
library(readxl)
library(vegan)
library(ggplot2)
library(reshape2)
library(tidyr)
library(readr)
source("functions.r")
#-------- load the networks from an excel file --------
networks <- load_cancer_mats()
sheet_names <- names(networks)
#-------- build cancerXproteins network per chap --------
prots_meta <- read.table("HPC/Mito_genes.tab", sep="\t", header=TRUE,
stringsAsFactors=FALSE, quote="", fill=FALSE)
chaps_meta <- read.table("HPC/Mito_ch_genes.tab", sep="\t", header=TRUE,
stringsAsFactors=FALSE, quote="", fill=FALSE)
prots_meta <- prots_meta[!prots_meta$ENSID %in% chaps_meta$ENSID, ]
# build empty df
empty_net <- data.frame(matrix(nrow=length(sheet_names),
ncol = length(prots_meta$ENSID)))
colnames(empty_net) <- prots_meta$ENSID
row.names(empty_net) <- sheet_names
chap_nets <- list()
for (chap in chaps_meta$Symbol) {
chap_net <- empty_net
for (cancr in sheet_names) {
cncr_df <- networks[[cancr]]
chap_net[cancr,] <- cncr_df[chap,]
}
# add to network list
chap_nets[[chap]] <- chap_net
}
#-------- use vegan::vegdist() --------
# Jaccard index is computed as 2B/(1+B), where B is Bray-Curtis *dissimilarity*.
# so use the 1-P value to use the similarity.
all_simlrs <- matrix(0, nrow = 66, ncol = 0)
for (chapp in names(chap_nets)) {
net <- chap_nets[[chapp]]
res <- vegdist(net, method="jaccard")
simlr <- 1-res
all_simlrs <- cbind(all_simlrs, simlr)
}
colnames(all_simlrs) <- names(chap_nets)
write.csv(all_simlrs, file = "output/jaccard_values_per_chap.csv", row.names = FALSE)
#-------- visualize the results --------
# melt the data to a long format
mlt_sim <- as.data.frame(melt(all_simlrs))
g1 <- ggplot(mlt_sim, aes(x=Var2,y=value))+
geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE)+
theme(axis.text.x=element_text(angle=45, hjust=1))
g1
ggsave("output/paper_figures/chap_jaccard_boxplot.pdf", g1)
#-------- load realized niche to visualize together --------
fold_per <- t(read.csv("output/chap_realized_niche.csv", row.names = 1))
all_simlrs <- read_csv("output/jaccard_values_per_chap.csv")
mlt_sim <- as.data.frame(melt(all_simlrs))
mlt_per <- as.data.frame(melt(fold_per))
g2 <- ggplot(mlt_per, aes(x=Var2,y=value))+
geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE)
g2
# prpepare data to combine
mlt_per$Var1 <- "Realized Niche"
mlt_sim$Var1 <- "Similarity Index"
all <- rbind(mlt_per, mlt_sim)
# build the plot to work togather
g3 <- ggplot(all, aes(x=Var2, y=value, fill=Var1)) +
geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) +
scale_y_continuous(name = "Realized Niche (%)",# Add a second axis:
sec.axis = sec_axis(trans=~.,
name="Similerity (b diversity")) +
labs(fill="Indexes:", x=NULL) + theme(legend.position="top",
axis.text.x=element_text(angle=45, hjust=1))
g3
ggsave("output/similarity_and_percent_boxplot.pdf", g3)
#------- test correlation between median similarity and folding ----------
f <- apply(fold_per,2,median)
s <- apply(all_simlrs,2,median)
f <- f[order(names(f))]
s <- s[order(names(s))]
st <- cor.test(s, f, method="spearman", exact=FALSE)
obs_pval <- st$p.value
obs_rval <- st$estimate
st
# for paper:
min(s)
max(s)
#-------- build a DF for dependency visualization --------
potentials <- read.csv("output/folding_potential.csv", row.names = 1)
expression <- read.csv("output/chap_median_expressions.csv", row.names = 1) # TODO
# make a new df with:
# name | potential | median_similarity | 25q | 75q | med_realized | 25q | 75q
f_s <- t(do.call(cbind, lapply(as.data.frame(fold_per), summary)))
s_s <- t(do.call(cbind, lapply(as.data.frame(all_simlrs), summary)))
e_s <- t(do.call(cbind, lapply(as.data.frame(log10(t(expression))), summary)))
f_s <- as.data.frame(f_s) %>% dplyr::select("1st Qu.", "Median" ,"3rd Qu.") %>%
dplyr::rename(fold_q1 = `1st Qu.`, fold_med = Median, fold_q3 = `3rd Qu.`)
s_s <- as.data.frame(s_s) %>% dplyr::select("1st Qu.", "Median" ,"3rd Qu.") %>%
dplyr::rename(sim_q1 = `1st Qu.`, sim_med = Median, sim_q3 = `3rd Qu.`)
e_s <- as.data.frame(e_s) %>% dplyr::select("1st Qu.", "Median" ,"3rd Qu.") %>%
dplyr::rename(exp_q1 = `1st Qu.`, exp_med = Median, exp_q3 = `3rd Qu.`)
all_stats <- merge(potentials, f_s, by="row.names", all=TRUE)
all_stats <- merge(all_stats, s_s, by.x="Row.names" ,by.y="row.names", all=TRUE)
all_stats <- merge(all_stats, e_s, by.x="Row.names" ,by.y="row.names", all=TRUE)
rownames(all_stats) <- all_stats$Row.names
all_stats <- all_stats[2:length(all_stats)]
all_stats
write.csv(all_stats, file = "output/data/rn_similarity_expr_stats.csv", row.names = TRUE)
# -------- visualize the two (niche ~ similarity) as a dependency --------
sml <- all_stats$sim_med
m_q1 <- all_stats$sim_q1
m_q3 <- all_stats$sim_q3
fol <- all_stats$fold_med
f_q1 <- all_stats$fold_q1
f_q3 <- all_stats$fold_q3
png(filename = "output/paper_figures/similarity_realized_scatter.png")
plot(y = fol, x = sml,
xlim=range(c(sml-m_q1, sml+m_q3)),
ylim=range(c(fol-f_q1, fol+f_q3)),
pch=19, ylab="realized folding", xlab="similarity"
)
# we draw arrows with very special "arrowheads" as whiskers
arrows(sml-m_q1, fol, sml+m_q3, fol, length=0.05, angle=90, code=3) # siml whiskers
arrows(sml, fol-f_q1, sml, fol+f_q3, length=0.05, angle=90, code=3) # fold whiskers
dev.off()
# -------- visualize the similarity vs folding potential --------
pot <- all_stats$potential
med <- all_stats$sim_med
m_q1 <- all_stats$sim_q1
m_q3 <- all_stats$sim_q3
png(filename = "output/paper_figures/similarity_potential_scatter.png")
plot(y = pot, x = med,
xlim=range(c(med-m_q1, med+m_q3)),
pch=19, ylab="Potential", xlab="similarity",
main="jaccard vs folding potential dapancdency"
)
# hack: we draw arrows but with very special "arrowheads"
arrows(med-m_q1, pot, med+m_q3, pot, length=0.05, angle=90, code=3)
text(med, pot, all_stats$Row.names, pos=1)
dev.off()
#----------- visualize (niche ~ potential) as a dependency ----------
# this doesn't really belong here logically but all that is needed for it is here.
pot <- all_stats$potential
med <- all_stats$fold_med
m_q1 <- all_stats$fold_q1
m_q3 <- all_stats$fold_q3
png(filename = "output/potential_vs_realized_scatter.png")
plot(y = pot, x = med,
xlim=range(c(med-m_q1, med+m_q3)),
pch=19, ylab="Potential", xlab="realized folding",
main="realized folding vs folding potential dapancdency"
)
arrows(med-m_q1, pot, med+m_q3, pot, length=0.05, angle=90, code=3)
text(med, pot, all_stats$Row.names, pos=1)
dev.off()
#----------- visualize (similarity ~ expression) as a dependency ----------
sml <- all_stats$sim_med
m_q1 <- all_stats$sim_q1
m_q3 <- all_stats$sim_q3
exp <- all_stats$exp_med
e_q1 <- all_stats$exp_q1
e_q3 <- all_stats$exp_q3
png(filename = "output/similarity_expression_scatter.png")
plot(y = exp, x = sml,
xlim=range(c(sml-m_q1, sml+m_q3)),
ylim=range(c(exp-e_q1, exp+e_q3)),
pch=19, ylab="Log10 expression", xlab="similarity",
main="jaccard vs log10 expression levels dapancdency"
)
# we draw arrows with very special "arrowheads" as whiskers
arrows(sml-m_q1, exp, sml+m_q3, exp, length=0.05, angle=90, code=3) # siml whiskers
arrows(sml, exp-e_q1, sml, exp+e_q3, length=0.05, angle=90, code=3) # expression whiskers
dev.off()
#------- test correlation between median similarity and expression ----------
st <- cor.test(sml, exp, method="spearman", exact=FALSE)
obs_pval <- st$p.value
obs_rval <- st$estimate
st