-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathidentify_hvg.Rmd
313 lines (227 loc) · 9.89 KB
/
identify_hvg.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
---
title: "Finding HVG"
author: "Yifan Duan"
date: "2024-09-17"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(Seurat)
library(BPCells)
library(dplyr)
library(ggplot2)
library(ggrepel)
library(patchwork)
library(Matrix)
library(cowplot)
library(irlba)
library(tidyr)
library(tibble)
library(readr)
```
```{r}
neuron_obj <- readRDS("data/neuron.Rds")
# all of the ORs with "real" effect
or_list <- read.table("data/Olfr_GO_unique_Rel94.txt", header = T, sep = "\t", quote = "")
# subsetting the neuron to mature neuron only
mature_neuron_obj <- subset(neuron_obj, subset = RNA_snn_res.0.25 %in% c("1", "2", "4", "7", "8"), features = or_list$Gene.name)
# getting the raw counts
counts_matrix <- mature_neuron_obj[["RNA"]]$counts
# more than 3 UMIs counts as OR expressed
counts_matrix_dpi <- as.data.frame(t(apply(counts_matrix, 2, function(x) ifelse(x >= 3, 1, 0))))
# filtering out the cell that expresses more than 1 OR
counts_matrix_dpi <- counts_matrix_dpi |> filter(rowSums(counts_matrix_dpi) == 1)
counts_matrix_dpi$dpi <- as.factor(sub("_.*", "", rownames(counts_matrix_dpi)))
# converting into dataframe per dpi for each OR (to select them later)
dpi_or_composition <- counts_matrix_dpi %>%
pivot_longer(cols = -dpi, names_to = "ORs", values_to = "count") %>%
group_by(ORs, dpi) %>%
summarize(count = sum(count), .groups = 'drop') %>%
pivot_wider(names_from = dpi, values_from = count, values_fill = list(count = 0))
# only if it's 4 or more cells in each dpi for each OR
dpi_or_composition_filtered <- dpi_or_composition |>
filter(if_all(where(is.numeric), ~ . >= 4))
ORs_interested <- dpi_or_composition_filtered$ORs
```
```{r}
# using OR_filtered list containing 308 ORs to get cell expressing them only
cell_interested <- counts_matrix_dpi[, colnames(counts_matrix_dpi) %in% ORs_interested]
all_zero_rows <- apply(cell_interested, 1, function(row) all(row == 0))
# Display the rows that are cell with 1 ORs
cell_interested <- cell_interested |> filter(all_zero_rows == F)
```
```{r}
sample_ORs_by_dpi <- function(data, n = 4) {
cell_to_subsample <- c()
for (OR_names in colnames(data)){
OR_data <- data |> select(OR_names) |> filter(get(OR_names) != 0)
# adding cell name & dpi as column
OR_data <- OR_data |> mutate(cell_name = rownames(OR_data),
dpi = as.factor(sub("_.*", "", rownames(OR_data))),
OR = OR_names)
OR_samples <- OR_data |>
group_by(dpi) |> sample_n(min(n(), n), replace = FALSE)
cell_to_subsample <- unique(c(cell_to_subsample, OR_samples$cell_name))
}
return(cell_to_subsample)
}
# 4 cells * 5 dpi * 308 ORs = 6160 cells (if done correctly)
test <- sample_ORs_by_dpi(cell_interested, 4)
length(test)
length(unique(test)) # 5658 unique, not equal to 6160??
```
```{r}
all_HVGs <- list()
set.seed(25)
for (i in 1:10){
# selecting the cell that express each OR of interest (3 UMIs + at least 4 cells per dpi)
# selecting 4 cells per OR per dpi would mediate effect of abundant OR
barcode_interested <- sample_ORs_by_dpi(cell_interested)
# subset the seurat object to the criteria listed above
filtered_mature_neuron_obj <- subset(neuron_obj,
features = setdiff(rownames(neuron_obj), or_list$Gene.name),
cells = barcode_interested)
# find HVGs
filtered_mature_neuron_obj <- FindVariableFeatures(filtered_mature_neuron_obj,
selection.method = "vst",
nfeatures = 2000,
layer = "data")
HVGs <- VariableFeatures(filtered_mature_neuron_obj)
all_HVGs[[i]] <- HVGs
}
gene_freq_df <- as.data.frame(table(unlist(all_HVGs)))
colnames(gene_freq_df) <- c("gene", "freq")
write.table(gene_freq_df, file = "data/gene_freq_10iterations.csv", sep = ",", row.names = FALSE, col.names = TRUE)
#HVFInfo(filtered_mature_neuron_obj)
```
```{r}
gene_freq_df <- read_csv("data/gene_freq_10iterations.csv")
gene_freq_df |> ggplot(aes(x = as.factor(freq))) + geom_bar(stat = "count") +
theme_cowplot() + ggtitle("2000 HVGs over 10 iterations")
hvg_filtered <- gene_freq_df |> filter(freq > 1)
rheostat_hvg <- read_csv("data/rheostat_table_s2.csv")
library(ggvenn)
# Define the two sets
hvg_overlap <- list(rheostat_hvg = rheostat_hvg$Gene,
influenza_hvg = as.character(hvg_filtered$gene))
ggvenn(hvg_overlap)
```
```{r}
# cosine similarity
library(lsa)
filtered_mature_neuron_obj <- ScaleData(filtered_mature_neuron_obj)
filtered_mature_neuron_mat <- filtered_mature_neuron_obj[["RNA"]]$scale.data
filtered_mature_neuron_mat_hvg <- filtered_mature_neuron_mat[VariableFeatures(filtered_mature_neuron_obj),]
# saves time according to BPCells
filtered_mature_neuron_mat_hvg <- filtered_mature_neuron_mat_hvg %>% write_matrix_dir(tempfile("mat"))
# PCA
svd <- BPCells::svds(filtered_mature_neuron_mat_hvg, k = 50)
#svd <- irlba(neuron_mat_norm, nv=50)
pca <- multiply_cols(svd$v, svd$d)
colnames(pca) <- paste0("PC_", 1:ncol(pca))
rownames(pca) <- colnames(filtered_mature_neuron_mat_hvg)
plot(svd$d / sqrt(nrow(filtered_mature_neuron_mat_hvg)))
pca_top_30 <- pca[, 1:30]
# compute the cosine distance
cos_dist <- 1 - cosine(t(pca_top_30))
cos_dist[1:30, 1:3]
#write.csv(cos_dist, file = "data/cos_dist.csv")
```
```{r}
# returns the dpi and OR label for each cell
get_cell_metadata <- function(feature_table, OR_list){
cell_metadata <- data.frame()
for (OR_names in OR_list) {
OR_data <- feature_table |>
select(OR_names) |>
filter(get(OR_names) != 0)
# Adding cell name & dpi as columns
OR_data <- OR_data |>
mutate(cell_name = rownames(OR_data),
dpi = as.factor(sub("_.*", "", rownames(OR_data))),
OR = OR_names) |>
select(-all_of(OR_names)) # Use all_of() to correctly reference the column
# Combine results
cell_metadata <- bind_rows(cell_metadata, OR_data)
}
return(cell_metadata)
}
cell_metadata <- get_cell_metadata(cell_interested, ORs_interested)
write_csv(cell_metadata, file = "data/cell_metadata.csv")
# View the result
dim(cos_dist)
subset_cell_metadata <- cell_metadata[cell_metadata$cell_name %in% barcode_interested, ]
```
```{r within OR distance}
within_dist <- function(dist_matrix, metadata){
# Extract the OR groups from the metadata
unique_ors <- unique(metadata$OR)
# Initialize a vector to store the median within-OR distances
within_or_medians <- data.frame(OR = character(), median_dist = numeric())
for (or in unique_ors) {
# Get cell names for the current OR
cells_for_or <- metadata$cell_name[metadata$OR == or]
# Filter the cosine distance matrix for these cells
matrix_for_or <- dist_matrix[cells_for_or, cells_for_or, drop = F]
# Extract the upper triangle of the matrix as a vector of distances
upper_triangle_distances <- matrix_for_or[upper.tri(matrix_for_or)]
# Calculate the median of the pairwise distances
median_distance <- median(upper_triangle_distances, na.rm = TRUE)
within_or_medians <- rbind(within_or_medians, data.frame(OR = or, median_dist = median_distance))
}
return(within_or_medians)
}
median_distance <- within_dist(cos_dist, subset_cell_metadata)
median_distance |> ggplot(aes(x = median_dist)) + geom_density() + theme_cowplot() +
xlab("Cosine distance")
```
```{r}
OR_names <- unique(subset_cell_metadata$OR) # Use unique OR names to avoid redundant calculations
cell_name_by_OR <- split(subset_cell_metadata$cell_name, subset_cell_metadata$OR) # Pre-split by OR
between_dist <- list()
# Loop through each OR to use as the comparison base
for (comparison_base in OR_names) {
base_cells <- cell_name_by_OR[[comparison_base]] # Get base cells for current OR
median_distances <- list() # Store median distances for this base OR
# Loop through each other OR group
for (OR in OR_names) {
if (OR != comparison_base) {
comparison_cells <- cell_name_by_OR[[OR]] # Get comparison cells for current OR
# Subset the cos_dist matrix directly, avoiding `outer`
dist_matrix <- cos_dist[base_cells, comparison_cells]
# Calculate the median distance and store
median_distances[[OR]] <- median(dist_matrix)
}
}
# Store the median of medians for the current comparison base OR
between_dist[[comparison_base]] <- unlist(median_distances)
}
# Convert the results into a named vector if necessary
between_dist <- unlist(between_dist)
```
```{r}
ggplot() +
geom_histogram(aes(x = median_distance$median_dist, y = ..density.., fill = "Within OR"), alpha = 0.5) +
geom_histogram(aes(x = between_dist, y = ..density.., fill = "Between ORs"), alpha = 0.5) +
scale_fill_manual(name = "Group", values = c("Within OR" = "blue", "Between ORs" = "red")) +
theme_cowplot() + xlab("Cosine distance")
```
```{r}
# create a null distribution (shuffling OR labels)
subset_cell_metadata_shuffled <- subset_cell_metadata
within_dist_shuffled <- list()
for (i in 1:1000){
subset_cell_metadata_shuffled$OR <- sample(subset_cell_metadata_shuffled$OR, length(subset_cell_metadata_shuffled$OR))
within_dist_shuffled[[i]] <- within_dist(cos_dist, subset_cell_metadata_shuffled)
}
within_dist_shuffled <- bind_rows(within_dist_shuffled)
hist(within_dist_shuffled$median_dist)
ggplot() +
geom_density(aes(x = median_distance$median_dist, y = ..density.., fill = "Within OR"), alpha = 0.6) +
geom_density(aes(x = between_dist, y = ..density.., fill = "Between ORs"), alpha = 0.6) +
geom_density(aes(x = within_dist_shuffled$median_dist, y = ..density.., fill = "Shuffled 1000 times (within OR)"), alpha = 0.6) +
scale_fill_manual(name = "Group", values = c("Within OR" = "blue", "Between ORs" = "red", "Shuffled 1000 times (within OR)" = "green")) +
theme_cowplot() + xlab("Cosine distance")
```