Skip to content

Commit 5049f3f

Browse files
committed
transcriptomic distance
1 parent d5bf9c6 commit 5049f3f

8 files changed

+504
-134
lines changed

.DS_Store

0 Bytes
Binary file not shown.

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,4 @@ rsconnect/
5050

5151
# data files
5252
data/
53+
paper/

figure/OSN_OR_counts.pdf

6.59 KB
Binary file not shown.

figure/cell_threshold.pdf

5.29 KB
Binary file not shown.

generate_h5ad.ipynb

+13-46
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,19 @@
609609
"cell_type": "code",
610610
"execution_count": 17,
611611
"metadata": {},
612-
"outputs": [],
612+
"outputs": [
613+
{
614+
"ename": "",
615+
"evalue": "",
616+
"output_type": "error",
617+
"traceback": [
618+
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
619+
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
620+
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
621+
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
622+
]
623+
}
624+
],
613625
"source": [
614626
"# need to be float for BPCells, otherwise you gonna spend 2hr debugging it\n",
615627
"adata_corrected.X = adata_corrected.X.astype(np.float64)\n",
@@ -625,51 +637,6 @@
625637
"source": [
626638
"adata_corrected.X.dtype\n"
627639
]
628-
},
629-
{
630-
"cell_type": "code",
631-
"execution_count": 11,
632-
"metadata": {},
633-
"outputs": [
634-
{
635-
"name": "stderr",
636-
"output_type": "stream",
637-
"text": [
638-
"/opt/anaconda3/lib/python3.12/site-packages/anndata/__init__.py:55: FutureWarning: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.\n",
639-
" warnings.warn(\n"
640-
]
641-
}
642-
],
643-
"source": [
644-
"flu_data = ad.read('data/flu_processed_backup.h5ad')"
645-
]
646-
},
647-
{
648-
"cell_type": "code",
649-
"execution_count": 13,
650-
"metadata": {},
651-
"outputs": [
652-
{
653-
"data": {
654-
"text/plain": [
655-
"dtype('float64')"
656-
]
657-
},
658-
"execution_count": 13,
659-
"metadata": {},
660-
"output_type": "execute_result"
661-
}
662-
],
663-
"source": [
664-
"flu_data.X.dtype"
665-
]
666-
},
667-
{
668-
"cell_type": "code",
669-
"execution_count": null,
670-
"metadata": {},
671-
"outputs": [],
672-
"source": []
673640
}
674641
],
675642
"metadata": {

identify_hvg.Rmd

+269
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,269 @@
1+
---
2+
title: "Finding HVG"
3+
author: "Yifan Duan"
4+
date: "2024-09-17"
5+
output: html_document
6+
---
7+
8+
```{r setup, include=FALSE}
9+
knitr::opts_chunk$set(echo = TRUE)
10+
```
11+
12+
13+
```{r}
14+
library(Seurat)
15+
library(BPCells)
16+
library(dplyr)
17+
library(ggplot2)
18+
library(ggrepel)
19+
library(patchwork)
20+
library(Matrix)
21+
library(cowplot)
22+
library(irlba)
23+
library(tidyr)
24+
library(tibble)
25+
```
26+
27+
28+
```{r}
29+
neuron_obj <- readRDS("data/neuron.Rds")
30+
# all of the ORs with "real" effect
31+
or_list <- read.table("data/Olfr_GO_unique_Rel94.txt", header = T, sep = "\t", quote = "")
32+
33+
# subsetting the neuron to mature neuron only
34+
mature_neuron_obj <- subset(neuron_obj, subset = RNA_snn_res.0.25 %in% c("1", "2", "4", "7", "8"), features = or_list$Gene.name)
35+
36+
# getting the raw counts
37+
counts_matrix <- mature_neuron_obj[["RNA"]]$counts
38+
# more than 3 UMIs counts as OR expressed
39+
counts_matrix_dpi <- as.data.frame(t(apply(counts_matrix, 2, function(x) ifelse(x >= 3, 1, 0))))
40+
# filtering out the cell that expresses more than 1 OR
41+
counts_matrix_dpi <- counts_matrix_dpi |> filter(rowSums(counts_matrix_dpi) == 1)
42+
43+
counts_matrix_dpi$dpi <- as.factor(sub("_.*", "", rownames(counts_matrix_dpi)))
44+
45+
# converting into dataframe per dpi for each OR (to select them later)
46+
dpi_or_composition <- counts_matrix_dpi %>%
47+
pivot_longer(cols = -dpi, names_to = "ORs", values_to = "count") %>%
48+
group_by(ORs, dpi) %>%
49+
summarize(count = sum(count), .groups = 'drop') %>%
50+
pivot_wider(names_from = dpi, values_from = count, values_fill = list(count = 0))
51+
52+
# only if it's 4 or more cells in each dpi for each OR
53+
dpi_or_composition_filtered <- dpi_or_composition |>
54+
filter(if_all(where(is.numeric), ~ . >= 4))
55+
56+
ORs_interested <- dpi_or_composition_filtered$ORs
57+
```
58+
59+
60+
```{r}
61+
# using OR_filtered list containing 308 ORs to get cell expressing them only
62+
cell_interested <- counts_matrix_dpi[, colnames(counts_matrix_dpi) %in% ORs_interested]
63+
all_zero_rows <- apply(cell_interested, 1, function(row) all(row == 0))
64+
65+
# Display the rows that are all zero
66+
cell_interested <- cell_interested |> filter(all_zero_rows == F)
67+
68+
```
69+
70+
71+
```{r}
72+
sample_ORs_by_dpi <- function(data, n = 4) {
73+
cell_to_subsample <- c()
74+
75+
for (OR_names in colnames(data)){
76+
OR_data <- data |> select(OR_names) |> filter(get(OR_names) != 0)
77+
78+
# adding cell name & dpi as column
79+
OR_data <- OR_data |> mutate(cell_name = rownames(OR_data),
80+
dpi = as.factor(sub("_.*", "", rownames(OR_data))),
81+
OR = OR_names)
82+
OR_samples <- OR_data |>
83+
group_by(dpi) |> sample_n(min(n(), n), replace = FALSE)
84+
85+
cell_to_subsample <- unique(c(cell_to_subsample, OR_samples$cell_name))
86+
}
87+
return(cell_to_subsample)
88+
}
89+
90+
91+
# 4 cells * 5 dpi * 308 ORs = 6160 cells (if done correctly)
92+
test <- sample_ORs_by_dpi(cell_interested, 4)
93+
length(test)
94+
length(unique(test)) # 5658 unique, not equal to 6160??
95+
```
96+
97+
98+
```{r eval=FALSE}
99+
all_HVGs <- list()
100+
set.seed(25)
101+
102+
for (i in 1:10){
103+
# selecting the cell that express each OR of interest (3 UMIs + at least 4 cells per dpi)
104+
# selecting 4 cells per OR per dpi would mediate effect of abundant OR
105+
barcode_interested <- sample_ORs_by_dpi(cell_interested)
106+
107+
# subset the seurat object to the criteria listed above
108+
filtered_mature_neuron_obj <- subset(neuron_obj,
109+
features = setdiff(rownames(neuron_obj), or_list$Gene.name),
110+
cells = barcode_interested)
111+
112+
# find HVGs
113+
filtered_mature_neuron_obj <- FindVariableFeatures(filtered_mature_neuron_obj,
114+
selection.method = "vst",
115+
nfeatures = 2000,
116+
layer = "data")
117+
118+
HVGs <- VariableFeatures(filtered_mature_neuron_obj)
119+
all_HVGs[[i]] <- HVGs
120+
}
121+
122+
gene_freq_df <- as.data.frame(table(unlist(all_HVGs)))
123+
colnames(gene_freq_df) <- c("gene", "freq")
124+
125+
write.table(gene_freq_df, file = "data/gene_freq_10iterations.csv", sep = ",", row.names = FALSE, col.names = TRUE)
126+
127+
#HVFInfo(filtered_mature_neuron_obj)
128+
```
129+
130+
131+
```{r}
132+
gene_freq_df <- read_csv("data/gene_freq_10iterations.csv")
133+
gene_freq_df |> ggplot(aes(x = as.factor(freq))) + geom_bar(stat = "count") +
134+
theme_cowplot() + ggtitle("2000 HVGs over 10 iterations")
135+
136+
hvg_filtered <- gene_freq_df |> filter(freq > 1)
137+
rheostat_hvg <- read_csv("data/rheostat_table_s2.csv")
138+
139+
library(ggvenn)
140+
141+
# Define the two sets
142+
hvg_overlap <- list(rheostat_hvg = rheostat_hvg$Gene,
143+
influenza_hvg = as.character(hvg_filtered$gene))
144+
ggvenn(hvg_overlap)
145+
```
146+
147+
148+
```{r}
149+
# cosine similarity
150+
library(lsa)
151+
152+
filtered_mature_neuron_mat <- filtered_mature_neuron_obj[["RNA"]]$data
153+
filtered_mature_neuron_mat_hvg <- filtered_mature_neuron_mat[VariableFeatures(filtered_mature_neuron_obj),]
154+
155+
# saves time according to BPCells
156+
filtered_mature_neuron_mat_hvg <- filtered_mature_neuron_mat_hvg %>% write_matrix_dir(tempfile("mat"))
157+
158+
# PCA
159+
svd <- BPCells::svds(filtered_mature_neuron_mat_hvg, k = 50)
160+
#svd <- irlba(neuron_mat_norm, nv=50)
161+
pca <- multiply_cols(svd$v, svd$d)
162+
163+
colnames(pca) <- paste0("PC_", 1:ncol(pca))
164+
rownames(pca) <- colnames(filtered_mature_neuron_mat_hvg)
165+
166+
plot(svd$d / sqrt(nrow(filtered_mature_neuron_mat_hvg)))
167+
pca_top_20 <- pca[, 1:20]
168+
169+
# compute the cosine distance
170+
cos_dist <- 1 - cosine(t(pca_top_20))
171+
172+
cos_dist[1:3, 1:3]
173+
# need to associate barcode with OR
174+
175+
#write.csv(cos_dist, file = "data/cos_dist.csv")
176+
```
177+
178+
179+
```{r}
180+
get_cell_metadata <- function(feature_table, OR_list){
181+
cell_metadata <- data.frame()
182+
183+
for (OR_names in OR_list) {
184+
OR_data <- feature_table |>
185+
select(OR_names) |>
186+
filter(get(OR_names) != 0)
187+
188+
# Adding cell name & dpi as columns
189+
OR_data <- OR_data |>
190+
mutate(cell_name = rownames(OR_data),
191+
dpi = as.factor(sub("_.*", "", rownames(OR_data))),
192+
OR = OR_names) |>
193+
select(-all_of(OR_names)) # Use all_of() to correctly reference the column
194+
195+
# Combine results
196+
cell_metadata <- bind_rows(cell_metadata, OR_data)
197+
}
198+
return(cell_metadata)
199+
}
200+
201+
get_cell_metadata(cell_interested, ORs_interested)
202+
203+
write_csv(cell_metadata, file = "data/cell_metadata.csv")
204+
# View the result
205+
dim(cos_dist)
206+
subset_cell_metadata <- cell_metadata[cell_metadata$cell_name %in% barcode_interested, ]
207+
208+
```
209+
210+
211+
```{r within OR distance}
212+
within_dist <- function(dist_matrix, metadata){
213+
# Extract the OR groups from the metadata
214+
unique_ors <- unique(metadata$OR)
215+
216+
# Initialize a vector to store the median within-OR distances
217+
within_or_medians <- data.frame(OR = character(), median_dist = numeric())
218+
219+
for (or in unique_ors) {
220+
# Get cell names for the current OR
221+
cells_for_or <- metadata$cell_name[metadata$OR == or]
222+
223+
# Filter the cosine distance matrix for these cells
224+
matrix_for_or <- dist_matrix[cells_for_or, cells_for_or, drop = F]
225+
226+
# Extract the upper triangle of the matrix as a vector of distances
227+
upper_triangle_distances <- matrix_for_or[upper.tri(matrix_for_or)]
228+
229+
# Calculate the median of the pairwise distances
230+
median_distance <- median(upper_triangle_distances, na.rm = TRUE)
231+
232+
within_or_medians <- rbind(within_or_medians, data.frame(OR = or, median_dist = median_distance))
233+
}
234+
235+
return(within_or_medians)
236+
}
237+
238+
median_distance <- within_dist(cos_dist, subset_cell_metadata)
239+
hist(median_distance$median_dist)
240+
```
241+
242+
243+
```{r}
244+
between_dist <- c()
245+
for (or in median_distance$OR){
246+
between_dist[[or]] <- median_distance |> filter(OR != or)
247+
}
248+
249+
# Initialize an empty data frame to store the results
250+
hist(between_dist$Olfr1019$median_dist)
251+
median_between_distances <- data.frame(OR = character(),
252+
between_dist_median = numeric(),
253+
stringsAsFactors = FALSE)
254+
255+
# Loop through each OR in the between_dist list
256+
for (or in names(between_dist)) {
257+
# Compute the median of 'median_dist' for the current OR
258+
median_value <- median(between_dist[[or]]$median_dist, na.rm = TRUE)
259+
260+
# Add the result to the data frame
261+
median_between_distances <- rbind(median_between_distances,
262+
data.frame(OR = or,
263+
between_dist_median = median_value))
264+
}
265+
266+
# View the result
267+
print(median_between_distances)
268+
```
269+

0 commit comments

Comments
 (0)