Skip to content

Commit d5bf9c6

Browse files
committed
added raw count
1 parent aa33530 commit d5bf9c6

8 files changed

+160
-67
lines changed

figure/DE_genes_grp6.pdf

-550 KB
Binary file not shown.

figure/DE_genes_grp7.pdf

550 KB
Binary file not shown.

figure/marker_mature_OSN.pdf

11.4 MB
Binary file not shown.

figure/marker_mature_OSN_subtype.pdf

11.1 MB
Binary file not shown.

generate_h5ad.ipynb

+111-36
Original file line numberDiff line numberDiff line change
@@ -29,52 +29,77 @@
2929
"metadata": {},
3030
"outputs": [],
3131
"source": [
32-
"counts = scipy.io.mmread('data/primary_sparse_processed.mtx').tocsr().transpose()"
32+
"counts = scipy.io.mmread('data/primary_sparse.mtx').tocsr().transpose()"
3333
]
3434
},
3535
{
3636
"cell_type": "code",
37-
"execution_count": 17,
37+
"execution_count": 3,
3838
"metadata": {},
3939
"outputs": [
4040
{
4141
"name": "stdout",
4242
"output_type": "stream",
4343
"text": [
44-
"156572 25129 (156572, 25129)\n"
44+
"raw cell number 156572, raw gene number 25147, the dim of the raw matrix(156572, 25147), the processed gene number 25129\n"
4545
]
4646
}
4747
],
4848
"source": [
4949
"# Load barcodes and genes\n",
50-
"barcodes = pd.read_csv('data/primary_colnames_processed.txt', header=None).squeeze().tolist()\n",
51-
"genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()\n",
52-
"print(len(barcodes), len(genes), counts.shape)"
50+
"raw_barcodes = pd.read_csv('data/primary_colnames_raw.txt', header=None).squeeze().tolist()\n",
51+
"raw_genes = pd.read_csv('data/primary_features_raw.txt', header=None).squeeze().tolist()\n",
52+
"processed_genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()\n",
53+
"\n",
54+
"print(f\"raw cell number {len(raw_barcodes)}, raw gene number {len(raw_genes)}, the dim of the raw matrix{counts.shape}, the processed gene number {len(processed_genes)}\")"
5355
]
5456
},
5557
{
5658
"cell_type": "code",
57-
"execution_count": 18,
59+
"execution_count": 4,
5860
"metadata": {},
5961
"outputs": [],
6062
"source": [
6163
"adata = sc.AnnData(X=counts)\n",
62-
"adata.var_names = genes\n",
63-
"adata.obs_names = barcodes\n"
64+
"adata.var_names = raw_genes\n",
65+
"adata.obs_names = raw_barcodes"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 5,
71+
"metadata": {},
72+
"outputs": [
73+
{
74+
"data": {
75+
"text/plain": [
76+
"25129"
77+
]
78+
},
79+
"execution_count": 5,
80+
"metadata": {},
81+
"output_type": "execute_result"
82+
}
83+
],
84+
"source": [
85+
"adata.var_names.intersection(processed_genes)\n",
86+
"# this contains the gene only found in the processed data\n",
87+
"adata_corrected = adata[:, processed_genes].copy()\n",
88+
"len(adata_corrected.var)"
6489
]
6590
},
6691
{
6792
"cell_type": "code",
68-
"execution_count": 19,
93+
"execution_count": 6,
6994
"metadata": {},
7095
"outputs": [
7196
{
7297
"name": "stderr",
7398
"output_type": "stream",
7499
"text": [
75-
"/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_8486/700565825.py:1: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.\n",
100+
"/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_30998/4086199203.py:1: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.\n",
76101
" metadata = pd.read_table('data/scp_primary_metadata.txt', index_col=0)\n",
77-
"/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_8486/700565825.py:5: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.\n",
102+
"/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_30998/4086199203.py:5: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.\n",
78103
" cluster_data = pd.read_table('data/primary_clusterdata.txt', index_col=0)\n"
79104
]
80105
}
@@ -89,12 +114,18 @@
89114
"cluster_data = cluster_data.iloc[1:]\n",
90115
"\n",
91116
"combined_metadata = pd.merge(cluster_data, metadata, left_index=True, right_index=True)\n",
92-
"adata.obs = combined_metadata\n"
117+
"adata.obs = combined_metadata\n",
118+
"\n",
119+
"adata.obs = adata.obs.astype('category')\n",
120+
"\n",
121+
"columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']\n",
122+
"for column in columns_to_convert:\n",
123+
" adata.obs[column] = pd.to_numeric(adata.obs[column])\n"
93124
]
94125
},
95126
{
96127
"cell_type": "code",
97-
"execution_count": 28,
128+
"execution_count": 7,
98129
"metadata": {},
99130
"outputs": [
100131
{
@@ -450,19 +481,19 @@
450481
"Naive_LNG.AGCATCAGTGCCCAGT LNG HSC Naive \n",
451482
"Naive_LNG.TAACGACAGACGTCCC LNG HSC Naive \n",
452483
"\n",
453-
" X Y Cell.Type biosample_id \\\n",
454-
"NAME \n",
455-
"D14_OE.AAACCCAGTATTCCTT -11.398092 -1.642075 5 D14_OM \n",
456-
"D14_OE.AAACGAACAAAGCTCT -13.586423 -4.545745 5 D14_OM \n",
457-
"D14_OE.AAACGAAGTGTTAAAG -11.207908 -1.848815 5 D14_OM \n",
458-
"D14_OE.AAACGCTAGAATACAC -11.215386 -1.674733 5 D14_OM \n",
459-
"D14_OE.AAACGCTAGCATCCTA -12.391555 -4.034981 5 D14_OM \n",
460-
"... ... ... ... ... \n",
461-
"Naive_RE.CTCATGCAGGCTTAGG 2.256732 -12.009367 10 Naive_RM \n",
462-
"Naive_RE.GTATTGGGTGCCGGTT -5.709469 -3.818193 10 Naive_RM \n",
463-
"Naive_RE.TGGGTTATCGCAATGT 2.359610 -11.714253 10 Naive_RM \n",
464-
"Naive_LNG.AGCATCAGTGCCCAGT -6.198734 -3.709947 10 Naive_LNG \n",
465-
"Naive_LNG.TAACGACAGACGTCCC 2.420867 -12.134587 10 Naive_LNG \n",
484+
" X Y Cell.Type biosample_id \\\n",
485+
"NAME \n",
486+
"D14_OE.AAACCCAGTATTCCTT -11.398092 -1.642075 5 D14_OM \n",
487+
"D14_OE.AAACGAACAAAGCTCT -13.586423 -4.545745 5 D14_OM \n",
488+
"D14_OE.AAACGAAGTGTTAAAG -11.207908 -1.848815 5 D14_OM \n",
489+
"D14_OE.AAACGCTAGAATACAC -11.215386 -1.674733 5 D14_OM \n",
490+
"D14_OE.AAACGCTAGCATCCTA -12.391555 -4.034981 5 D14_OM \n",
491+
"... ... ... ... ... \n",
492+
"Naive_RE.CTCATGCAGGCTTAGG 2.256732 -12.009367 10 Naive_RM \n",
493+
"Naive_RE.GTATTGGGTGCCGGTT -5.709469 -3.818193 10 Naive_RM \n",
494+
"Naive_RE.TGGGTTATCGCAATGT 2.359610 -11.714253 10 Naive_RM \n",
495+
"Naive_LNG.AGCATCAGTGCCCAGT -6.198734 -3.709947 10 Naive_LNG \n",
496+
"Naive_LNG.TAACGACAGACGTCCC 2.420867 -12.134587 10 Naive_LNG \n",
466497
"\n",
467498
" donor_id species \\\n",
468499
"NAME \n",
@@ -565,7 +596,7 @@
565596
"[156572 rows x 23 columns]"
566597
]
567598
},
568-
"execution_count": 28,
599+
"execution_count": 7,
569600
"metadata": {},
570601
"output_type": "execute_result"
571602
}
@@ -576,25 +607,69 @@
576607
},
577608
{
578609
"cell_type": "code",
579-
"execution_count": 29,
610+
"execution_count": 17,
580611
"metadata": {},
581612
"outputs": [],
582613
"source": [
583-
"adata.obs = adata.obs.astype('category')\n",
614+
"# need to be float for BPCells, otherwise you gonna spend 2hr debugging it\n",
615+
"adata_corrected.X = adata_corrected.X.astype(np.float64)\n",
584616
"\n",
585-
"columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']\n",
586-
"for column in columns_to_convert:\n",
587-
" adata.obs[column] = pd.to_numeric(adata.obs[column])"
617+
"adata_corrected.write('data/flu_raw.h5ad', compression=\"gzip\")\n"
588618
]
589619
},
590620
{
591621
"cell_type": "code",
592-
"execution_count": 30,
622+
"execution_count": 16,
593623
"metadata": {},
594624
"outputs": [],
595625
"source": [
596-
"adata.write('data/flu_processed.h5ad', compression=\"gzip\")\n"
626+
"adata_corrected.X.dtype\n"
597627
]
628+
},
629+
{
630+
"cell_type": "code",
631+
"execution_count": 11,
632+
"metadata": {},
633+
"outputs": [
634+
{
635+
"name": "stderr",
636+
"output_type": "stream",
637+
"text": [
638+
"/opt/anaconda3/lib/python3.12/site-packages/anndata/__init__.py:55: FutureWarning: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.\n",
639+
" warnings.warn(\n"
640+
]
641+
}
642+
],
643+
"source": [
644+
"flu_data = ad.read('data/flu_processed_backup.h5ad')"
645+
]
646+
},
647+
{
648+
"cell_type": "code",
649+
"execution_count": 13,
650+
"metadata": {},
651+
"outputs": [
652+
{
653+
"data": {
654+
"text/plain": [
655+
"dtype('float64')"
656+
]
657+
},
658+
"execution_count": 13,
659+
"metadata": {},
660+
"output_type": "execute_result"
661+
}
662+
],
663+
"source": [
664+
"flu_data.X.dtype"
665+
]
666+
},
667+
{
668+
"cell_type": "code",
669+
"execution_count": null,
670+
"metadata": {},
671+
"outputs": [],
672+
"source": []
598673
}
599674
],
600675
"metadata": {
@@ -613,7 +688,7 @@
613688
"name": "python",
614689
"nbconvert_exporter": "python",
615690
"pygments_lexer": "ipython3",
616-
"version": "3.12.4"
691+
"version": "3.12.2"
617692
}
618693
},
619694
"nbformat": 4,

mature_neuron.Rmd

+13
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,19 @@ comp_change |> group_by(Timepoint, Celltype) |> summarise(avg_count = Counts) |>
7070

7171

7272
```{r}
73+
neuron_obj <- readRDS("data/neuron.Rds")
74+
DimPlot(neuron_obj, reduction = "umap", group.by = "RNA_snn_res.0.25", label = T) +
75+
theme(legend.position = "none")
76+
DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type_custom", label = T)
77+
78+
mature_neuron_obj <- subset(neuron_obj, subset = RNA_snn_res.0.25 %in% c("1", "2", "4", "7", "8"))
79+
80+
```
81+
82+
83+
```{r}
84+
mature_neuron_obj
85+
DimPlot(mature_neuron_obj, reduction = "umap", group.by = "cell_type_custom", label = T)
7386
7487
```
7588

neuron_analysis.Rmd

+21-31
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,16 @@ neuron_obj <- readRDS("data/neuron.Rds")
2929

3030
```{r}
3131
# find variable genes with Seurat
32-
neuron_obj <- FindVariableFeatures(neuron_obj, selection.method = "vst", nfeatures = 2000)
33-
top10 <- head(VariableFeatures(neuron_obj), 10)
32+
neuron_obj <- FindVariableFeatures(neuron_obj, selection.method = "vst",
33+
nfeatures = 2000, layer = "data")
34+
top20 <- head(VariableFeatures(neuron_obj), 20)
3435
3536
plot1 <- VariableFeaturePlot(neuron_obj)
36-
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
37+
plot2 <- LabelPoints(plot = plot1, points = top20, repel = TRUE)
3738
plot2
3839
3940
# selecting top 2000 variable genes
40-
neuron_mat <- neuron_obj@assays$RNA$counts
41+
neuron_mat <- neuron_obj@assays$RNA$data
4142
neuron_mat_norm <- neuron_mat[VariableFeatures(neuron_obj),]
4243
4344
# saves time according to BPCells
@@ -72,15 +73,15 @@ rownames(umap) <- colnames(neuron_mat_norm)
7273
neuron_obj[["umap"]] <- CreateDimReducObject(embeddings = umap, key = "UMAP_",
7374
assay = DefaultAssay(neuron_obj))
7475
75-
DimPlot(neuron_obj, reduction = "umap", group.by = "Cluster.Label")
76+
DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type__ontology_label")
7677
```
7778

7879

7980
```{r}
8081
# clustering
8182
neuron_obj <- FindNeighbors(neuron_obj, dims = 1:50)
8283
# Define the range of resolutions
83-
resolutions <- c(0.2)
84+
resolutions <- c(0.25)
8485
8586
# Loop through the resolutions and store clustering results in Seurat object's metadata
8687
for (res in resolutions) {
@@ -92,50 +93,42 @@ for (res in resolutions) {
9293
#cluster_column <- paste0("RNA_snn_res.", res)
9394
}
9495
95-
DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type__ontology_label")
96-
DimPlot(neuron_obj, reduction = "umap", group.by = "donor_id")
96+
DimPlot(neuron_obj, reduction = "umap", group.by = "RNA_snn_res.0.25", label = T) +
97+
theme(legend.position = "none")
98+
DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type_custom", label = T) +
99+
theme(legend.position = "none")
97100
98101
```
99102

100103

101104
```{r}
102-
# add the count to data layer would fix it
103-
neuron_obj[["RNA"]]$data <- neuron_obj[["RNA"]]$counts
104-
105-
Idents(neuron_obj) <- "RNA_snn_res.0.2"
106-
#FindMarkers(neuron_obj, ident.1 = "6", assay = "RNA")
105+
Idents(neuron_obj) <- "RNA_snn_res.0.25"
106+
#FindMarkers(neuron_obj, ident.1 = "7", assay = "RNA")
107107
108108
dim(neuron_obj[["RNA"]]$counts)
109109
110+
pdf("figure/marker_mature_OSN.pdf")
110111
FeaturePlot(neuron_obj, features = c("Omp", "Stoml3", "Cnga2", "Adcy3"))
112+
dev.off()
111113
```
112114

113115

114116
```{r investigating doublet}
115117
# 1. higher counts than the rest
116118
# 2. mixture of markers
117-
grp6_marker <- FindMarkers(neuron_obj, ident.1 = "6", assay = "RNA")
118-
doublet_obj <- subset(neuron_obj, subset = RNA_snn_res.0.2 == "6")
119+
grp7_marker <- FindMarkers(neuron_obj, ident.1 = "7", assay = "RNA")
119120
120121
library(EnhancedVolcano)
121-
pdf("figure/DE_genes_grp6.pdf")
122-
EnhancedVolcano(grp6_marker,
123-
rownames(grp6_marker),
122+
pdf("figure/DE_genes_grp7.pdf")
123+
EnhancedVolcano(grp7_marker,
124+
rownames(grp7_marker),
124125
x ="avg_log2FC",
125126
y ="p_val_adj")
126127
dev.off()
127128
128-
DimPlot(doublet_obj, reduction = "umap", group.by = "Cluster.Label")
129-
cluster_table <- table(doublet_obj$Cluster.Label)
130-
cluster_table <- cluster_table[cluster_table != 0]
131-
132-
VlnPlot(neuron_obj, features = "nCount_RNA", group.by = "seurat_clusters")
133-
VlnPlot(neuron_obj, features = "nFeature_RNA", group.by = "seurat_clusters")
134-
135-
library(DoubletFinder)
136-
neuron_obj <- doubletFinder_v3(neuron_obj, PCs = 1:10, pN = 0.25, pK = best_pK, nExp = 500)
137-
138129
130+
VlnPlot(neuron_obj, features = "number_of_reads", group.by = "seurat_clusters")
131+
VlnPlot(neuron_obj, features = "number_of_features", group.by = "seurat_clusters")
139132
```
140133

141134

@@ -145,8 +138,5 @@ saveRDS(neuron_obj, file = "data/neuron.Rds")
145138
FeaturePlot(neuron_obj, features = c("Nqo1","Acsm4", "Nfix", "Ncam2"))
146139
# dorsal acsm4, nqo1
147140
# ventral nfix, ncam2
148-
149-
mature_neuron_obj <- subset(neuron_obj, subset = RNA_snn_res.0.2 == c("1", "2", "4", "6"))
150-
saveRDS(mature_neuron_obj, "data/mature_neuron.Rds")
151141
```
152142

0 commit comments

Comments
 (0)