yifand64
diff --git a/‎figure/DE_genes_grp6.pdf
-550 KB b/‎figure/DE_genes_grp6.pdf
-550 KB
diff --git a/‎figure/DE_genes_grp7.pdf
550 KB b/‎figure/DE_genes_grp7.pdf
550 KB
diff --git a/‎figure/marker_mature_OSN.pdf
11.4 MB b/‎figure/marker_mature_OSN.pdf
11.4 MB
diff --git a/‎figure/marker_mature_OSN_subtype.pdf
11.1 MB b/‎figure/marker_mature_OSN_subtype.pdf
11.1 MB
diff --git a/‎generate_h5ad.ipynb
+111-36 b/‎generate_h5ad.ipynb
+111-36
diff --git a/‎mature_neuron.Rmd
+13 b/‎mature_neuron.Rmd
+13
diff --git a/‎neuron_analysis.Rmd
+21-31 b/‎neuron_analysis.Rmd
+21-31
@@ -29,52 +29,77 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "counts = scipy.io.mmread('data/primary_sparse_processed.mtx').tocsr().transpose()"
+    "counts = scipy.io.mmread('data/primary_sparse.mtx').tocsr().transpose()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "156572 25129 (156572, 25129)\n"
+      "raw cell number 156572, raw gene number 25147, the dim of the raw matrix(156572, 25147), the processed gene number 25129\n"
      ]
     }
    ],
    "source": [
     "# Load barcodes and genes\n",
-    "barcodes = pd.read_csv('data/primary_colnames_processed.txt', header=None).squeeze().tolist()\n",
-    "genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()\n",
-    "print(len(barcodes), len(genes), counts.shape)"
+    "raw_barcodes = pd.read_csv('data/primary_colnames_raw.txt', header=None).squeeze().tolist()\n",
+    "raw_genes = pd.read_csv('data/primary_features_raw.txt', header=None).squeeze().tolist()\n",
+    "processed_genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()\n",
+    "\n",
+    "print(f\"raw cell number {len(raw_barcodes)}, raw gene number {len(raw_genes)}, the dim of the raw matrix{counts.shape}, the processed gene number {len(processed_genes)}\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "adata = sc.AnnData(X=counts)\n",
-    "adata.var_names = genes\n",
-    "adata.obs_names = barcodes\n"
+    "adata.var_names = raw_genes\n",
+    "adata.obs_names = raw_barcodes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "25129"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "adata.var_names.intersection(processed_genes)\n",
+    "# subset the raw matrix so it contains only the genes found in the processed data\n",
+    "adata_corrected = adata[:, processed_genes].copy()\n",
+    "len(adata_corrected.var)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_8486/700565825.py:1: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_30998/4086199203.py:1: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.\n",
       "  metadata = pd.read_table('data/scp_primary_metadata.txt', index_col=0)\n",
-      "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_8486/700565825.py:5: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_30998/4086199203.py:5: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.\n",
       "  cluster_data = pd.read_table('data/primary_clusterdata.txt', index_col=0)\n"
      ]
     }
@@ -89,12 +114,18 @@
     "cluster_data = cluster_data.iloc[1:]\n",
     "\n",
     "combined_metadata = pd.merge(cluster_data, metadata, left_index=True, right_index=True)\n",
-    "adata.obs = combined_metadata\n"
+    "adata.obs = combined_metadata\n",
+    "\n",
+    "adata.obs = adata.obs.astype('category')\n",
+    "\n",
+    "columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']\n",
+    "for column in columns_to_convert:\n",
+    "    adata.obs[column] = pd.to_numeric(adata.obs[column])\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -450,19 +481,19 @@
        "Naive_LNG.AGCATCAGTGCCCAGT           LNG             HSC      Naive   \n",
        "Naive_LNG.TAACGACAGACGTCCC           LNG             HSC      Naive   \n",
        "\n",
-       "                                    X          Y Cell.Type biosample_id  \\\n",
-       "NAME                                                                      \n",
-       "D14_OE.AAACCCAGTATTCCTT    -11.398092  -1.642075         5       D14_OM   \n",
-       "D14_OE.AAACGAACAAAGCTCT    -13.586423  -4.545745         5       D14_OM   \n",
-       "D14_OE.AAACGAAGTGTTAAAG    -11.207908  -1.848815         5       D14_OM   \n",
-       "D14_OE.AAACGCTAGAATACAC    -11.215386  -1.674733         5       D14_OM   \n",
-       "D14_OE.AAACGCTAGCATCCTA    -12.391555  -4.034981         5       D14_OM   \n",
-       "...                               ...        ...       ...          ...   \n",
-       "Naive_RE.CTCATGCAGGCTTAGG    2.256732 -12.009367        10     Naive_RM   \n",
-       "Naive_RE.GTATTGGGTGCCGGTT   -5.709469  -3.818193        10     Naive_RM   \n",
-       "Naive_RE.TGGGTTATCGCAATGT    2.359610 -11.714253        10     Naive_RM   \n",
-       "Naive_LNG.AGCATCAGTGCCCAGT  -6.198734  -3.709947        10    Naive_LNG   \n",
-       "Naive_LNG.TAACGACAGACGTCCC   2.420867 -12.134587        10    Naive_LNG   \n",
+       "                                    X          Y  Cell.Type biosample_id  \\\n",
+       "NAME                                                                       \n",
+       "D14_OE.AAACCCAGTATTCCTT    -11.398092  -1.642075          5       D14_OM   \n",
+       "D14_OE.AAACGAACAAAGCTCT    -13.586423  -4.545745          5       D14_OM   \n",
+       "D14_OE.AAACGAAGTGTTAAAG    -11.207908  -1.848815          5       D14_OM   \n",
+       "D14_OE.AAACGCTAGAATACAC    -11.215386  -1.674733          5       D14_OM   \n",
+       "D14_OE.AAACGCTAGCATCCTA    -12.391555  -4.034981          5       D14_OM   \n",
+       "...                               ...        ...        ...          ...   \n",
+       "Naive_RE.CTCATGCAGGCTTAGG    2.256732 -12.009367         10     Naive_RM   \n",
+       "Naive_RE.GTATTGGGTGCCGGTT   -5.709469  -3.818193         10     Naive_RM   \n",
+       "Naive_RE.TGGGTTATCGCAATGT    2.359610 -11.714253         10     Naive_RM   \n",
+       "Naive_LNG.AGCATCAGTGCCCAGT  -6.198734  -3.709947         10    Naive_LNG   \n",
+       "Naive_LNG.TAACGACAGACGTCCC   2.420867 -12.134587         10    Naive_LNG   \n",
        "\n",
        "                               donor_id          species  \\\n",
        "NAME                                                       \n",
@@ -565,7 +596,7 @@
        "[156572 rows x 23 columns]"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -576,25 +607,69 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
-    "adata.obs = adata.obs.astype('category')\n",
+    "# X needs to be float for BPCells; otherwise you will spend hours debugging it\n",
+    "adata_corrected.X = adata_corrected.X.astype(np.float64)\n",
     "\n",
-    "columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']\n",
-    "for column in columns_to_convert:\n",
-    "    adata.obs[column] = pd.to_numeric(adata.obs[column])"
+    "adata_corrected.write('data/flu_raw.h5ad', compression=\"gzip\")\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
-    "adata.write('data/flu_processed.h5ad', compression=\"gzip\")\n"
+    "adata_corrected.X.dtype\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/lib/python3.12/site-packages/anndata/__init__.py:55: FutureWarning: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "flu_data = ad.read('data/flu_processed_backup.h5ad')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "dtype('float64')"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "flu_data.X.dtype"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -613,7 +688,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.4"
+   "version": "3.12.2"
   }
  },
  "nbformat": 4,
 
@@ -70,6 +70,19 @@ comp_change |> group_by(Timepoint, Celltype) |> summarise(avg_count = Counts) |>
 
 
 ```{r}
+neuron_obj <- readRDS("data/neuron.Rds")
+DimPlot(neuron_obj, reduction = "umap", group.by = "RNA_snn_res.0.25", label = T) + 
+  theme(legend.position = "none")
+DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type_custom", label = T)
+
+mature_neuron_obj <- subset(neuron_obj, subset = RNA_snn_res.0.25 %in% c("1", "2", "4", "7", "8"))
+
+```
+
+
+```{r}
+mature_neuron_obj
+DimPlot(mature_neuron_obj, reduction = "umap", group.by = "cell_type_custom", label = T)
 
 ```
 
@@ -29,15 +29,16 @@ neuron_obj <- readRDS("data/neuron.Rds")
 
 ```{r}
 # find variable genes with Seurat
-neuron_obj <- FindVariableFeatures(neuron_obj, selection.method = "vst", nfeatures = 2000)
-top10 <- head(VariableFeatures(neuron_obj), 10)
+neuron_obj <- FindVariableFeatures(neuron_obj, selection.method = "vst", 
+                                   nfeatures = 2000, layer = "data")
+top20 <- head(VariableFeatures(neuron_obj), 20)
 
 plot1 <- VariableFeaturePlot(neuron_obj)
-plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
+plot2 <- LabelPoints(plot = plot1, points = top20, repel = TRUE)
 plot2
 
 # selecting top 2000 variable genes
-neuron_mat <- neuron_obj@assays$RNA$counts
+neuron_mat <- neuron_obj@assays$RNA$data
 neuron_mat_norm <- neuron_mat[VariableFeatures(neuron_obj),]
 
 # saves time according to BPCells
@@ -72,15 +73,15 @@ rownames(umap) <- colnames(neuron_mat_norm)
 neuron_obj[["umap"]] <- CreateDimReducObject(embeddings = umap, key = "UMAP_", 
                                          assay = DefaultAssay(neuron_obj))
 
-DimPlot(neuron_obj, reduction = "umap", group.by = "Cluster.Label")
+DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type__ontology_label")
 ```
 
 
 ```{r}
 # clustering
 neuron_obj <- FindNeighbors(neuron_obj, dims = 1:50)
 # Define the range of resolutions
-resolutions <- c(0.2)
+resolutions <- c(0.25)
 
 # Loop through the resolutions and store clustering results in Seurat object's metadata
 for (res in resolutions) {
@@ -92,50 +93,42 @@ for (res in resolutions) {
   #cluster_column <- paste0("RNA_snn_res.", res)
 }
 
-DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type__ontology_label")
-DimPlot(neuron_obj, reduction = "umap", group.by = "donor_id")
+DimPlot(neuron_obj, reduction = "umap", group.by = "RNA_snn_res.0.25", label = T) + 
+  theme(legend.position = "none")
+DimPlot(neuron_obj, reduction = "umap", group.by = "cell_type_custom", label = T) + 
+  theme(legend.position = "none")
 
 ```
 
 
 ```{r}
-# add the count to data layer would fix it
-neuron_obj[["RNA"]]$data <- neuron_obj[["RNA"]]$counts
-
-Idents(neuron_obj) <- "RNA_snn_res.0.2"
-#FindMarkers(neuron_obj, ident.1 = "6", assay = "RNA")
+Idents(neuron_obj) <- "RNA_snn_res.0.25"
+#FindMarkers(neuron_obj, ident.1 = "7", assay = "RNA")
 
 dim(neuron_obj[["RNA"]]$counts)
 
+pdf("figure/marker_mature_OSN.pdf")
 FeaturePlot(neuron_obj, features = c("Omp", "Stoml3", "Cnga2", "Adcy3"))
+dev.off()
 ```
 
 
 ```{r investigating doublet}
 # 1. higher counts than the rest
 # 2. mixture of markers 
-grp6_marker <- FindMarkers(neuron_obj, ident.1 = "6", assay = "RNA")
-doublet_obj <- subset(neuron_obj, subset = RNA_snn_res.0.2 == "6")
+grp7_marker <- FindMarkers(neuron_obj, ident.1 = "7", assay = "RNA")
 
 library(EnhancedVolcano)
-pdf("figure/DE_genes_grp6.pdf")
-EnhancedVolcano(grp6_marker, 
-                rownames(grp6_marker),
+pdf("figure/DE_genes_grp7.pdf")
+EnhancedVolcano(grp7_marker, 
+                rownames(grp7_marker),
                 x ="avg_log2FC", 
                 y ="p_val_adj")
 dev.off()
 
-DimPlot(doublet_obj, reduction = "umap", group.by = "Cluster.Label")
-cluster_table <- table(doublet_obj$Cluster.Label)
-cluster_table <- cluster_table[cluster_table != 0]
-
-VlnPlot(neuron_obj, features = "nCount_RNA", group.by = "seurat_clusters")
-VlnPlot(neuron_obj, features = "nFeature_RNA", group.by = "seurat_clusters")
-
-library(DoubletFinder)
-neuron_obj <- doubletFinder_v3(neuron_obj, PCs = 1:10, pN = 0.25, pK = best_pK, nExp = 500)
-
 
+VlnPlot(neuron_obj, features = "number_of_reads", group.by = "seurat_clusters")
+VlnPlot(neuron_obj, features = "number_of_features", group.by = "seurat_clusters")
 ```
 
 
@@ -145,8 +138,5 @@ saveRDS(neuron_obj, file = "data/neuron.Rds")
 FeaturePlot(neuron_obj, features = c("Nqo1","Acsm4", "Nfix", "Ncam2"))
 # dorsal acsm4, nqo1
 # ventral nfix, ncam2
-
-mature_neuron_obj <- subset(neuron_obj, subset = RNA_snn_res.0.2 == c("1", "2", "4", "6"))
-saveRDS(mature_neuron_obj, "data/mature_neuron.Rds")
 ```