|
29 | 29 | "metadata": {},
|
30 | 30 | "outputs": [],
|
31 | 31 | "source": [
|
32 |
| - "counts = scipy.io.mmread('data/primary_sparse_processed.mtx').tocsr().transpose()" |
| 32 | + "counts = scipy.io.mmread('data/primary_sparse.mtx').tocsr().transpose()" |
33 | 33 | ]
|
34 | 34 | },
|
35 | 35 | {
|
36 | 36 | "cell_type": "code",
|
37 |
| - "execution_count": 17, |
| 37 | + "execution_count": 3, |
38 | 38 | "metadata": {},
|
39 | 39 | "outputs": [
|
40 | 40 | {
|
41 | 41 | "name": "stdout",
|
42 | 42 | "output_type": "stream",
|
43 | 43 | "text": [
|
44 |
| - "156572 25129 (156572, 25129)\n" |
| 44 | + "raw cell number 156572, raw gene number 25147, the dim of the raw matrix(156572, 25147), the processed gene number 25129\n" |
45 | 45 | ]
|
46 | 46 | }
|
47 | 47 | ],
|
48 | 48 | "source": [
|
49 | 49 | "# Load barcodes and genes\n",
|
50 |
| - "barcodes = pd.read_csv('data/primary_colnames_processed.txt', header=None).squeeze().tolist()\n", |
51 |
| - "genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()\n", |
52 |
| - "print(len(barcodes), len(genes), counts.shape)" |
| 50 | + "raw_barcodes = pd.read_csv('data/primary_colnames_raw.txt', header=None).squeeze().tolist()\n", |
| 51 | + "raw_genes = pd.read_csv('data/primary_features_raw.txt', header=None).squeeze().tolist()\n", |
| 52 | + "processed_genes = pd.read_csv('data/primary_features_processed.txt', header=None).squeeze().tolist()\n", |
| 53 | + "\n", |
| 54 | + "print(f\"raw cell number {len(raw_barcodes)}, raw gene number {len(raw_genes)}, the dim of the raw matrix{counts.shape}, the processed gene number {len(processed_genes)}\")" |
53 | 55 | ]
|
54 | 56 | },
|
55 | 57 | {
|
56 | 58 | "cell_type": "code",
|
57 |
| - "execution_count": 18, |
| 59 | + "execution_count": 4, |
58 | 60 | "metadata": {},
|
59 | 61 | "outputs": [],
|
60 | 62 | "source": [
|
61 | 63 | "adata = sc.AnnData(X=counts)\n",
|
62 |
| - "adata.var_names = genes\n", |
63 |
| - "adata.obs_names = barcodes\n" |
| 64 | + "adata.var_names = raw_genes\n", |
| 65 | + "adata.obs_names = raw_barcodes" |
| 66 | + ] |
| 67 | + }, |
| 68 | + { |
| 69 | + "cell_type": "code", |
| 70 | + "execution_count": 5, |
| 71 | + "metadata": {}, |
| 72 | + "outputs": [ |
| 73 | + { |
| 74 | + "data": { |
| 75 | + "text/plain": [ |
| 76 | + "25129" |
| 77 | + ] |
| 78 | + }, |
| 79 | + "execution_count": 5, |
| 80 | + "metadata": {}, |
| 81 | + "output_type": "execute_result" |
| 82 | + } |
| 83 | + ], |
| 84 | + "source": [ |
| 85 | + "adata.var_names.intersection(processed_genes)\n", |
| 86 | + "# keep only the genes that are also present in the processed data\n", |
| 87 | + "adata_corrected = adata[:, processed_genes].copy()\n", |
| 88 | + "len(adata_corrected.var)" |
64 | 89 | ]
|
65 | 90 | },
|
66 | 91 | {
|
67 | 92 | "cell_type": "code",
|
68 |
| - "execution_count": 19, |
| 93 | + "execution_count": 6, |
69 | 94 | "metadata": {},
|
70 | 95 | "outputs": [
|
71 | 96 | {
|
72 | 97 | "name": "stderr",
|
73 | 98 | "output_type": "stream",
|
74 | 99 | "text": [
|
75 |
| - "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_8486/700565825.py:1: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.\n", |
| 100 | + "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_30998/4086199203.py:1: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.\n", |
76 | 101 | " metadata = pd.read_table('data/scp_primary_metadata.txt', index_col=0)\n",
|
77 |
| - "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_8486/700565825.py:5: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.\n", |
| 102 | + "/var/folders/pj/g7ctw93j7477th9q941xfyyc0000gn/T/ipykernel_30998/4086199203.py:5: DtypeWarning: Columns (4,5,6) have mixed types. Specify dtype option on import or set low_memory=False.\n", |
78 | 103 | " cluster_data = pd.read_table('data/primary_clusterdata.txt', index_col=0)\n"
|
79 | 104 | ]
|
80 | 105 | }
|
|
89 | 114 | "cluster_data = cluster_data.iloc[1:]\n",
|
90 | 115 | "\n",
|
91 | 116 | "combined_metadata = pd.merge(cluster_data, metadata, left_index=True, right_index=True)\n",
|
92 |
| - "adata.obs = combined_metadata\n" |
| 117 | + "adata.obs = combined_metadata\n", |
| 118 | + "\n", |
| 119 | + "adata.obs = adata.obs.astype('category')\n", |
| 120 | + "\n", |
| 121 | + "columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']\n", |
| 122 | + "for column in columns_to_convert:\n", |
| 123 | + " adata.obs[column] = pd.to_numeric(adata.obs[column])\n" |
93 | 124 | ]
|
94 | 125 | },
|
95 | 126 | {
|
96 | 127 | "cell_type": "code",
|
97 |
| - "execution_count": 28, |
| 128 | + "execution_count": 7, |
98 | 129 | "metadata": {},
|
99 | 130 | "outputs": [
|
100 | 131 | {
|
|
450 | 481 | "Naive_LNG.AGCATCAGTGCCCAGT LNG HSC Naive \n",
|
451 | 482 | "Naive_LNG.TAACGACAGACGTCCC LNG HSC Naive \n",
|
452 | 483 | "\n",
|
453 |
| - " X Y Cell.Type biosample_id \\\n", |
454 |
| - "NAME \n", |
455 |
| - "D14_OE.AAACCCAGTATTCCTT -11.398092 -1.642075 5 D14_OM \n", |
456 |
| - "D14_OE.AAACGAACAAAGCTCT -13.586423 -4.545745 5 D14_OM \n", |
457 |
| - "D14_OE.AAACGAAGTGTTAAAG -11.207908 -1.848815 5 D14_OM \n", |
458 |
| - "D14_OE.AAACGCTAGAATACAC -11.215386 -1.674733 5 D14_OM \n", |
459 |
| - "D14_OE.AAACGCTAGCATCCTA -12.391555 -4.034981 5 D14_OM \n", |
460 |
| - "... ... ... ... ... \n", |
461 |
| - "Naive_RE.CTCATGCAGGCTTAGG 2.256732 -12.009367 10 Naive_RM \n", |
462 |
| - "Naive_RE.GTATTGGGTGCCGGTT -5.709469 -3.818193 10 Naive_RM \n", |
463 |
| - "Naive_RE.TGGGTTATCGCAATGT 2.359610 -11.714253 10 Naive_RM \n", |
464 |
| - "Naive_LNG.AGCATCAGTGCCCAGT -6.198734 -3.709947 10 Naive_LNG \n", |
465 |
| - "Naive_LNG.TAACGACAGACGTCCC 2.420867 -12.134587 10 Naive_LNG \n", |
| 484 | + " X Y Cell.Type biosample_id \\\n", |
| 485 | + "NAME \n", |
| 486 | + "D14_OE.AAACCCAGTATTCCTT -11.398092 -1.642075 5 D14_OM \n", |
| 487 | + "D14_OE.AAACGAACAAAGCTCT -13.586423 -4.545745 5 D14_OM \n", |
| 488 | + "D14_OE.AAACGAAGTGTTAAAG -11.207908 -1.848815 5 D14_OM \n", |
| 489 | + "D14_OE.AAACGCTAGAATACAC -11.215386 -1.674733 5 D14_OM \n", |
| 490 | + "D14_OE.AAACGCTAGCATCCTA -12.391555 -4.034981 5 D14_OM \n", |
| 491 | + "... ... ... ... ... \n", |
| 492 | + "Naive_RE.CTCATGCAGGCTTAGG 2.256732 -12.009367 10 Naive_RM \n", |
| 493 | + "Naive_RE.GTATTGGGTGCCGGTT -5.709469 -3.818193 10 Naive_RM \n", |
| 494 | + "Naive_RE.TGGGTTATCGCAATGT 2.359610 -11.714253 10 Naive_RM \n", |
| 495 | + "Naive_LNG.AGCATCAGTGCCCAGT -6.198734 -3.709947 10 Naive_LNG \n", |
| 496 | + "Naive_LNG.TAACGACAGACGTCCC 2.420867 -12.134587 10 Naive_LNG \n", |
466 | 497 | "\n",
|
467 | 498 | " donor_id species \\\n",
|
468 | 499 | "NAME \n",
|
|
565 | 596 | "[156572 rows x 23 columns]"
|
566 | 597 | ]
|
567 | 598 | },
|
568 |
| - "execution_count": 28, |
| 599 | + "execution_count": 7, |
569 | 600 | "metadata": {},
|
570 | 601 | "output_type": "execute_result"
|
571 | 602 | }
|
|
576 | 607 | },
|
577 | 608 | {
|
578 | 609 | "cell_type": "code",
|
579 |
| - "execution_count": 29, |
| 610 | + "execution_count": 17, |
580 | 611 | "metadata": {},
|
581 | 612 | "outputs": [],
|
582 | 613 | "source": [
|
583 |
| - "adata.obs = adata.obs.astype('category')\n", |
| 614 | + "# X needs to be float for BPCells; otherwise you will spend 2 hours debugging it\n", |
| 615 | + "adata_corrected.X = adata_corrected.X.astype(np.float64)\n", |
584 | 616 | "\n",
|
585 |
| - "columns_to_convert = ['X', 'Y', 'number_of_reads', 'number_of_features', 'Cell.Type']\n", |
586 |
| - "for column in columns_to_convert:\n", |
587 |
| - " adata.obs[column] = pd.to_numeric(adata.obs[column])" |
| 617 | + "adata_corrected.write('data/flu_raw.h5ad', compression=\"gzip\")\n" |
588 | 618 | ]
|
589 | 619 | },
|
590 | 620 | {
|
591 | 621 | "cell_type": "code",
|
592 |
| - "execution_count": 30, |
| 622 | + "execution_count": 16, |
593 | 623 | "metadata": {},
|
594 | 624 | "outputs": [],
|
595 | 625 | "source": [
|
596 |
| - "adata.write('data/flu_processed.h5ad', compression=\"gzip\")\n" |
| 626 | + "adata_corrected.X.dtype\n" |
597 | 627 | ]
|
| 628 | + }, |
| 629 | + { |
| 630 | + "cell_type": "code", |
| 631 | + "execution_count": 11, |
| 632 | + "metadata": {}, |
| 633 | + "outputs": [ |
| 634 | + { |
| 635 | + "name": "stderr", |
| 636 | + "output_type": "stream", |
| 637 | + "text": [ |
| 638 | + "/opt/anaconda3/lib/python3.12/site-packages/anndata/__init__.py:55: FutureWarning: `anndata.read` is deprecated, use `anndata.read_h5ad` instead. `ad.read` will be removed in mid 2024.\n", |
| 639 | + " warnings.warn(\n" |
| 640 | + ] |
| 641 | + } |
| 642 | + ], |
| 643 | + "source": [ |
| 644 | + "flu_data = ad.read('data/flu_processed_backup.h5ad')" |
| 645 | + ] |
| 646 | + }, |
| 647 | + { |
| 648 | + "cell_type": "code", |
| 649 | + "execution_count": 13, |
| 650 | + "metadata": {}, |
| 651 | + "outputs": [ |
| 652 | + { |
| 653 | + "data": { |
| 654 | + "text/plain": [ |
| 655 | + "dtype('float64')" |
| 656 | + ] |
| 657 | + }, |
| 658 | + "execution_count": 13, |
| 659 | + "metadata": {}, |
| 660 | + "output_type": "execute_result" |
| 661 | + } |
| 662 | + ], |
| 663 | + "source": [ |
| 664 | + "flu_data.X.dtype" |
| 665 | + ] |
| 666 | + }, |
| 667 | + { |
| 668 | + "cell_type": "code", |
| 669 | + "execution_count": null, |
| 670 | + "metadata": {}, |
| 671 | + "outputs": [], |
| 672 | + "source": [] |
598 | 673 | }
|
599 | 674 | ],
|
600 | 675 | "metadata": {
|
|
613 | 688 | "name": "python",
|
614 | 689 | "nbconvert_exporter": "python",
|
615 | 690 | "pygments_lexer": "ipython3",
|
616 |
| - "version": "3.12.4" |
| 691 | + "version": "3.12.2" |
617 | 692 | }
|
618 | 693 | },
|
619 | 694 | "nbformat": 4,
|
|
0 commit comments