Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add notebook for downloading McFarland 2020 Figure 1 data #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions datasets/McFarland_2020_Fig1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "62385964-6d60-47de-bd3c-4a40d5c9954c",
"metadata": {},
"outputs": [],
"source": [
"from utils import download_binary_file\n",
"import os\n",
"import shutil\n",
"import pandas as pd\n",
"import numpy as np\n",
"from scipy.io import mmread\n",
"import anndata\n",
"from anndata import AnnData\n",
"import scanpy as sc\n",
"from typing import List, Tuple"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "00d0a7c1-b9ae-414d-8b33-eb424a5c9e81",
"metadata": {},
"outputs": [],
"source": [
"# Functions for downloading/reading files\n",
"\n",
"def download_mcfarland_2020(output_path: str) -> None:\n",
" \"\"\"\n",
" Download Mcfarland et al. 2020 data from the hosting URLs.\n",
"\n",
" Args:\n",
" ----\n",
" output_path: Output path to store the downloaded and unzipped\n",
" directories.\n",
"\n",
" Returns\n",
" -------\n",
" None. File directories are downloaded and unzipped in output_path.\n",
" \"\"\"\n",
" idasanutlin_url = \"https://figshare.com/ndownloader/files/18716351\"\n",
" idasanutlin_output_filename = os.path.join(output_path, \"idasanutlin.zip\")\n",
"\n",
" download_binary_file(idasanutlin_url, idasanutlin_output_filename)\n",
" idasanutlin_output_dir = idasanutlin_output_filename.replace(\".zip\", \"\")\n",
" shutil.unpack_archive(idasanutlin_output_filename, idasanutlin_output_dir)\n",
"\n",
" dmso_url = \"https://figshare.com/ndownloader/files/18716354\"\n",
" dmso_output_filename = os.path.join(output_path, \"dmso.zip\")\n",
"\n",
" download_binary_file(dmso_url, dmso_output_filename)\n",
" dmso_output_dir = dmso_output_filename.replace(\".zip\", \"\")\n",
" shutil.unpack_archive(dmso_output_filename, dmso_output_dir)\n",
"\n",
"\n",
"def _read_mixseq_df(directory: str) -> pd.DataFrame:\n",
" data = mmread(os.path.join(directory, \"matrix.mtx\"))\n",
" barcodes = pd.read_table(os.path.join(directory, \"barcodes.tsv\"), header=None)\n",
" classifications = pd.read_csv(os.path.join(directory, \"classifications.csv\"))\n",
" classifications[\"cell_line\"] = np.array(\n",
" [x.split(\"_\")[0] for x in classifications.singlet_ID.values]\n",
" )\n",
" gene_names = pd.read_table(os.path.join(directory, \"genes.tsv\"), header=None)\n",
"\n",
" df = pd.DataFrame(\n",
" data.toarray(),\n",
" columns=barcodes.iloc[:, 0].values,\n",
" index=gene_names.iloc[:, 0].values,\n",
" )\n",
" return df\n",
"\n",
"\n",
"def _get_cell_line_labels(directory: str) -> np.array:\n",
" classifications = pd.read_csv(os.path.join(directory, \"classifications.csv\"))\n",
" return classifications.singlet_ID.values\n",
"\n",
"\n",
"def _get_tp53_mutation_status(cell_line_labels: List[str]) -> np.array:\n",
" # Taken from https://cancerdatascience.org/blog/posts/mix-seq/\n",
" TP53_WT = [\n",
" \"LNCAPCLONEFGC_PROSTATE\",\n",
" \"DKMG_CENTRAL_NERVOUS_SYSTEM\",\n",
" \"NCIH226_LUNG\",\n",
" \"RCC10RGB_KIDNEY\",\n",
" \"SNU1079_BILIARY_TRACT\",\n",
" \"CCFSTTG1_CENTRAL_NERVOUS_SYSTEM\",\n",
" \"COV434_OVARY\",\n",
" ]\n",
"\n",
" TP53_mutation_status = [\n",
" \"Wild Type\" if x in TP53_WT else \"Mutation\" for x in cell_line_labels\n",
" ]\n",
" return np.array(TP53_mutation_status)\n",
"\n",
"\n",
"def read_mcfarland_2020(file_directory: str) -> Tuple[pd.DataFrame, pd.DataFrame]:\n",
" \"\"\"\n",
" Read the expression data for Mcfarland et al. 2020 in the given directory.\n",
"\n",
" Args:\n",
" ----\n",
" file_directory: Directory containing Mcfarland et al. 2020 data.\n",
"\n",
" Returns\n",
" -------\n",
" Two data frames of raw count expression data. The first contains\n",
" single-cell gene expression count data from cancer cell lines exposed to\n",
" idasanutlin with cell identification barcodes as column names and gene IDs as\n",
" indices. The second contains count data with the same format from samples\n",
" exposed to a control solution (DMSO).\n",
" \"\"\"\n",
" idasanutlin_dir = os.path.join(\n",
" file_directory, \"idasanutlin\", \"Idasanutlin_24hr_expt1\"\n",
" )\n",
" idasanutlin_df = _read_mixseq_df(idasanutlin_dir)\n",
"\n",
" dmso_dir = os.path.join(file_directory, \"dmso\", \"DMSO_24hr_expt1\")\n",
" dmso_df = _read_mixseq_df(dmso_dir)\n",
"\n",
" return idasanutlin_df, dmso_df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "701585d6-7d2f-409e-b16b-7ca55c0b614d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_31619/762642982.py:6: FutureWarning: X.dtype being converted to np.float32 from int64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" idasanutlin_adata = AnnData(idasanutlin_df)\n",
"/tmp/ipykernel_31619/762642982.py:18: FutureWarning: X.dtype being converted to np.float32 from int64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
" dmso_adata = AnnData(dmso_df)\n",
"/homes/gws/ewein/miniconda3/envs/contrastive-vi-env/lib/python3.10/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
" utils.warn_names_duplicates(\"obs\")\n"
]
}
],
"source": [
"download_path = \"./\"\n",
"\n",
"idasanutlin_df, dmso_df = read_mcfarland_2020(download_path)\n",
"idasanutlin_df, dmso_df = idasanutlin_df.transpose(), dmso_df.transpose()\n",
"\n",
"idasanutlin_adata = AnnData(idasanutlin_df)\n",
"idasanutlin_dir = os.path.join(\n",
" download_path, \"idasanutlin\", \"Idasanutlin_24hr_expt1\"\n",
")\n",
"idasanutlin_adata.obs[\"cell_line\"] = _get_cell_line_labels(idasanutlin_dir)\n",
"idasanutlin_adata.obs[\"TP53_mutation_status\"] = _get_tp53_mutation_status(\n",
" idasanutlin_adata.obs[\"cell_line\"]\n",
")\n",
"idasanutlin_adata.obs[\"condition\"] = np.repeat(\n",
" \"Idasanutlin\", idasanutlin_adata.shape[0]\n",
")\n",
"\n",
"dmso_adata = AnnData(dmso_df)\n",
"dmso_dir = os.path.join(download_path, \"dmso\", \"DMSO_24hr_expt1\")\n",
"dmso_adata.obs[\"cell_line\"] = _get_cell_line_labels(dmso_dir)\n",
"dmso_adata.obs[\"TP53_mutation_status\"] = _get_tp53_mutation_status(\n",
" dmso_adata.obs[\"cell_line\"]\n",
")\n",
"dmso_adata.obs[\"condition\"] = np.repeat(\"DMSO\", dmso_adata.shape[0])\n",
"\n",
"adata = anndata.concat([idasanutlin_adata, dmso_adata])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f1cef90a-4cc9-4dbb-8801-231c9601b10c",
"metadata": {},
"outputs": [],
"source": [
"# Filling in the standard metadata values\n",
"\n",
"adata.obs['perturbation_name'] = adata.obs['condition']\n",
"adata.obs['perturbation_type'] = 'small molecule'\n",
"adata.obs['perturbation_value'] = '24'\n",
"adata.obs['perturbation_unit'] = 'hrs'"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
30 changes: 30 additions & 0 deletions datasets/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import requests
import os

def download_binary_file(
    file_url: str,
    output_path: str,
    overwrite: bool = False,
    timeout: float = 60.0,
) -> None:
    """
    Download binary data file from a URL.

    The payload is streamed to a temporary ``<output_path>.part`` file and only
    moved to ``output_path`` once the transfer has finished, so a failed or
    interrupted download never leaves a truncated file that a later call would
    mistake for a finished one.

    Args:
    ----
        file_url: URL where the file is hosted.
        output_path: Output path for the downloaded file.
        overwrite: Whether to overwrite existing downloaded file.
        timeout: Seconds to wait for the server to connect or to send data
            before giving up.

    Returns
    -------
        None.

    Raises
    ------
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    if os.path.exists(output_path) and not overwrite:
        print(
            f"File {output_path} already exists. "
            "No files downloaded to overwrite the existing file."
        )
        return

    partial_path = f"{output_path}.part"
    with requests.get(file_url, stream=True, timeout=timeout) as response:
        # Fail loudly instead of saving an HTML error page as the data file.
        response.raise_for_status()
        with open(partial_path, "wb") as f:
            # Stream in 1 MiB chunks so large archives are not held in memory.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Publish the file only after the whole body has been written.
    os.replace(partial_path, output_path)
    print(f"Downloaded data from {file_url} at {output_path}")