Skip to content

Commit

Permalink
🚧 use fetch_annotations from pkg, add ssgsea
Browse files Browse the repository at this point in the history
  • Loading branch information
enryH committed Dec 9, 2024
1 parent dba2000 commit ddac13c
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 87 deletions.
90 changes: 52 additions & 38 deletions acore/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,8 +483,8 @@ def run_enrichment(
def run_ssgsea(
data: pd.DataFrame,
annotation: str,
set_index: list[str],
    annotation_col: str = "annotation",
set_index: list[str] = None,
annotation_col: str = "annotation",
identifier_col: str = "identifier",
outdir: str = "tmp",
min_size: int = 15,
Expand All @@ -501,11 +501,11 @@ def run_ssgsea(
:param data: pandas.DataFrame with the quantified features (i.e. subject x proteins)
:param annotation: pandas.DataFrame with the annotation to be used in the enrichment
(i.e. CKG pathway annotation file)
:param str annotation_col: name of the column containing annotation terms.
:param str identifier_col: name of column containing dependent variables identifiers.
:param list set_index: column/s to be used as index. Enrichment will be calculated
for these values (i.e ["subject"] will return subjects x pathways matrix of
enrichment scores)
:param str annotation_col: name of the column containing annotation terms.
:param str identifier_col: name of column containing dependent variables identifiers.
:param str out_dir: directory path where results will be stored
(default None, tmp folder is used)
:param int min_size: minimum number of features (i.e. proteins) in enriched terms
Expand Down Expand Up @@ -548,56 +548,70 @@ def run_ssgsea(
permutations=0
)
"""
result = {}
df = data.copy()
if not os.path.exists(outdir):
os.makedirs(outdir)

    # Combine columns to create a unique name for each set (?)
name = []
index = data[set_index]
for i, row in data[set_index].iterrows():
if set_index is None:
index = data.index.to_frame()
set_index = index.columns.tolist()
else:
index = data[set_index]
df = df.drop(set_index, axis=1)

for i, row in index.iterrows():
name.append(
"_".join(row[set_index].tolist())
) # this assumes strings as identifiers

df["Name"] = name
index.index = name
df = df.drop(set_index, axis=1).set_index("Name").transpose()
df = df.set_index("Name").transpose()

if annotation_col in annotation and identifier_col in annotation:
grouped_annotations = (
annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index()
if not annotation_col in annotation:
raise ValueError(
f"Missing Annotation Column: {annotation_col} as specified by `annotation_col`"
)
fid = uuid.uuid4()
file_path = os.path.join(outdir, str(fid) + ".gmt")
with open(file_path, "w", encoding="utf8") as out:
for i, row in grouped_annotations.iterrows():
out.write(
row[annotation_col]
+ "\t"
+ "\t".join(list(filter(None, row[identifier_col])))
+ "\n"
)
enrichment = gp.ssgsea(
data=df,
gene_sets=str(file_path),
outdir=outdir,
min_size=min_size,
max_size=max_size,
scale=scale,
permutation_num=permutations,
no_plot=True,
processes=1,
seed=10,
format="png",

if not identifier_col in annotation:
raise ValueError(
f"Missing Identifier Column: {identifier_col} as specified by `identifier_col`"
)

enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose()
enrichment_es = enrichment_es.join(index)
enrichment_nes = enrichment.res2d.transpose()
enrichment_nes = enrichment_nes.join(index)
grouped_annotations = (
annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index()
)
fid = uuid.uuid4()
file_path = os.path.join(outdir, str(fid) + ".gmt")
with open(file_path, "w", encoding="utf8") as out:
for i, row in grouped_annotations.iterrows():
out.write(
row[annotation_col]
+ "\t"
+ "\t".join(list(filter(None, row[identifier_col])))
+ "\n"
)
enrichment = gp.ssgsea(
data=df,
gene_sets=str(file_path),
outdir=outdir,
min_size=min_size,
max_size=max_size,
scale=scale,
permutation_num=permutations,
no_plot=True,
processes=1,
seed=10,
format="png",
)

enrichment_es = pd.DataFrame(enrichment.results).transpose() # resultsOnSamples
enrichment_es = enrichment_es.join(index)
enrichment_nes = enrichment.res2d.transpose()
# enrichment_nes = enrichment_nes.join(index) # ! To Fix

result = {"es": enrichment_es, "nes": enrichment_nes}
result = {"es": enrichment_es, "nes": enrichment_nes}

return result
77 changes: 31 additions & 46 deletions docs/api_examples/enrichment_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"import acore\n",
"import acore.differential_regulation\n",
"import acore.enrichment_analysis\n",
"from acore.io.uniprot import fetch_annotations\n",
"\n",
"dsp_pandas.format.set_pandas_options(max_colwidth=15)"
]
Expand Down Expand Up @@ -79,7 +80,7 @@
"omics: str = f\"{base_path}/omics.csv\"\n",
"meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n",
"meta: str = f\"{base_path}/meta_patients.csv\"\n",
"N_to_sample: int = 1_000"
"features_to_sample: int = 100"
]
},
{
Expand Down Expand Up @@ -149,7 +150,7 @@
" .drop(idx_always_included, axis=1)\n",
" .dropna(thresh=18, axis=1)\n",
" .sample(\n",
" N_to_sample - len(idx_always_included),\n",
" features_to_sample - len(idx_always_included),\n",
" axis=1,\n",
" random_state=42,\n",
" )\n",
Expand Down Expand Up @@ -223,47 +224,7 @@
"metadata": {},
"outputs": [],
"source": [
"from acore.io.uniprot import (\n",
" check_id_mapping_results_ready,\n",
" get_id_mapping_results_link,\n",
" get_id_mapping_results_search,\n",
" submit_id_mapping,\n",
")\n",
"\n",
"\n",
"def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:\n",
" \"\"\"Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's\n",
" knowledgebase (KB).\n",
"\n",
" Parameters\n",
" ----------\n",
" ids : pd.Index | list\n",
"        Iterable of UniProt IDs. Fetches annotations as specified by the specified fields.\n",
" fields : str, optional\n",
"        Fields to fetch, by default \"accession,go_p,go_c. See for available fields:\n",
" https://www.uniprot.org/help/return_fields\n",
"\n",
" Returns\n",
" -------\n",
" pd.DataFrame\n",
" DataFrame with annotations of the UniProt IDs.\n",
" \"\"\"\n",
" job_id = submit_id_mapping(from_db=\"UniProtKB_AC-ID\", to_db=\"UniProtKB\", ids=ids)\n",
"\n",
" if check_id_mapping_results_ready(job_id):\n",
" link = get_id_mapping_results_link(job_id)\n",
" # add fields to the link to get more information\n",
" # From and Entry (accession) are the same for UniProt IDs.\n",
" results = get_id_mapping_results_search(\n",
" link + \"?fields=accession,go_p,go_c,go_f&format=tsv\"\n",
" )\n",
" header = results.pop(0).split(\"\\t\")\n",
" results = [line.split(\"\\t\") for line in results]\n",
" df = pd.DataFrame(results, columns=header)\n",
" return df\n",
"\n",
"\n",
"fname_annotations = \"downloaded/annotations.csv\"\n",
"fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n",
"fname = Path(fname_annotations)\n",
"try:\n",
" annotations = pd.read_csv(fname, index_col=0)\n",
Expand Down Expand Up @@ -325,9 +286,7 @@
"cell_type": "code",
"execution_count": null,
"id": "f300c5b5",
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"ret = acore.enrichment_analysis.run_regulation_enrichment(\n",
Expand All @@ -338,6 +297,32 @@
"ret"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88b05e0c",
"metadata": {},
"outputs": [],
"source": [
"res_dict = acore.enrichment_analysis.run_ssgsea(\n",
" data=df_omics,\n",
" annotation=annotations,\n",
" min_size=1,\n",
")\n",
"print(res_dict.keys())\n",
"res_dict[\"es\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9e2fddf",
"metadata": {},
"outputs": [],
"source": [
"res_dict[\"nes\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
17 changes: 14 additions & 3 deletions docs/api_examples/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
omics: str = f"{base_path}/omics.csv"
meta_pgs: str = f"{base_path}/meta_pgs.csv"
meta: str = f"{base_path}/meta_patients.csv"
N_to_sample: int = 1_000
features_to_sample: int = 100

# %% [markdown]
# # Load processed data
Expand Down Expand Up @@ -69,7 +69,7 @@
.drop(idx_always_included, axis=1)
.dropna(thresh=18, axis=1)
.sample(
N_to_sample - len(idx_always_included),
features_to_sample - len(idx_always_included),
axis=1,
random_state=42,
)
Expand Down Expand Up @@ -106,7 +106,7 @@
#

# %%
fname_annotations = "downloaded/annotations.csv"
fname_annotations = f"downloaded/annotations_{features_to_sample}.csv"
fname = Path(fname_annotations)
try:
annotations = pd.read_csv(fname, index_col=0)
Expand Down Expand Up @@ -155,5 +155,16 @@
)
ret

# %%
res_dict = acore.enrichment_analysis.run_ssgsea(
data=df_omics,
annotation=annotations,
min_size=1,
)
print(res_dict.keys())
res_dict["es"]

# %%
res_dict["nes"]

# %%

0 comments on commit ddac13c

Please sign in to comment.