Skip to content

Commit

Permalink
🚧 use fetch_annotations from pkg, add ssgsea
Browse files Browse the repository at this point in the history
  • Loading branch information
enryH committed Dec 9, 2024
1 parent dba2000 commit ddac13c
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 87 deletions.
90 changes: 52 additions & 38 deletions acore/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,8 +483,8 @@ def run_enrichment(
def run_ssgsea(
data: pd.DataFrame,
annotation: str,
set_index: list[str],
    annotation_col: str = "annotation",
set_index: list[str] = None,
annotation_col: str = "annotation",
identifier_col: str = "identifier",
outdir: str = "tmp",
min_size: int = 15,
Expand All @@ -501,11 +501,11 @@ def run_ssgsea(
:param data: pandas.DataFrame with the quantified features (i.e. subject x proteins)
:param annotation: pandas.DataFrame with the annotation to be used in the enrichment
(i.e. CKG pathway annotation file)
:param str annotation_col: name of the column containing annotation terms.
:param str identifier_col: name of column containing dependent variables identifiers.
:param list set_index: column/s to be used as index. Enrichment will be calculated
for these values (i.e ["subject"] will return subjects x pathways matrix of
enrichment scores)
:param str annotation_col: name of the column containing annotation terms.
:param str identifier_col: name of column containing dependent variables identifiers.
:param str out_dir: directory path where results will be stored
(default None, tmp folder is used)
:param int min_size: minimum number of features (i.e. proteins) in enriched terms
Expand Down Expand Up @@ -548,56 +548,70 @@ def run_ssgsea(
permutations=0
)
"""
result = {}
df = data.copy()
if not os.path.exists(outdir):
os.makedirs(outdir)

    # Combine columns to create a unique name for each set (?)
name = []
index = data[set_index]
for i, row in data[set_index].iterrows():
if set_index is None:
index = data.index.to_frame()
set_index = index.columns.tolist()
else:
index = data[set_index]
df = df.drop(set_index, axis=1)

for i, row in index.iterrows():
name.append(
"_".join(row[set_index].tolist())
) # this assumes strings as identifiers

df["Name"] = name
index.index = name
df = df.drop(set_index, axis=1).set_index("Name").transpose()
df = df.set_index("Name").transpose()

if annotation_col in annotation and identifier_col in annotation:
grouped_annotations = (
annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index()
if not annotation_col in annotation:
raise ValueError(
f"Missing Annotation Column: {annotation_col} as specified by `annotation_col`"
)
fid = uuid.uuid4()
file_path = os.path.join(outdir, str(fid) + ".gmt")
with open(file_path, "w", encoding="utf8") as out:
for i, row in grouped_annotations.iterrows():
out.write(
row[annotation_col]
+ "\t"
+ "\t".join(list(filter(None, row[identifier_col])))
+ "\n"
)
enrichment = gp.ssgsea(
data=df,
gene_sets=str(file_path),
outdir=outdir,
min_size=min_size,
max_size=max_size,
scale=scale,
permutation_num=permutations,
no_plot=True,
processes=1,
seed=10,
format="png",

if not identifier_col in annotation:
raise ValueError(
f"Missing Identifier Column: {identifier_col} as specified by `identifier_col`"
)

enrichment_es = pd.DataFrame(enrichment.resultsOnSamples).transpose()
enrichment_es = enrichment_es.join(index)
enrichment_nes = enrichment.res2d.transpose()
enrichment_nes = enrichment_nes.join(index)
grouped_annotations = (
annotation.groupby(annotation_col)[identifier_col].apply(list).reset_index()
)
fid = uuid.uuid4()
file_path = os.path.join(outdir, str(fid) + ".gmt")
with open(file_path, "w", encoding="utf8") as out:
for i, row in grouped_annotations.iterrows():
out.write(
row[annotation_col]
+ "\t"
+ "\t".join(list(filter(None, row[identifier_col])))
+ "\n"
)
enrichment = gp.ssgsea(
data=df,
gene_sets=str(file_path),
outdir=outdir,
min_size=min_size,
max_size=max_size,
scale=scale,
permutation_num=permutations,
no_plot=True,
processes=1,
seed=10,
format="png",
)

enrichment_es = pd.DataFrame(enrichment.results).transpose() # resultsOnSamples
enrichment_es = enrichment_es.join(index)
enrichment_nes = enrichment.res2d.transpose()
# enrichment_nes = enrichment_nes.join(index) # ! To Fix

result = {"es": enrichment_es, "nes": enrichment_nes}
result = {"es": enrichment_es, "nes": enrichment_nes}

return result
77 changes: 31 additions & 46 deletions docs/api_examples/enrichment_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
"import acore\n",
"import acore.differential_regulation\n",
"import acore.enrichment_analysis\n",
"from acore.io.uniprot import fetch_annotations\n",
"\n",
"dsp_pandas.format.set_pandas_options(max_colwidth=15)"
]
Expand Down Expand Up @@ -79,7 +80,7 @@
"omics: str = f\"{base_path}/omics.csv\"\n",
"meta_pgs: str = f\"{base_path}/meta_pgs.csv\"\n",
"meta: str = f\"{base_path}/meta_patients.csv\"\n",
"N_to_sample: int = 1_000"
"features_to_sample: int = 100"
]
},
{
Expand Down Expand Up @@ -149,7 +150,7 @@
" .drop(idx_always_included, axis=1)\n",
" .dropna(thresh=18, axis=1)\n",
" .sample(\n",
" N_to_sample - len(idx_always_included),\n",
" features_to_sample - len(idx_always_included),\n",
" axis=1,\n",
" random_state=42,\n",
" )\n",
Expand Down Expand Up @@ -223,47 +224,7 @@
"metadata": {},
"outputs": [],
"source": [
"from acore.io.uniprot import (\n",
" check_id_mapping_results_ready,\n",
" get_id_mapping_results_link,\n",
" get_id_mapping_results_search,\n",
" submit_id_mapping,\n",
")\n",
"\n",
"\n",
"def fetch_annotations(ids: pd.Index | list) -> pd.DataFrame:\n",
" \"\"\"Fetch annotations for UniProt IDs. Combines several calls to the API of UniProt's\n",
" knowledgebase (KB).\n",
"\n",
" Parameters\n",
" ----------\n",
" ids : pd.Index | list\n",
"        Iterable of UniProt IDs. Fetches annotations as specified by the specified fields.\n",
" fields : str, optional\n",
"        Fields to fetch, by default \"accession,go_p,go_c. See for available fields:\n",
" https://www.uniprot.org/help/return_fields\n",
"\n",
" Returns\n",
" -------\n",
" pd.DataFrame\n",
" DataFrame with annotations of the UniProt IDs.\n",
" \"\"\"\n",
" job_id = submit_id_mapping(from_db=\"UniProtKB_AC-ID\", to_db=\"UniProtKB\", ids=ids)\n",
"\n",
" if check_id_mapping_results_ready(job_id):\n",
" link = get_id_mapping_results_link(job_id)\n",
" # add fields to the link to get more information\n",
" # From and Entry (accession) are the same for UniProt IDs.\n",
" results = get_id_mapping_results_search(\n",
" link + \"?fields=accession,go_p,go_c,go_f&format=tsv\"\n",
" )\n",
" header = results.pop(0).split(\"\\t\")\n",
" results = [line.split(\"\\t\") for line in results]\n",
" df = pd.DataFrame(results, columns=header)\n",
" return df\n",
"\n",
"\n",
"fname_annotations = \"downloaded/annotations.csv\"\n",
"fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n",
"fname = Path(fname_annotations)\n",
"try:\n",
" annotations = pd.read_csv(fname, index_col=0)\n",
Expand Down Expand Up @@ -325,9 +286,7 @@
"cell_type": "code",
"execution_count": null,
"id": "f300c5b5",
"metadata": {
"lines_to_next_cell": 2
},
"metadata": {},
"outputs": [],
"source": [
"ret = acore.enrichment_analysis.run_regulation_enrichment(\n",
Expand All @@ -338,6 +297,32 @@
"ret"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88b05e0c",
"metadata": {},
"outputs": [],
"source": [
"res_dict = acore.enrichment_analysis.run_ssgsea(\n",
" data=df_omics,\n",
" annotation=annotations,\n",
" min_size=1,\n",
")\n",
"print(res_dict.keys())\n",
"res_dict[\"es\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9e2fddf",
"metadata": {},
"outputs": [],
"source": [
"res_dict[\"nes\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
17 changes: 14 additions & 3 deletions docs/api_examples/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
omics: str = f"{base_path}/omics.csv"
meta_pgs: str = f"{base_path}/meta_pgs.csv"
meta: str = f"{base_path}/meta_patients.csv"
N_to_sample: int = 1_000
features_to_sample: int = 100

# %% [markdown]
# # Load processed data
Expand Down Expand Up @@ -69,7 +69,7 @@
.drop(idx_always_included, axis=1)
.dropna(thresh=18, axis=1)
.sample(
N_to_sample - len(idx_always_included),
features_to_sample - len(idx_always_included),
axis=1,
random_state=42,
)
Expand Down Expand Up @@ -106,7 +106,7 @@
#

# %%
fname_annotations = "downloaded/annotations.csv"
fname_annotations = f"downloaded/annotations_{features_to_sample}.csv"
fname = Path(fname_annotations)
try:
annotations = pd.read_csv(fname, index_col=0)
Expand Down Expand Up @@ -155,5 +155,16 @@
)
ret

# %%
res_dict = acore.enrichment_analysis.run_ssgsea(
data=df_omics,
annotation=annotations,
min_size=1,
)
print(res_dict.keys())
res_dict["es"]

# %%
res_dict["nes"]

# %%

0 comments on commit ddac13c

Please sign in to comment.