Skip to content

Commit

Permalink
🚧 create processing function for uniprot results
Browse files Browse the repository at this point in the history
- needs to be further tested and evaluated
  • Loading branch information
enryH committed Feb 19, 2025
1 parent 67cc8bf commit f17a479
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 54 deletions.
33 changes: 6 additions & 27 deletions docs/api_examples/enrichment_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
"import acore\n",
"import acore.differential_regulation\n",
"import acore.enrichment_analysis\n",
"from acore.io.uniprot import fetch_annotations\n",
"\n",
"dsp_pandas.format.set_pandas_options(max_colwidth=60)"
]
Expand Down Expand Up @@ -243,39 +242,19 @@
"metadata": {},
"outputs": [],
"source": [
"from acore.io.uniprot import fetch_annotations, process_annotations\n",
"\n",
"fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n",
"fname = Path(fname_annotations)\n",
"try:\n",
" annotations = pd.read_csv(fname, index_col=0)\n",
" print(f\"Loaded annotations from {fname}\")\n",
"except FileNotFoundError:\n",
" print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n",
" fields = \"go_p,go_c,go_f\"\n",
" annotations = fetch_annotations(df_omics.columns, fields=fields)\n",
" # First column (`From`) is additional to specified fields\n",
" d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n",
"\n",
" # expand go terms\n",
" to_expand = list()\n",
" for field in d_fields_to_col:\n",
" if \"go_\" in field:\n",
" col = d_fields_to_col[field]\n",
" annotations[col] = annotations[col].str.split(\";\")\n",
" to_expand.append(col)\n",
" for col in to_expand:\n",
"    # this is a bit wasteful. Processing to stack format should be done here.\n",
" annotations = annotations.explode(col, ignore_index=True)\n",
" # process other than go term columns\n",
" annotations = (\n",
" annotations.set_index(\"From\")\n",
" .rename_axis(\"identifier\")\n",
" # .drop(\"Entry\", axis=1)\n",
" .rename_axis(\"source\", axis=1)\n",
" .stack()\n",
" .to_frame(\"annotation\")\n",
" .reset_index()\n",
" .drop_duplicates(ignore_index=True)\n",
" )\n",
" FIELDS = \"go_p,go_c,go_f\"\n",
" annotations = fetch_annotations(df_omics.columns, fields=FIELDS)\n",
" annotations = process_annotations(annotations, fields=FIELDS)\n",
" # cache the annotations\n",
" fname.parent.mkdir(exist_ok=True, parents=True)\n",
" annotations.to_csv(fname, index=True)\n",
"\n",
Expand Down
33 changes: 6 additions & 27 deletions docs/api_examples/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import acore
import acore.differential_regulation
import acore.enrichment_analysis
from acore.io.uniprot import fetch_annotations

dsp_pandas.format.set_pandas_options(max_colwidth=60)

Expand Down Expand Up @@ -116,39 +115,19 @@
# in our selection of the dataset.

# %%
from acore.io.uniprot import fetch_annotations, process_annotations

fname_annotations = f"downloaded/annotations_{features_to_sample}.csv"
fname = Path(fname_annotations)
try:
annotations = pd.read_csv(fname, index_col=0)
print(f"Loaded annotations from {fname}")
except FileNotFoundError:
print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.")
fields = "go_p,go_c,go_f"
annotations = fetch_annotations(df_omics.columns, fields=fields)
# First column (`From`) is additional to specified fields
d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])}

# expand go terms
to_expand = list()
for field in d_fields_to_col:
if "go_" in field:
col = d_fields_to_col[field]
annotations[col] = annotations[col].str.split(";")
to_expand.append(col)
for col in to_expand:
    # this is a bit wasteful. Processing to stack format should be done here.
annotations = annotations.explode(col, ignore_index=True)
# process other than go term columns
annotations = (
annotations.set_index("From")
.rename_axis("identifier")
# .drop("Entry", axis=1)
.rename_axis("source", axis=1)
.stack()
.to_frame("annotation")
.reset_index()
.drop_duplicates(ignore_index=True)
)
FIELDS = "go_p,go_c,go_f"
annotations = fetch_annotations(df_omics.columns, fields=FIELDS)
annotations = process_annotations(annotations, fields=FIELDS)
# cache the annotations
fname.parent.mkdir(exist_ok=True, parents=True)
annotations.to_csv(fname, index=True)

Expand Down
44 changes: 44 additions & 0 deletions src/acore/io/uniprot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,47 @@ def fetch_annotations(
results = [line.split("\t") for line in results]
df = pd.DataFrame(results, columns=header)
return df


def process_annotations(annotations: pd.DataFrame, fields: str) -> pd.DataFrame:
"""Process annotations fetched from UniProt API.
Parameters
----------
annotations : pd.DataFrame
DataFrame with annotations fetched from UniProt API.
fields : str
Fields that were fetched from the API. Comma-separated string. Fields
needs to match number of columns in annotations.
Returns
-------
pd.DataFrame
Processed DataFrame with annotations in long-format.
"""
d_fields_to_col = {
k: v for k, v in zip(fields.split(","), annotations.columns[1:], strict=True)
}

# expand go terms
to_expand = list()
for field in d_fields_to_col:
if "go_" in field:
col = d_fields_to_col[field]
annotations[col] = annotations[col].str.split(";")
to_expand.append(col)
for col in to_expand:
# this is a bit wastefull. Processing to stack format should be done here.
annotations = annotations.explode(col, ignore_index=True)
# process other than go term columns
annotations = (
annotations.set_index("From")
.rename_axis("identifier")
# .drop("Entry", axis=1)
.rename_axis("source", axis=1)
.stack()
.to_frame("annotation")
.reset_index()
.drop_duplicates(ignore_index=True)
)
return annotations

0 comments on commit f17a479

Please sign in to comment.