Skip to content

Commit

Permalink
🚧 create processing function for uniprot results
Browse files Browse the repository at this point in the history
- needs to be further tested and evaluated
  • Loading branch information
enryH committed Feb 19, 2025
1 parent 67cc8bf commit f17a479
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 54 deletions.
33 changes: 6 additions & 27 deletions docs/api_examples/enrichment_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
"import acore\n",
"import acore.differential_regulation\n",
"import acore.enrichment_analysis\n",
"from acore.io.uniprot import fetch_annotations\n",
"\n",
"dsp_pandas.format.set_pandas_options(max_colwidth=60)"
]
Expand Down Expand Up @@ -243,39 +242,19 @@
"metadata": {},
"outputs": [],
"source": [
"from acore.io.uniprot import fetch_annotations, process_annotations\n",
"\n",
"fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n",
"fname = Path(fname_annotations)\n",
"try:\n",
" annotations = pd.read_csv(fname, index_col=0)\n",
" print(f\"Loaded annotations from {fname}\")\n",
"except FileNotFoundError:\n",
" print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n",
" fields = \"go_p,go_c,go_f\"\n",
" annotations = fetch_annotations(df_omics.columns, fields=fields)\n",
" # First column (`From`) is additional to specified fields\n",
" d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n",
"\n",
" # expand go terms\n",
" to_expand = list()\n",
" for field in d_fields_to_col:\n",
" if \"go_\" in field:\n",
" col = d_fields_to_col[field]\n",
" annotations[col] = annotations[col].str.split(\";\")\n",
" to_expand.append(col)\n",
" for col in to_expand:\n",
"    # this is a bit wasteful. Processing to stack format should be done here.\n",
" annotations = annotations.explode(col, ignore_index=True)\n",
" # process other than go term columns\n",
" annotations = (\n",
" annotations.set_index(\"From\")\n",
" .rename_axis(\"identifier\")\n",
" # .drop(\"Entry\", axis=1)\n",
" .rename_axis(\"source\", axis=1)\n",
" .stack()\n",
" .to_frame(\"annotation\")\n",
" .reset_index()\n",
" .drop_duplicates(ignore_index=True)\n",
" )\n",
" FIELDS = \"go_p,go_c,go_f\"\n",
" annotations = fetch_annotations(df_omics.columns, fields=FIELDS)\n",
" annotations = process_annotations(annotations, fields=FIELDS)\n",
" # cache the annotations\n",
" fname.parent.mkdir(exist_ok=True, parents=True)\n",
" annotations.to_csv(fname, index=True)\n",
"\n",
Expand Down
33 changes: 6 additions & 27 deletions docs/api_examples/enrichment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import acore
import acore.differential_regulation
import acore.enrichment_analysis
from acore.io.uniprot import fetch_annotations

dsp_pandas.format.set_pandas_options(max_colwidth=60)

Expand Down Expand Up @@ -116,39 +115,19 @@
# in our selection of the dataset.

# %%
from acore.io.uniprot import fetch_annotations, process_annotations

fname_annotations = f"downloaded/annotations_{features_to_sample}.csv"
fname = Path(fname_annotations)
try:
annotations = pd.read_csv(fname, index_col=0)
print(f"Loaded annotations from {fname}")
except FileNotFoundError:
print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.")
fields = "go_p,go_c,go_f"
annotations = fetch_annotations(df_omics.columns, fields=fields)
# First column (`From`) is additional to specified fields
d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])}

# expand go terms
to_expand = list()
for field in d_fields_to_col:
if "go_" in field:
col = d_fields_to_col[field]
annotations[col] = annotations[col].str.split(";")
to_expand.append(col)
for col in to_expand:
    # this is a bit wasteful. Processing to stack format should be done here.
annotations = annotations.explode(col, ignore_index=True)
# process other than go term columns
annotations = (
annotations.set_index("From")
.rename_axis("identifier")
# .drop("Entry", axis=1)
.rename_axis("source", axis=1)
.stack()
.to_frame("annotation")
.reset_index()
.drop_duplicates(ignore_index=True)
)
FIELDS = "go_p,go_c,go_f"
annotations = fetch_annotations(df_omics.columns, fields=FIELDS)
annotations = process_annotations(annotations, fields=FIELDS)
# cache the annotations
fname.parent.mkdir(exist_ok=True, parents=True)
annotations.to_csv(fname, index=True)

Expand Down
44 changes: 44 additions & 0 deletions src/acore/io/uniprot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,47 @@ def fetch_annotations(
results = [line.split("\t") for line in results]
df = pd.DataFrame(results, columns=header)
return df


def process_annotations(annotations: pd.DataFrame, fields: str) -> pd.DataFrame:
"""Process annotations fetched from UniProt API.
Parameters
----------
annotations : pd.DataFrame
DataFrame with annotations fetched from UniProt API.
fields : str
Fields that were fetched from the API. Comma-separated string. Fields
needs to match number of columns in annotations.
Returns
-------
pd.DataFrame
Processed DataFrame with annotations in long-format.
"""
d_fields_to_col = {
k: v for k, v in zip(fields.split(","), annotations.columns[1:], strict=True)
}

# expand go terms
to_expand = list()
for field in d_fields_to_col:
if "go_" in field:
col = d_fields_to_col[field]
annotations[col] = annotations[col].str.split(";")
to_expand.append(col)
for col in to_expand:
# this is a bit wastefull. Processing to stack format should be done here.
annotations = annotations.explode(col, ignore_index=True)
# process other than go term columns
annotations = (
annotations.set_index("From")
.rename_axis("identifier")
# .drop("Entry", axis=1)
.rename_axis("source", axis=1)
.stack()
.to_frame("annotation")
.reset_index()
.drop_duplicates(ignore_index=True)
)
return annotations

0 comments on commit f17a479

Please sign in to comment.