diff --git a/docs/api_examples/Download_PRIDE_data.py b/docs/api_examples/Download_PRIDE_data.py index f029355..9ca219d 100644 --- a/docs/api_examples/Download_PRIDE_data.py +++ b/docs/api_examples/Download_PRIDE_data.py @@ -17,7 +17,7 @@ # This notebook shows how `acore` can be used to download data from # the Proteomics Identifications Database - PRIDE - # ([ebi.ac.uk/pride/](https://www.ebi.ac.uk/pride/)) -# and parse the data to be used in the analytics core. +# and parse the data to be used in the analytics core # and quickly formated to start analyzing them with the functionality in the analytics core. # # > based on CKG recipe: [Download PRIDE Data](https://ckg.readthedocs.io/en/latest/notebooks/recipes/Download_PRIDE_data.html) @@ -36,7 +36,7 @@ # %% [markdown] # ## Parameters # Specify the PRIDE identifier and file to be downloaded -# - and where to store intermediate files. +# and where to store intermediate files. # %% tags=["parameters"] pxd_id: str = "PXD008541" # PRIDE identifier diff --git a/docs/api_examples/enrichment_analysis.ipynb b/docs/api_examples/enrichment_analysis.ipynb index 17e96d8..12b30fa 100644 --- a/docs/api_examples/enrichment_analysis.ipynb +++ b/docs/api_examples/enrichment_analysis.ipynb @@ -8,15 +8,17 @@ }, "source": [ "# Enrichment analysis\n", + "requires\n", + "- some cluster of proteins/genes (e.g. up- and downregulated proteins/genes)\n", + "- functional annotations, i.e. a category summarizing a set of proteins/genes.\n", "\n", - "- we need some groups of genes to compute clusters\n", - "- we need functional annotations, i.e. 
a category summarizing a set of genes.\n", - "-\n", "You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis\n", "on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o).\n", "\n", - "Use example data for ovarian cancer\n", - "([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372))" + "Here we use as example data from an ovarian cancer dataset:\n", + "[PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372)\n", + "\n", + "First make sure you have the required packages installed:" ] }, { @@ -24,14 +26,13 @@ "execution_count": null, "id": "956ed7b7", "metadata": { - "lines_to_next_cell": 2, "tags": [ "hide-output" ] }, "outputs": [], "source": [ - "%pip install acore vuecore" + "%pip install acore vuecore 'plotly<6'" ] }, { @@ -49,7 +50,6 @@ "import acore\n", "import acore.differential_regulation\n", "import acore.enrichment_analysis\n", - "from acore.io.uniprot import fetch_annotations\n", "\n", "dsp_pandas.format.set_pandas_options(max_colwidth=60)" ] @@ -88,7 +88,9 @@ "id": "10ed1830", "metadata": {}, "source": [ - "# Load processed data" + "# Load processed data\n", + "from our repository. See details on obtaining the data under the example data section on\n", + "[this page](nb_ref_ovarian_data)" ] }, { @@ -111,7 +113,12 @@ "metadata": {}, "outputs": [], "source": [ - "ax = df_omics.notna().sum().sort_values(ascending=True).plot()" + "ax = (\n", + " df_omics.notna()\n", + " .sum()\n", + " .sort_values(ascending=True)\n", + " .plot(xlabel=\"Protein groups\", ylabel=\"Number of non-NaN values (samples)\")\n", + ")" ] }, { @@ -120,7 +127,7 @@ "metadata": {}, "source": [ "Keep only features with a certain amount of non-NaN values and select 100 of these\n", - "for illustration. Add the ones which were differently regulated in the ANOVA using all\n", + "for illustration. 
Always add four that were differentially regulated in the ANOVA using all\n", "the protein groups." ] }, @@ -131,8 +138,7 @@ "metadata": {}, "outputs": [], "source": [ - "idx_always_included = [\"Q5HYN5\", \"P39059\", \"O43432\", \"O43175\"]\n", - "df_omics[idx_always_included]" + "idx_always_included = [\"Q5HYN5\", \"P39059\", \"O43432\", \"O43175\"]" ] }, { @@ -140,7 +146,6 @@ "execution_count": null, "id": "1145a2cd", "metadata": { - "lines_to_next_cell": 2, "tags": [ "hide-input" ] @@ -162,6 +167,15 @@ "df_omics" ] }, + { + "cell_type": "markdown", + "id": "ff72465c", + "metadata": {}, + "source": [ + "And we have the following patient metadata, from which we will use the `Status` column as\n", + "our grouping variable and the `PlatinumValue` as a covariate." + ] + }, { "cell_type": "code", "execution_count": null, @@ -179,7 +193,7 @@ "id": "4bbf5dc4", "metadata": {}, "source": [ - "## Compute up and downregulated genes\n", + "# ANOVA: Compute up and downregulated genes\n", "These will be used to find enrichments in the set of both up and downregulated genes." ] }, @@ -191,7 +205,6 @@ "outputs": [], "source": [ "group = \"Status\"\n", - "covariates = [\"PlatinumValue\"]\n", "diff_reg = acore.differential_regulation.run_anova(\n", " df_omics.join(df_meta[[group]]),\n", " drop_cols=[],\n", @@ -217,7 +230,8 @@ "id": "d6c0a225", "metadata": {}, "source": [ - "## Find functional annotations, here pathways\n" + "# Download functional annotations, here pathways, for the protein groups\n", + "in our selection of the dataset." 
] }, { @@ -227,6 +241,8 @@ "metadata": {}, "outputs": [], "source": [ + "from acore.io.uniprot import fetch_annotations, process_annotations\n", + "\n", "fname_annotations = f\"downloaded/annotations_{features_to_sample}.csv\"\n", "fname = Path(fname_annotations)\n", "try:\n", @@ -234,32 +250,10 @@ " print(f\"Loaded annotations from {fname}\")\n", "except FileNotFoundError:\n", " print(f\"Fetching annotations for {df_omics.columns.size} UniProt IDs.\")\n", - " fields = \"go_p,go_c,go_f\"\n", - " annotations = fetch_annotations(df_omics.columns, fields=fields)\n", - " # First column (`From`) is additional to specified fields\n", - " d_fields_to_col = {k: v for k, v in zip(fields.split(\",\"), annotations.columns[1:])}\n", - "\n", - " # expand go terms\n", - " to_expand = list()\n", - " for field in d_fields_to_col:\n", - " if \"go_\" in field:\n", - " col = d_fields_to_col[field]\n", - " annotations[col] = annotations[col].str.split(\";\")\n", - " to_expand.append(col)\n", - " for col in to_expand:\n", - " # this is a bit wastefull. Processing to stack format should be done here.\n", - " annotations = annotations.explode(col, ignore_index=True)\n", - " # process other than go term columns\n", - " annotations = (\n", - " annotations.set_index(\"From\")\n", - " .rename_axis(\"identifier\")\n", - " # .drop(\"Entry\", axis=1)\n", - " .rename_axis(\"source\", axis=1)\n", - " .stack()\n", - " .to_frame(\"annotation\")\n", - " .reset_index()\n", - " .drop_duplicates(ignore_index=True)\n", - " )\n", + " FIELDS = \"go_p,go_c,go_f\"\n", + " annotations = fetch_annotations(df_omics.columns, fields=FIELDS)\n", + " annotations = process_annotations(annotations, fields=FIELDS)\n", + " # cache the annotations\n", " fname.parent.mkdir(exist_ok=True, parents=True)\n", " annotations.to_csv(fname, index=True)\n", "\n", @@ -271,7 +265,8 @@ "id": "d4734452", "metadata": {}, "source": [ - "See how many protein groups are associated with each annotation." 
+ "See how many protein groups are associated with each annotation. We observe that most\n", + "functional annotations are associated only to a single protein group in our dataset." ] }, { @@ -285,13 +280,27 @@ }, "outputs": [], "source": [ - "_ = (\n", - " annotations.groupby(\"annotation\")\n", - " .size()\n", - " .value_counts()\n", - " .sort_index()\n", - " .plot(kind=\"bar\")\n", - ")" + "s_count_pg_per_annotation = (\n", + " annotations.groupby(\"annotation\").size().value_counts().sort_index()\n", + ")\n", + "_ = s_count_pg_per_annotation.plot(\n", + " kind=\"bar\",\n", + " xlabel=\"Number of protein groups associated with annotation\",\n", + " ylabel=\"Number of annotations\",\n", + ")\n", + "s_count_pg_per_annotation.to_frame(\"number of annotations\").rename_axis(\n", + " \"N protein groups\"\n", + ").T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e04c98d8", + "metadata": {}, + "outputs": [], + "source": [ + "annotations.groupby(\"annotation\").size().value_counts(ascending=False)" ] }, { @@ -299,7 +308,7 @@ "id": "4165bc94", "metadata": {}, "source": [ - "## Enrichment analysis\n", + "# Enrichment analysis\n", "Is done separately for up- and downregulated genes as it's assumed that biological\n", "processes are regulated in one direction." ] @@ -334,8 +343,10 @@ "id": "7380a528", "metadata": {}, "source": [ - "- this additionally sets a fold change cutoff\n", - "- and the fore and backgroud populations are changed due to the separation" + "Running the enrichment analysis for the up- and down regulated protein groups\n", + "separately with the default settings of the function, i.e. a log2 fold change cutoff\n", + "of 1 and at least 2 protein groups detected in the set of proteins\n", + "defining the functional annotation." ] }, { @@ -348,8 +359,8 @@ "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", " regulation_data=diff_reg,\n", " annotation=annotations,\n", - " min_detected_in_set=2, # ! 
default is 2, so more conservative\n", - " lfc_cutoff=0.5, # ! the default is 1\n", + " min_detected_in_set=2,\n", + " lfc_cutoff=1,\n", ")\n", "ret" ] @@ -359,46 +370,46 @@ "id": "5cb036be", "metadata": {}, "source": [ - "here we see differences for the same set of differently regulated protein groups,\n", - "which can be reset using lfc_cutoff=0." + "We can decrease the cutoff for the log2 fold change to 0.5 and see that we retain\n", + "more annotations." ] }, { "cell_type": "code", "execution_count": null, - "id": "5e5e2b61", + "id": "3ea367ca", "metadata": {}, "outputs": [], "source": [ "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", " regulation_data=diff_reg,\n", " annotation=annotations,\n", - " min_detected_in_set=1, # ! default is 2, so more conservative\n", - " lfc_cutoff=0.1, # ! the default is 1\n", + " min_detected_in_set=2,\n", + " lfc_cutoff=0.5, # ! the default is 1\n", ")\n", "ret" ] }, { "cell_type": "markdown", - "id": "e3530547", + "id": "e51bd7e3", "metadata": {}, "source": [ - "Or restricting the analysis to functional annotation for which we at least found 2\n", - "protein groups to be upregulated." + "And even more if we do not require finding at least two proteins\n", + "of a functional set in our data set (i.e. we only need to find one match from the set)." ] }, { "cell_type": "code", "execution_count": null, - "id": "3ea367ca", + "id": "7ede296e", "metadata": {}, "outputs": [], "source": [ "ret = acore.enrichment_analysis.run_up_down_regulation_enrichment(\n", " regulation_data=diff_reg,\n", " annotation=annotations,\n", - " min_detected_in_set=2,\n", + " min_detected_in_set=1,\n", " lfc_cutoff=0.5, # ! 
the default is 1\n", ")\n", "ret" @@ -409,7 +420,7 @@ "id": "ecf75e7c", "metadata": {}, "source": [ - "### Site specific enrichment analysis" + "## Site specific enrichment analysis" ] }, { @@ -458,7 +469,7 @@ "id": "dbc34b7b", "metadata": {}, "source": [ - "## Single sample GSEA (ssGSEA)\n", + "# Single sample GSEA (ssGSEA)\n", "Run a gene set enrichment analysis (GSEA) for each sample,\n", "see [article](https://www.nature.com/articles/nature08460#Sec3) and\n", "the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea)\n", @@ -497,7 +508,7 @@ "metadata": {}, "outputs": [], "source": [ - "enrichtments[\"NES\"].plot.hist()" + "ax = enrichtments[\"NES\"].plot.hist()" ] }, { @@ -577,13 +588,14 @@ "metadata": {}, "outputs": [], "source": [ - "from plotly.offline import iplot\n", + "import plotly.graph_objects as go\n", "from vuecore import viz\n", "\n", - "args = {\"factor\": 1, \"loadings\": 10}\n", + "args = {\"factor\": 2, \"loadings\": 1} # increase number of loadings or scaling factor\n", "#! 
pca_results has three items, but docstring requests only two -> double check\n", "figure = viz.get_pca_plot(data=pca_result, identifier=\"PCA enrichment\", args=args)\n", - "iplot(figure)" + "figure = go.Figure(data=figure[\"data\"], layout=figure[\"layout\"])\n", + "figure.show()" ] }, { @@ -591,7 +603,8 @@ "id": "4be5a8c8", "metadata": {}, "source": [ - "## Compare two distributions - KS test\n", + "# Compare two distributions - KS test\n", + "\n", "The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions.\n", "- we compare the distributions of the two differently upregulated protein groups\n", "This is not the best example for comparing distributions, but it shows how to use the\n", @@ -643,6 +656,23 @@ "cell_metadata_filter": "tags,-all", "main_language": "python", "notebook_metadata_filter": "-all" + }, + "kernelspec": { + "display_name": "acore", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" } }, "nbformat": 4, diff --git a/docs/api_examples/enrichment_analysis.py b/docs/api_examples/enrichment_analysis.py index 5f09cd0..1049722 100644 --- a/docs/api_examples/enrichment_analysis.py +++ b/docs/api_examples/enrichment_analysis.py @@ -1,19 +1,20 @@ # %% [markdown] # # Enrichment analysis +# requires +# - some cluster of proteins/genes (e.g. up- and downregulated proteins/genes) +# - functional annotations, i.e. a category summarizing a set of proteins/genes. # -# - we need some groups of genes to compute clusters -# - we need functional annotations, i.e. a category summarizing a set of genes. -# - # You can start with watching Lars Juhl Jensen's brief introduction to enrichment analysis # on [youtube](https://www.youtube.com/watch?v=2NC1QOXmc5o). 
# -# Use example data for ovarian cancer -# ([PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372)) +# Here we use as example data from an ovarian cancer dataset: +# [PXD010372](https://github.com/Multiomics-Analytics-Group/acore/tree/main/example_data/PXD010372) +# +# First make sure you have the required packages installed: # %% tags=["hide-output"] -# %pip install acore vuecore - +# %pip install acore vuecore 'plotly<6' # %% from pathlib import Path @@ -24,7 +25,6 @@ import acore import acore.differential_regulation import acore.enrichment_analysis -from acore.io.uniprot import fetch_annotations dsp_pandas.format.set_pandas_options(max_colwidth=60) @@ -43,6 +43,8 @@ # %% [markdown] # # Load processed data +# from our repository. See details on obtaining the data under the example data section on +# [this page](nb_ref_ovarian_data) # %% df_omics = pd.read_csv(omics, index_col=0) @@ -51,16 +53,20 @@ df_omics # %% -ax = df_omics.notna().sum().sort_values(ascending=True).plot() +ax = ( + df_omics.notna() + .sum() + .sort_values(ascending=True) + .plot(xlabel="Protein groups", ylabel="Number of non-NaN values (samples)") +) # %% [markdown] # Keep only features with a certain amount of non-NaN values and select 100 of these -# for illustration. Add the ones which were differently regulated in the ANOVA using all +# for illustration. Always add four that were differentially regulated in the ANOVA using all # the protein groups. # %% idx_always_included = ["Q5HYN5", "P39059", "O43432", "O43175"] -df_omics[idx_always_included] # %% tags=["hide-input"] df_omics = ( @@ -77,18 +83,20 @@ ) df_omics +# %% [markdown] +# And we have the following patient metadata, from which we will use the `Status` column as +# our grouping variable and the `PlatinumValue` as a covariate. 
# %% df_meta # %% [markdown] -# ## Compute up and downregulated genes +# # ANOVA: Compute up and downregulated genes # These will be used to find enrichments in the set of both up and downregulated genes. # %% group = "Status" -covariates = ["PlatinumValue"] diff_reg = acore.differential_regulation.run_anova( df_omics.join(df_meta[[group]]), drop_cols=[], @@ -102,10 +110,12 @@ diff_reg.query("rejected") # %% [markdown] -# ## Find functional annotations, here pathways -# +# # Download functional annotations, here pathways, for the protein groups +# in our selection of the dataset. # %% +from acore.io.uniprot import fetch_annotations, process_annotations + fname_annotations = f"downloaded/annotations_{features_to_sample}.csv" fname = Path(fname_annotations) try: @@ -113,51 +123,37 @@ print(f"Loaded annotations from {fname}") except FileNotFoundError: print(f"Fetching annotations for {df_omics.columns.size} UniProt IDs.") - fields = "go_p,go_c,go_f" - annotations = fetch_annotations(df_omics.columns, fields=fields) - # First column (`From`) is additional to specified fields - d_fields_to_col = {k: v for k, v in zip(fields.split(","), annotations.columns[1:])} - - # expand go terms - to_expand = list() - for field in d_fields_to_col: - if "go_" in field: - col = d_fields_to_col[field] - annotations[col] = annotations[col].str.split(";") - to_expand.append(col) - for col in to_expand: - # this is a bit wastefull. Processing to stack format should be done here. 
- annotations = annotations.explode(col, ignore_index=True) - # process other than go term columns - annotations = ( - annotations.set_index("From") - .rename_axis("identifier") - # .drop("Entry", axis=1) - .rename_axis("source", axis=1) - .stack() - .to_frame("annotation") - .reset_index() - .drop_duplicates(ignore_index=True) - ) + FIELDS = "go_p,go_c,go_f" + annotations = fetch_annotations(df_omics.columns, fields=FIELDS) + annotations = process_annotations(annotations, fields=FIELDS) + # cache the annotations fname.parent.mkdir(exist_ok=True, parents=True) annotations.to_csv(fname, index=True) annotations # %% [markdown] -# See how many protein groups are associated with each annotation. +# See how many protein groups are associated with each annotation. We observe that most +# functional annotations are associated only to a single protein group in our dataset. # %% tags=["hide-input"] -_ = ( - annotations.groupby("annotation") - .size() - .value_counts() - .sort_index() - .plot(kind="bar") +s_count_pg_per_annotation = ( + annotations.groupby("annotation").size().value_counts().sort_index() ) +_ = s_count_pg_per_annotation.plot( + kind="bar", + xlabel="Number of protein groups associated with annotation", + ylabel="Number of annotations", +) +s_count_pg_per_annotation.to_frame("number of annotations").rename_axis( + "N protein groups" +).T + +# %% +annotations.groupby("annotation").size().value_counts(ascending=False) # %% [markdown] -# ## Enrichment analysis +# # Enrichment analysis # Is done separately for up- and downregulated genes as it's assumed that biological # processes are regulated in one direction. @@ -176,46 +172,48 @@ ].sort_values("log2FC") # %% [markdown] -# - this additionally sets a fold change cutoff -# - and the fore and backgroud populations are changed due to the separation +# Running the enrichment analysis for the up- and down regulated protein groups +# separately with the default settings of the function, i.e. 
a log2 fold change cutoff +# of 1 and at least 2 protein groups detected in the set of proteins +# defining the functional annotation. # %% ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, - min_detected_in_set=2, # ! default is 2, so more conservative - lfc_cutoff=0.5, # ! the default is 1 + min_detected_in_set=2, + lfc_cutoff=1, ) ret # %% [markdown] -# here we see differences for the same set of differently regulated protein groups, -# which can be reset using lfc_cutoff=0. +# We can decrease the cutoff for the log2 fold change to 0.5 and see that we retain +# more annotations. # %% ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, - min_detected_in_set=1, # ! default is 2, so more conservative - lfc_cutoff=0.1, # ! the default is 1 + min_detected_in_set=2, + lfc_cutoff=0.5, # ! the default is 1 ) ret # %% [markdown] -# Or restricting the analysis to functional annotation for which we at least found 2 -# protein groups to be upregulated. +# And even more if we do not require finding at least two proteins +# of a functional set in our data set (i.e. we only need to find one match from the set). # %% ret = acore.enrichment_analysis.run_up_down_regulation_enrichment( regulation_data=diff_reg, annotation=annotations, - min_detected_in_set=2, + min_detected_in_set=1, lfc_cutoff=0.5, # ! 
the default is 1 ) ret # %% [markdown] -# ### Site specific enrichment analysis +# ## Site specific enrichment analysis # %% [markdown] # The basic example uses a modified peptide sequence to @@ -240,7 +238,7 @@ # acore.enrichment_analysis.run_up_down_regulation_enrichment # %% [markdown] -# ## Single sample GSEA (ssGSEA) +# # Single sample GSEA (ssGSEA) # Run a gene set enrichment analysis (GSEA) for each sample, # see [article](https://www.nature.com/articles/nature08460#Sec3) and # the package [`gseapy`](https://gseapy.readthedocs.io/en/latest/run.html#gseapy.ssgsea) @@ -258,7 +256,7 @@ enrichtments.iloc[0].to_dict() # %% -enrichtments["NES"].plot.hist() +ax = enrichtments["NES"].plot.hist() # %% [markdown] # The normalised enrichment score (NES) can be used in a PCA plot to see if the samples @@ -295,16 +293,18 @@ # for this, which is also developed by the Multiomics Analytics Group. # %% -from plotly.offline import iplot +import plotly.graph_objects as go from vuecore import viz -args = {"factor": 1, "loadings": 10} +args = {"factor": 2, "loadings": 1} # increase number of loadings or scaling factor # #! pca_results has three items, but docstring requests only two -> double check figure = viz.get_pca_plot(data=pca_result, identifier="PCA enrichment", args=args) -iplot(figure) +figure = go.Figure(data=figure["data"], layout=figure["layout"]) +figure.show() # %% [markdown] -# ## Compare two distributions - KS test +# # Compare two distributions - KS test +# # The Kolmogorov-Smirnov test is a non-parametric test that compares two distributions. 
# - we compare the distributions of the two differently upregulated protein groups # This is not the best example for comparing distributions, but it shows how to use the diff --git a/docs/api_examples/ovarian_cancer.ipynb b/docs/api_examples/ovarian_cancer.ipynb index 8f35017..ab2a0f7 100644 --- a/docs/api_examples/ovarian_cancer.ipynb +++ b/docs/api_examples/ovarian_cancer.ipynb @@ -5,6 +5,7 @@ "id": "d61c8f55", "metadata": {}, "source": [ + "(nb_ref_ovarian_data)=\n", "# Download from journal (ovarian cancer proteome)\n", "Download the ovarian cancer proteome data from the journal's website. It was\n", "provided as supplementary data. See the article here:\n", diff --git a/docs/api_examples/ovarian_cancer.py b/docs/api_examples/ovarian_cancer.py index 49f6aeb..cf7483e 100644 --- a/docs/api_examples/ovarian_cancer.py +++ b/docs/api_examples/ovarian_cancer.py @@ -1,4 +1,5 @@ # %% [markdown] +# (nb_ref_ovarian_data)= # # Download from journal (ovarian cancer proteome) # Download the ovarian cancer proteome data from the journal's website. It was # provided as supplementary data. 
See the article here: diff --git a/docs/conf.py b/docs/conf.py index 8cb7971..e935d46 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -104,6 +104,8 @@ nb_execution_raise_on_error = True # Rendering nb_merge_streams = True +# maximum execution time per cell in seconds +nb_execution_timeout = 120 # https://myst-nb.readthedocs.io/en/latest/authoring/custom-formats.html#write-custom-formats # nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]} diff --git a/pyproject.toml b/pyproject.toml index 67d1043..e22a071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dependencies = [ "networkx", "biopython", "combat", - "gseapy", + "gseapy!=1.1.5", "kmapper", "lifelines", "pingouin", diff --git a/src/acore/io/uniprot/__init__.py b/src/acore/io/uniprot/__init__.py index 96e6002..d8c8ef6 100644 --- a/src/acore/io/uniprot/__init__.py +++ b/src/acore/io/uniprot/__init__.py @@ -46,3 +46,47 @@ def fetch_annotations( results = [line.split("\t") for line in results] df = pd.DataFrame(results, columns=header) return df + + +def process_annotations(annotations: pd.DataFrame, fields: str) -> pd.DataFrame: + """Process annotations fetched from UniProt API. + + Parameters + ---------- + annotations : pd.DataFrame + Annotations fetched from UniProt API. GO columns are split in place. + fields : str + Fields that were fetched from the API. Comma-separated string. Fields + need to match the columns of annotations, excluding the first one. + + Returns + ------- + pd.DataFrame + Processed DataFrame with annotations in long-format. + """ + d_fields_to_col = { + k: v for k, v in zip(fields.split(","), annotations.columns[1:], strict=True) + } + + # expand go terms + to_expand = list() + for field in d_fields_to_col: + if "go_" in field: + col = d_fields_to_col[field] + annotations[col] = annotations[col].str.split(";") + to_expand.append(col) + for col in to_expand: + # this is a bit wasteful. Processing to stack format should be done here. 
+ annotations = annotations.explode(col, ignore_index=True) + # process other than go term columns + annotations = ( + annotations.set_index("From") + .rename_axis("identifier") + # .drop("Entry", axis=1) + .rename_axis("source", axis=1) + .stack() + .to_frame("annotation") + .reset_index() + .drop_duplicates(ignore_index=True) + ) + return annotations