Commit

Merge branch 'main' into ancova_updated

enryH authored Nov 28, 2024
2 parents 8b9adb6 + b9b035f commit 48a37df
Showing 13 changed files with 299 additions and 35 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -104,3 +104,6 @@ ENV/
# IDE settings
.vscode/
.idea/

# Mac
.DS_Store
3 changes: 3 additions & 0 deletions acore/__init__.py
@@ -1,3 +1,6 @@
from importlib.metadata import version

import dsp_pandas # sets up pandas formatting options

__all__ = ["dsp_pandas"]
__version__ = version("acore")
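
As a quick illustration of what the updated `__init__.py` does on import (a minimal sketch; it assumes `acore` and its `dsp_pandas` dependency are installed in the current environment):

# Minimal sketch: importing acore now also imports dsp_pandas (which adjusts
# pandas display formatting as a side effect) and exposes the package version.
import acore

print(acore.__version__)  # resolved at import time via importlib.metadata
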
10 changes: 10 additions & 0 deletions acore/correlation_analysis.py
@@ -10,6 +10,16 @@
from acore.multiple_testing import apply_pvalue_correction


def corr_lower_triangle(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
"""Compute the correlation matrix, returning only unique values (lower triangle).
Passes kwargs to pandas.DataFrame.corr method.
"""
corr_df = df.corr(**kwargs)
lower_triangle = pd.DataFrame(np.tril(np.ones(corr_df.shape), -1)).astype(bool)
lower_triangle.index, lower_triangle.columns = corr_df.index, corr_df.columns
return corr_df.where(lower_triangle)


def calculate_correlations(x, y, method="pearson"):
"""
Calculates a Spearman (nonparametric) or a Pearson (parametric) correlation coefficient and p-value to test for non-correlation.
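
A short usage sketch for the new `corr_lower_triangle` helper; the toy DataFrame and the `method="spearman"` choice are illustrative, not part of the commit:

import pandas as pd

from acore.correlation_analysis import corr_lower_triangle

df = pd.DataFrame(
    {
        "a": [1.0, 2.0, 3.0, 4.0, 5.0],
        "b": [2.1, 3.9, 6.2, 8.1, 9.9],
        "c": [5.0, 1.0, 4.0, 2.0, 3.0],
    }
)
# extra keyword arguments are passed through to pandas.DataFrame.corr
corr = corr_lower_triangle(df, method="spearman")
# the diagonal and the upper triangle are NaN, so each pair appears exactly once
print(corr)
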
17 changes: 1 addition & 16 deletions acore/sklearn/pca.py → acore/decomposition/pca.py
@@ -1,10 +1,8 @@
from typing import Optional

import matplotlib
import pandas as pd
import sklearn.decomposition


# ! also a version in exploratory analysis
def run_pca(
df_wide: pd.DataFrame, n_components: int = 2
) -> tuple[pd.DataFrame, sklearn.decomposition.PCA]:
@@ -35,16 +33,3 @@ def run_pca(
]
PCs = pd.DataFrame(PCs, index=df_wide.index, columns=cols)
return PCs, pca


# ! move to aviz
def plot_explained_variance(
pca: sklearn.decomposition.PCA, ax: Optional[matplotlib.axes.Axes] = None
) -> matplotlib.axes.Axes:
"""Plot explained variance of PCA from scikit-learn."""
exp_var = pd.Series(pca.explained_variance_ratio_).to_frame("explained variance")
exp_var.index += 1 # start at 1
exp_var["explained variance (cummulated)"] = exp_var["explained variance"].cumsum()
exp_var.index.name = "PC"
ax = exp_var.plot(ax=ax)
return ax
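
After the rename, `run_pca` lives under `acore.decomposition`; a minimal sketch with random data (the data and column names are illustrative only):

import numpy as np
import pandas as pd

from acore.decomposition.pca import run_pca

rng = np.random.default_rng(42)
df_wide = pd.DataFrame(rng.normal(size=(20, 5)), columns=list("abcde"))

# returns the principal-component scores and the fitted sklearn PCA estimator
PCs, pca = run_pca(df_wide, n_components=2)
print(PCs.head())
print(pca.explained_variance_ratio_)
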
10 changes: 5 additions & 5 deletions acore/differential_regulation.py
@@ -37,16 +37,16 @@ def calc_means_between_groups(


def calc_ttest(
df: pd.DataFrame, boolean_array: pd.Series, vars: list[str]
df: pd.DataFrame, boolean_array: pd.Series, variables: list[str]
) -> pd.DataFrame:
"""Calculate t-test for each variable in `vars` between two groups defined
"""Calculate t-test for each variable in `variables` between two groups defined
by boolean array."""
ret = []
for var in vars:
for var in variables:
_ = pg.ttest(df.loc[boolean_array, var], df.loc[~boolean_array, var])
ret.append(_)
ret = pd.concat(ret)
ret = ret.set_index(vars)
ret = ret.set_index(variables)
ret.columns.name = "ttest"
ret.columns = pd.MultiIndex.from_product(
[["ttest"], ret.columns], names=("test", "var")
@@ -65,7 +65,7 @@ def run_diff_analysis(
ret = calc_means_between_groups(
df, boolean_array=boolean_array, event_names=event_names
)
ttests = calc_ttest(df, boolean_array=boolean_array, vars=ret.index)
ttests = calc_ttest(df, boolean_array=boolean_array, variables=ret.index)
ret = ret.join(ttests.loc[:, pd.IndexSlice[:, ttest_vars]])
return ret

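A usage sketch for the renamed `variables` keyword of `calc_ttest` (requires `pingouin`; the group assignment and measurement values below are made up). Following `run_diff_analysis`, the variable names are passed as an Index, since they become the row index of the combined t-test table:

import pandas as pd

from acore.differential_regulation import calc_ttest

df = pd.DataFrame(
    {
        "protein_a": [1.2, 1.4, 1.3, 2.1, 2.4, 2.2],
        "protein_b": [0.9, 1.1, 1.0, 1.0, 0.9, 1.1],
    }
)
is_case = pd.Series([True, True, True, False, False, False], index=df.index)

# keyword is now `variables` (formerly `vars`, which shadowed the builtin);
# pass an Index of names, as run_diff_analysis does with ret.index
ttests = calc_ttest(df, boolean_array=is_case, variables=df.columns)
print(ttests)
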
206 changes: 206 additions & 0 deletions acore/plotting/__init__.py
@@ -0,0 +1,206 @@
"""
This module contains functions to plot data. It will be moved to a separate
visualization package.
"""

import logging
import pathlib
from typing import Iterable

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rcParams["figure.figsize"] = [4.0, 3.0]
plt.rcParams["pdf.fonttype"] = 42
plt.rcParams["ps.fonttype"] = 42

plt.rcParams["figure.dpi"] = 147

figsize_a4 = (8.3, 11.7)

logger = logging.getLogger(__name__)


def savefig(
fig: matplotlib.figure.Figure,
name: str,
folder: pathlib.Path = ".",
pdf=True,
tight_layout=True,
dpi=300,
):
"""Save matplotlib Figure (having method `savefig`) as pdf and png."""
folder = pathlib.Path(folder)
fname = folder / name
folder = fname.parent # in case name specifies folders
folder.mkdir(exist_ok=True, parents=True)
if not fig.get_constrained_layout() and tight_layout:
fig.tight_layout()
fig.savefig(fname.with_suffix(".png"), bbox_inches="tight", dpi=dpi)
if pdf:
fig.savefig(fname.with_suffix(".pdf"), bbox_inches="tight", dpi=dpi)
logger.info(f"Saved Figures to {fname}")


def select_xticks(ax: matplotlib.axes.Axes, max_ticks: int = 50) -> list:
"""Limit the number of xticks displayed.
Parameters
----------
ax : matplotlib.axes.Axes
Axes object to manipulate
max_ticks : int, optional
maximum number of set ticks on x-axis, by default 50
Returns
-------
list
list of current x-axis ticks, either the newly set
or the original ones (depending on whether they were changed).
"""
x_ticks = ax.get_xticks()
offset = len(x_ticks) // max_ticks
if offset > 1: # if larger than 1
return ax.set_xticks(x_ticks[::offset])
return x_ticks


def select_dates(date_series: pd.Series, max_ticks=30) -> np.ndarray:
"""Get unique dates (single days) for selection in pd.plot.line
with xticks argument.
Parameters
----------
date_series : pd.Series
datetime series to use (values, not index)
max_ticks : int, optional
maximum number of unique ticks to select, by default 30
Returns
-------
np.ndarray
array of selected dates
"""
xticks = date_series.dt.date.unique()
offset = len(xticks) // max_ticks
if offset > 1:
return xticks[::offset]
else:
return xticks


def make_large_descriptors(size="xx-large"):
"""Helper function to set very large titles, labels and tick texts by
default for matplotlib plots.
size: str
fontsize or an allowed size category, by default 'xx-large'
"""
plt.rcParams.update(
{
k: size
for k in [
"xtick.labelsize",
"ytick.labelsize",
"axes.titlesize",
"axes.labelsize",
"legend.fontsize",
"legend.title_fontsize",
]
}
)


set_font_sizes = make_large_descriptors


def add_prop_as_second_yaxis(
ax: matplotlib.axes.Axes, n_samples: int, format_str: str = "{x:,.3f}"
) -> matplotlib.axes.Axes:
"""Add proportion as second axis. Try to align cleverly
Parameters
----------
ax : matplotlib.axes.Axes
Axes for which you want to add a second y-axis
n_samples : int
Number of total samples (to normalize against)
Returns
-------
matplotlib.axes.Axes
Second layover twin Axes with right-hand side y-axis
"""
ax2 = ax.twinx()
n_min, n_max = np.round(ax.get_ybound())
logger.info(f"{n_min = }, {n_max = }")
lower_prop = n_min / n_samples + (ax.get_ybound()[0] - n_min) / n_samples
upper_prop = n_max / n_samples + (ax.get_ybound()[1] - n_max) / n_samples
logger.info(f"{lower_prop = }, {upper_prop = }")
ax2.set_ybound(lower_prop, upper_prop)
# _ = ax2.set_yticks(np.linspace(n_min/n_samples,
# n_max /n_samples, len(ax.get_yticks())-2))
_ = ax2.set_yticks(ax.get_yticks()[1:-1] / n_samples)
ax2.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str))
return ax2


def add_height_to_barplot(
ax: matplotlib.axes.Axes, size: int = 15
) -> matplotlib.axes.Axes:
"""Add height of bar to each bar in a barplot."""
for bar in ax.patches:
ax.annotate(
text=format(bar.get_height(), ".2f"),
xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
xytext=(0, 7),
ha="center",
va="center",
size=size,
textcoords="offset points",
)
return ax


def add_text_to_barplot(
ax: matplotlib.axes.Axes, text: Iterable[str], size=15
) -> matplotlib.axes.Axes:
"""Add custom text from Iterable to each bar in a barplot."""
for bar, text_bar in zip(ax.patches, text):
msg = f"{bar = }, {text = }, {bar.get_height() = }"
logger.debug(msg)
ax.annotate(
text=text_bar,
xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
xytext=(0, -5),
rotation=90,
ha="center",
va="top",
size=size,
textcoords="offset points",
)
return ax


def format_large_numbers(
ax: matplotlib.axes.Axes, format_str: str = "{x:,.0f}"
) -> matplotlib.axes.Axes:
"""Format large integer numbers to be read more easily.
Parameters
----------
ax : matplotlib.axes.Axes
Axes whose labels should be manipulated.
format_str : str, optional
Default float format string, by default '{x:,.0f}'
Returns
-------
matplotlib.axes.Axes
Return reference to modified input Axes object.
"""
ax.xaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str))
ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter(format_str))
return ax
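
A short sketch exercising two of the new helpers, `add_height_to_barplot` and `savefig` (the bar data and the `figures/` output folder are illustrative):

import pandas as pd

from acore.plotting import add_height_to_barplot, savefig

counts = pd.Series({"group A": 125.0, "group B": 98.0, "group C": 43.0})
ax = counts.plot.bar(rot=0)
ax = add_height_to_barplot(ax)  # annotate each bar with its height ("125.00", ...)
# writes figures/bar_counts.png and figures/bar_counts.pdf
savefig(ax.get_figure(), name="bar_counts", folder="figures")
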
19 changes: 19 additions & 0 deletions acore/plotting/decomposition.py
@@ -0,0 +1,19 @@
"""Decompositon plots like pca, umap, tsne, etc."""

from typing import Optional

import matplotlib
import pandas as pd
import sklearn.decomposition


def plot_explained_variance(
pca: sklearn.decomposition.PCA, ax: Optional[matplotlib.axes.Axes] = None
) -> matplotlib.axes.Axes:
"""Plot explained variance of PCA from scikit-learn."""
exp_var = pd.Series(pca.explained_variance_ratio_).to_frame("explained variance")
exp_var.index += 1 # start at 1
exp_var["explained variance (cummulated)"] = exp_var["explained variance"].cumsum()
exp_var.index.name = "PC"
ax = exp_var.plot(ax=ax)
return ax
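
A sketch that wires the relocated pieces together: fit a PCA with `acore.decomposition.pca.run_pca` and plot its explained variance with the relocated `plot_explained_variance` (random data for illustration):

import numpy as np
import pandas as pd

from acore.decomposition.pca import run_pca
from acore.plotting.decomposition import plot_explained_variance

rng = np.random.default_rng(0)
df_wide = pd.DataFrame(rng.normal(size=(30, 6)))

PCs, pca = run_pca(df_wide, n_components=4)
ax = plot_explained_variance(pca)  # per-PC and cumulative explained variance
ax.set_xlabel("principal component")
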
17 changes: 16 additions & 1 deletion docs/api_examples/exploratory_analysis.ipynb
@@ -8,6 +8,20 @@
"# Exploratory Analysis"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2958fb55",
"metadata": {
"tags": [
"hide-output"
]
},
"outputs": [],
"source": [
"%pip install acore"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -16,6 +30,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"import acore.exploratory_analysis as ea\n",
"\n",
"data = pd.DataFrame(\n",
@@ -129,7 +144,7 @@
"metadata": {},
"outputs": [],
"source": [
"result['umap']"
"result[\"umap\"]"
]
},
{
6 changes: 5 additions & 1 deletion docs/api_examples/exploratory_analysis.py
@@ -1,8 +1,12 @@
# %% [markdown]
# # Exploratory Analysis

# %%
# %pip install acore

# %%
import pandas as pd

import acore.exploratory_analysis as ea

data = pd.DataFrame(
@@ -52,7 +56,7 @@
)

# %%
result['umap']
result["umap"]

# %%
annotation

0 comments on commit 48a37df
