Skip to content

Commit

Permalink
🎨 shorten docstrings, use f-strings (pylint suggestion), parameter names
Browse files Browse the repository at this point in the history
- still some open issues: lists as default parameters, unused arguments, and too-general exceptions
  • Loading branch information
enryH committed Feb 21, 2025
1 parent 444f384 commit 6814f09
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 66 deletions.
134 changes: 100 additions & 34 deletions src/acore/differential_regulation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,15 @@
get_max_permutations,
)

from .tests import (
from .tests import ( # calculate_thsd, complement_posthoc,
calc_means_between_groups,
calc_ttest,
calculate_ancova,
calculate_anova,
calculate_mixed_anova,
calculate_pairwise_ttest,
calculate_repeated_measures_anova,
calculate_THSD,
calculate_ttest,
complement_posthoc,
eta_squared,
format_anova_table,
omega_squared,
Expand Down Expand Up @@ -73,21 +71,34 @@ def run_anova(
):
"""
Performs statistical test for each protein in a dataset.
Checks what type of data is the input (paired, unpaired or repeated measurements) and performs posthoc tests for multiclass data.
Multiple hypothesis correction uses permutation-based if permutations>0 and Benjamini/Hochberg if permutations=0.
Checks what type of data is the input (paired, unpaired or repeated measurements) and
performs posthoc tests for multiclass data.
Multiple hypothesis correction is permutation-based
if permutations>0 and Benjamini/Hochberg if permutations=0.
:param df: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
:param df: pandas dataframe with samples as rows and protein identifiers as columns
(with additional columns 'group', 'sample' and 'subject').
:param str subject: column with subject identifiers
:param str group: column with group identifiers
:param list drop_cols: column labels to be dropped from the dataframe
:param float alpha: error rate for multiple hypothesis correction
:param int permutations: number of permutations used to estimate false discovery rates.
:param bool non_par: if True, normality and variance equality assumptions are checked and non-parametric test Mann Whitney U test if not passed
:return: Pandas dataframe with columns 'identifier', 'group1', 'group2', 'mean(group1)', 'mean(group2)', 'Log2FC', 'std_error', 'tail', 't-statistics', 'posthoc pvalue', 'effsize', 'efftype', 'FC', 'rejected', 'F-statistics', 'p-value', 'correction', '-log10 p-value', and 'method'.
:param bool non_par: if True, normality and variance equality assumptions are checked
and the non-parametric Mann-Whitney U test is used if they are not met
:return: Pandas dataframe with columns 'identifier', 'group1', 'group2',
'mean(group1)', 'mean(group2)', 'Log2FC', 'std_error', 'tail', 't-statistics',
'posthoc pvalue', 'effsize', 'efftype', 'FC', 'rejected', 'F-statistics', 'p-value',
'correction', '-log10 p-value', and 'method'.
Example::
result = run_anova(df, alpha=0.05, drop_cols=["sample",'subject'], subject='subject', group='group', permutations=50)
result = run_anova(df,
alpha=0.05,
drop_cols=["sample",'subject'],
subject='subject',
group='group',
permutations=50
)
"""
res = pd.DataFrame()
if subject is not None and acore.utils.check_is_paired(df, subject, group):
Expand Down Expand Up @@ -180,26 +191,40 @@ def run_ancova(
permutations: int = 0,
correction: str = "fdr_bh",
is_logged: bool = True,
non_pa: bool = False,
non_par: bool = False,
):
"""
Performs statistical test for each protein in a dataset.
Checks what type of data is the input (paired, unpaired or repeated measurements) and performs posthoc tests for multiclass data.
Multiple hypothesis correction uses permutation-based if permutations>0 and Benjamini/Hochberg if permutations=0.
Checks what type of data is the input (paired, unpaired or repeated measurements)
and performs posthoc tests for multiclass data.
Multiple hypothesis correction is permutation-based
if permutations>0 and Benjamini/Hochberg if permutations=0.
:param df: pandas dataframe with samples as rows and protein identifiers and covariates as columns (with additional columns 'group', 'sample' and 'subject').
:param df: pandas dataframe with samples as rows and protein identifiers and
covariates as columns (with additional columns 'group', 'sample' and 'subject').
:param list covariates: list of covariates to include in the model (column in df)
:param str subject: column with subject identifiers
:param str group: column with group identifiers
:param list drop_cols: column labels to be dropped from the dataframe
:param float alpha: error rate for multiple hypothesis correction
:param int permutations: number of permutations used to estimate false discovery rates.
:param bool non_par: if True, normality and variance equality assumptions are checked and non-parametric test Mann Whitney U test if not passed
:return: Pandas dataframe with columns 'identifier', 'group1', 'group2', 'mean(group1)', 'mean(group2)', 'Log2FC', 'std_error', 'tail', 't-statistics', 'posthoc pvalue', 'effsize', 'efftype', 'FC', 'rejected', 'F-statistics', 'p-value', 'correction', '-log10 p-value', and 'method'.
:param bool non_par: if True, normality and variance equality assumptions are checked
and the non-parametric Mann-Whitney U test is used if they are not met
:return: Pandas dataframe with columns 'identifier', 'group1', 'group2',
'mean(group1)', 'mean(group2)', 'Log2FC', 'std_error', 'tail', 't-statistics',
'posthoc pvalue', 'effsize', 'efftype', 'FC', 'rejected', 'F-statistics', 'p-value',
'correction', '-log10 p-value', and 'method'.
Example::
result = run_ancova(df, covariates=['age'], alpha=0.05, drop_cols=["sample",'subject'], subject='subject', group='group', permutations=50)
result = run_ancova(df,
covariates=['age'],
alpha=0.05,
drop_cols=["sample",'subject'],
subject='subject',
group='group',
permutations=50
)
"""
df = df.drop(drop_cols, axis=1)
for cova in covariates:
Expand Down Expand Up @@ -250,7 +275,8 @@ def run_repeated_measurements_anova(
"""
Performs repeated measurements anova and pairwise posthoc tests for each protein in dataframe.
:param df: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
:param df: pandas dataframe with samples as rows and protein identifiers as columns
(with additional columns 'group', 'sample' and 'subject').
:param str subject: column with subject identifiers
:param str within: column with within factor identifiers
:param list drop_cols: column labels to be dropped from the dataframe
Expand All @@ -260,7 +286,13 @@ def run_repeated_measurements_anova(
Example::
result = run_repeated_measurements_anova(df, alpha=0.05, drop_cols=['sample'], subject='subject', within='group', permutations=50)
result = run_repeated_measurements_anova(df,
alpha=0.05,
drop_cols=['sample'],
subject='subject',
within='group',
permutations=50
)
"""
df = df.drop(drop_cols, axis=1).dropna(axis=1)
aov_results = []
Expand Down Expand Up @@ -309,13 +341,19 @@ def run_mixed_anova(
correction="fdr_bh",
):
"""
In statistics, a mixed-design analysis of variance model, also known as a split-plot ANOVA, is used to test
for differences between two or more independent groups whilst subjecting participants to repeated measures.
Thus, in a mixed-design ANOVA model, one factor (a fixed effects factor) is a between-subjects variable and the other
(a random effects factor) is a within-subjects variable. Thus, overall, the model is a type of mixed-effects model.
[source:https://en.wikipedia.org/wiki/Mixed-design_analysis_of_variance]
:param df: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
In statistics, a mixed-design analysis of variance model, also known as a split-plot
ANOVA, is used to test
for differences between two or more independent groups whilst subjecting participants
to repeated measures.
Thus, in a mixed-design ANOVA model, one factor (a fixed effects factor) is a
between-subjects variable and the other
(a random effects factor) is a within-subjects variable. Thus, overall, the model is a
type of mixed-effects model (source_)
.. _source: https://en.wikipedia.org/wiki/Mixed-design_analysis_of_variance
:param df: pandas dataframe with samples as rows and protein identifiers as columns
(with additional columns 'group', 'sample' and 'subject').
:param str subject: column with subject identifiers
:param str within: column with within factor identifiers
:param str between: column with between factor identifiers
Expand All @@ -326,7 +364,14 @@ def run_mixed_anova(
Example::
result = run_mixed_anova(df, alpha=0.05, drop_cols=['sample'], subject='subject', within='group', between='group2', permutations=50)
result = run_mixed_anova(df,
alpha=0.05,
drop_cols=['sample'],
subject='subject',
within='group',
between='group2',
permutations=50
)
"""
df = df.drop(drop_cols, axis=1).dropna(axis=1)
aov_results = []
Expand Down Expand Up @@ -370,9 +415,12 @@ def run_ttest(
non_par=False,
):
"""
Runs t-test (paired/unpaired) for each protein in dataset and performs permutation-based (if permutations>0) or Benjamini/Hochberg (if permutations=0) multiple hypothesis correction.
Runs t-test (paired/unpaired) for each protein in dataset and performs
permutation-based (if permutations>0) or Benjamini/Hochberg (if permutations=0)
multiple hypothesis correction.
:param df: pandas dataframe with samples as rows and protein identifiers as columns (with additional columns 'group', 'sample' and 'subject').
:param df: pandas dataframe with samples as rows and protein identifiers as columns
(with additional columns 'group', 'sample' and 'subject').
:param str condition1: first of two conditions of the independent variable
:param str condition2: second of two conditions of the independent variable
:param str subject: column with subject identifiers
Expand All @@ -383,12 +431,25 @@ def run_ttest(
:param float alpha: error rate for multiple hypothesis correction
:param int permutations: number of permutations used to estimate false discovery rates.
:param bool is_logged: data is log-transformed
:param bool non_par: if True, normality and variance equality assumptions are checked and non-parametric test Mann Whitney U test if not passed
:return: Pandas dataframe with columns 'identifier', 'group1', 'group2', 'mean(group1)', 'mean(group2)', 'std(group1)', 'std(group2)', 'Log2FC', 'FC', 'rejected', 'T-statistics', 'p-value', 'correction', '-log10 p-value', and 'method'.
:param bool non_par: if True, normality and variance equality assumptions are checked
and the non-parametric Mann-Whitney U test is used if they are not met
:return: Pandas dataframe with columns 'identifier', 'group1', 'group2',
'mean(group1)', 'mean(group2)', 'std(group1)', 'std(group2)', 'Log2FC', 'FC',
'rejected', 'T-statistics', 'p-value', 'correction', '-log10 p-value', and 'method'.
Example::
result = run_ttest(df, condition1='group1', condition2='group2', alpha = 0.05, drop_cols=['sample'], subject='subject', group='group', paired=False, correction='fdr_bh', permutations=50)
result = run_ttest(df,
condition1='group1',
condition2='group2',
alpha = 0.05,
drop_cols=['sample'],
subject='subject',
group='group',
paired=False,
correction='fdr_bh',
permutations=50
)
"""
columns = [
"T-statistics",
Expand Down Expand Up @@ -438,7 +499,7 @@ def run_ttest(
permutations=permutations,
)
scores = scores.join(count)
scores["correction"] = "permutation FDR ({} perm)".format(permutations)
scores["correction"] = f"permutation FDR ({permutations} perm)"
corrected = True

if not corrected:
Expand Down Expand Up @@ -473,15 +534,20 @@ def run_two_way_anova(
"""
Run a 2-way ANOVA when data['secondary_group'] is not empty
:param df: processed pandas dataframe with samples as rows, and proteins and groups as columns.
:param df: processed pandas dataframe with samples as rows,
and proteins and groups as columns.
:param list drop_cols: column names to drop from dataframe
:param str subject: column name containing subject identifiers.
:param list group: column names corresponding to independent variable groups
:return: Two dataframes, anova results and residuals.
Example::
result = run_two_way_anova(data, drop_cols=['sample'], subject='subject', group=['group', 'secondary_group'])
result = run_two_way_anova(data,
drop_cols=['sample'],
subject='subject',
group=['group', 'secondary_group']
)
"""
data = df.copy()
factorA, factorB = group
Expand Down
Loading

0 comments on commit 6814f09

Please sign in to comment.