From 2ac6df7ca044d50bc0d2584adc37e6e0f2135988 Mon Sep 17 00:00:00 2001 From: Will Dumm Date: Mon, 8 Apr 2024 11:46:37 -0700 Subject: [PATCH] update docs, format, and lint --- docs/quickstart.rst | 4 ++-- gctree/branching_processes.py | 18 ++++++++++-------- gctree/cli.py | 4 ++-- gctree/isotype.py | 2 +- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 5fe28b24..fdf1fcdc 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -106,7 +106,7 @@ This file may be manipulated using ``gctree infer``, instead of providing a dnapars ``outfile``. .. note:: - Although described below, using mutability parsimony or isotype parsimony + Although described below, using context likelihood, mutability parsimony, or isotype parsimony as ranking criteria is experimental, and has not yet been shown in a careful validation to improve tree inference. Only the default branching process likelihood is recommended for tree ranking! @@ -117,7 +117,7 @@ between trees. Providing arguments ``--isotype_mapfile`` and arguments ``--mutability`` and ``--substitution`` allows trees to be ranked according to a context-sensitive mutation model. By default, trees are ranked lexicographically, first maximizing likelihood, then minimizing isotype -parsimony and mutabilities, if such information is provided. +parsimony, and finally maximizing a context-based Poisson likelihood, if such information is provided. Ranking priorities can be adjusted using the argument ``--ranking_coeffs``. 
For example, to find the optimal tree diff --git a/gctree/branching_processes.py b/gctree/branching_processes.py index cb6fbca7..17b9d43e 100755 --- a/gctree/branching_processes.py +++ b/gctree/branching_processes.py @@ -411,7 +411,7 @@ def mle(self, **kwargs) -> Tuple[np.float64, np.float64]: (p, q) = \arg\max_{p,q\in [0,1]}\ell(p, q) Args: - kwargs: keyword arguments passed along to the log likelihood :meth:`CollapsedTree.ll` + kwargs: keyword arguments passed along to the branching process likelihood :meth:`CollapsedTree.ll` Returns: Tuple :math:`(p, q)` with estimated branching probability and estimated mutation probability @@ -1052,7 +1052,7 @@ def ll( marginal: compute the marginal likelihood over trees, otherwise compute the joint likelihood of trees Returns: - Log likelihood :math:`\ell(p, q; T, A)` and its gradient :math:`\nabla\ell(p, q; T, A)` + Log branching process likelihood :math:`\ell(p, q; T, A)` and its gradient :math:`\nabla\ell(p, q; T, A)` """ if self._cm_countlist is None: if self._forest is not None: @@ -1124,7 +1124,7 @@ def mle(self, **kwargs) -> Tuple[np.float64, np.float64]: (p, q) = \arg\max_{p,q\in [0,1]}\ell(p, q) Args: - kwargs: keyword arguments passed along to the log likelihood :meth:`CollapsedForest.ll` + kwargs: keyword arguments passed along to the branching process likelihood :meth:`CollapsedForest.ll` Returns: Tuple :math:`(p, q)` with estimated branching probability and estimated mutation probability @@ -1151,7 +1151,7 @@ def filter_trees( # noqa: C901 Trim the forest to minimize a linear combination of branching process likelihood, isotype parsimony score, - mutability parsimony score, and number of alleles, with coefficients + context/mutability-based Poisson likelihood, and number of alleles, with coefficients provided in the argument ``ranking_coeffs`, in that order. Args: @@ -1169,7 +1169,7 @@ def filter_trees( # noqa: C901 ignore_isotype: Ignore isotype parsimony when ranking. 
By default, isotype information added with :meth:``add_isotypes`` will be used to compute isotype parsimony, which is used in ranking. chain_split: The index at which non-adjacent sequences are concatenated, for calculating - mutability parsimony. + context-based Poisson likelihood. verbose: print information about trimming outbase: file name stem for a file with information for each tree in the DAG. summarize_forest: whether to write a summary of the forest to file `[outbase].forest_summary.log` @@ -1182,7 +1182,8 @@ def filter_trees( # noqa: C901 Returns: The trimmed forest, containing all optimal trees according to the specified criteria, and a tuple - of data about the trees in that forest, with format (ll, isotype parsimony, mutability parsimony, alleles). + of data about the trees in that forest, with format (branching process likelihood, isotype parsimony, + context-based Poisson likelihood, alleles). """ dag = self._forest @@ -1681,7 +1682,7 @@ def _mle_helper( bounds = ((1e-6, 1 - 1e-6), (1e-6, 1 - 1e-6)) def f(x): - """Negative log likelihood.""" + """Negative log branching process likelihood.""" return tuple(-y for y in ll(*x, **kwargs)) grad_check = sco.check_grad(lambda x: f(x)[0], lambda x: f(x)[1], x_0) @@ -1919,7 +1920,8 @@ def accum_func(cmsetlist: List[multiset.FrozenMultiset]): def _ll_genotype_dagfuncs(p: np.float64, q: np.float64) -> hdag.utils.HistoryDagFilter: - """Return functions for counting tree log likelihood on the history DAG. + """Return functions for counting tree log branching process likelihood on + the history DAG. For numerical consistency, we resort to the use of ``decimal.Decimal``. 
This is exactly for the purpose of solving the problem that float sum is diff --git a/gctree/cli.py b/gctree/cli.py index 9f80a91c..46f0ee72 100644 --- a/gctree/cli.py +++ b/gctree/cli.py @@ -537,7 +537,7 @@ def get_parser(): help=( "when using concatenated heavy and light chains, this is the 0-based" " index at which the 2nd chain begins, needed for determining coding frame in both chains," - " and also to correctly calculate mutability parsimony." + " and also to correctly calculate context-based Poisson likelihood." ), ) parser_infer.add_argument( @@ -632,7 +632,7 @@ def get_parser(): "Coefficients are in order: isotype parsimony, mutation model parsimony, number of alleles. " "A coefficient of -1 will be applied to branching process likelihood. " "If not provided, trees will be ranked lexicographically by likelihood, " - "isotype parsimony, and mutability parsimony in that order." + "isotype parsimony, and context-based Poisson likelihood in that order." ), ) parser_infer.add_argument( diff --git a/gctree/isotype.py b/gctree/isotype.py index 31055ac9..abe9ec39 100644 --- a/gctree/isotype.py +++ b/gctree/isotype.py @@ -46,7 +46,7 @@ def get_parser() -> argparse.ArgumentParser: " nodes.\n\n" "This tool doesn’t make any judgements about which tree is best.\n" "Tree output order is the same as in gctree inference: ranking is\n" - "by log likelihood before isotype additions. A determination of\n" + "by branching process likelihood before isotype additions. A determination of\n" "which is the best tree is left to the user, based on likelihoods,\n" "isotype parsimony score, and changes in the number of nodes after\n" "isotype additions.\n"