Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MAINT: hardcoded skbio metrics to temporarily work around NaN to 0 implementation #63

Merged
merged 7 commits into from
May 2, 2024
113 changes: 107 additions & 6 deletions q2_diversity_lib/alpha.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,16 @@
# ----------------------------------------------------------------------------

import pandas as pd
import numpy as np
import functools

import skbio.diversity
from skbio.diversity._util import _validate_counts_vector
import skbio.diversity.alpha

from scipy.special import gammaln

import biom
import numpy as np

from q2_types.feature_table import BIOMV210Format
from q2_types.sample_data import AlphaDiversityFormat
Expand Down Expand Up @@ -91,7 +98,10 @@ def transform_(v, i, m):

results = []
for v in table.iter_data(dense=True):
results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
# using in-house metrics temporarily
# results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
v = np.reshape(v, (1, len(v)))
results.extend([_p_evenness(c)for c in v])
results = pd.Series(results, index=table.ids(), name='pielou_evenness')
return results

Expand All @@ -104,16 +114,107 @@ def shannon_entropy(table: biom.Table,

results = []
for v in table.iter_data(dense=True):
results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
# using in-house metrics temporarily
# results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
v = np.reshape(v, (1, len(v)))
results.extend([_shannon(c)for c in v])
results = pd.Series(results, index=table.ids(), name='shannon_entropy')
return results


@_validate_tables
def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
results = []

for v in table.iter_data(dense=True):
results.append(_skbio_alpha_diversity_from_1d(v.astype(int), metric))
method_map = {"berger_parker_d": _berger_parker,
"brillouin_d": _brillouin_d,
"simpson": _simpsons_dominance,
"esty_ci": _esty_ci,
"goods_coverage": _goods_coverage,
"margalef": _margalef,
"mcintosh_d": _mcintosh_d,
"strong": _strong}

if metric in method_map:
metric = functools.partial(method_map[metric])
for v in table.iter_data(dense=True):
v = np.reshape(v, (1, len(v)))
results.extend([metric(c)for c in v])
else:
for v in table.iter_data(dense=True):
results.append(_skbio_alpha_diversity_from_1d(v.astype(int),
metric))
results = pd.Series(results, index=table.ids(), name=metric)
return results


# c&p methods from skbio
def _berger_parker(counts):
counts = _validate_counts_vector(counts)
return counts.max() / counts.sum()


def _brillouin_d(counts):
counts = _validate_counts_vector(counts)
nz = counts[counts.nonzero()]
n = nz.sum()
return (gammaln(n + 1) - gammaln(nz + 1).sum()) / n


def _simpsons_dominance(counts):
counts = _validate_counts_vector(counts)
return 1 - skbio.diversity.alpha.dominance(counts)


def _esty_ci(counts):
counts = _validate_counts_vector(counts)

f1 = skbio.diversity.alpha.singles(counts)
f2 = skbio.diversity.alpha.doubles(counts)
n = counts.sum()
z = 1.959963985
W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)

return f1 / n - z * np.sqrt(W), f1 / n + z * np.sqrt(W)


def _goods_coverage(counts):
counts = _validate_counts_vector(counts)
f1 = skbio.diversity.alpha.singles(counts)
N = counts.sum()
return 1 - (f1 / N)


def _margalef(counts):
counts = _validate_counts_vector(counts)
# replaced observed_otu call to sobs
return (skbio.diversity.alpha.sobs(counts) - 1) / np.log(counts.sum())


def _mcintosh_d(counts):
counts = _validate_counts_vector(counts)
u = np.sqrt((counts * counts).sum())
n = counts.sum()
return (n - u) / (n - np.sqrt(n))


def _strong(counts):
counts = _validate_counts_vector(counts)
n = counts.sum()
# replaced observed_otu call to sobs
s = skbio.diversity.alpha.sobs(counts)
i = np.arange(1, len(counts) + 1)
sorted_sum = np.sort(counts)[::-1].cumsum()
return (sorted_sum / n - (i / s)).max()


def _p_evenness(counts):
counts = _validate_counts_vector(counts)
return _shannon(counts, base=np.e) / np.log(
skbio.diversity.alpha.sobs(counts=counts))


def _shannon(counts, base=2):
counts = _validate_counts_vector(counts)
freqs = counts / counts.sum()
nonzero_freqs = freqs[freqs.nonzero()]
return -(nonzero_freqs * np.log(nonzero_freqs)).sum() / np.log(base)
2 changes: 1 addition & 1 deletion q2_diversity_lib/beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
'IMPL': {'braycurtis', 'jaccard'},
'UNIMPL': {'cityblock', 'euclidean', 'seuclidean', 'sqeuclidean',
'cosine', 'correlation', 'hamming', 'chebyshev', 'canberra',
'yule', 'matching', 'dice', 'kulsinski',
'yule', 'matching', 'dice',
'rogerstanimoto', 'russellrao', 'sokalmichener',
'sokalsneath', 'minkowski', 'aitchison', 'canberra_adkins',
'jensenshannon'}
Expand Down
4 changes: 2 additions & 2 deletions q2_diversity_lib/examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,8 @@ def beta_passthrough_n_jobs_example(use):
result, = use.action(
use.UsageAction(plugin_id='diversity_lib',
action_id='beta_passthrough'),
use.UsageInputs(table=ft, metric='kulsinski', n_jobs=1),
use.UsageOutputNames(distance_matrix='kulsinski_dm')
use.UsageInputs(table=ft, metric='euclidean', n_jobs=1),
use.UsageOutputNames(distance_matrix='euclidean_dm')
)
result.assert_output_type('DistanceMatrix')

Expand Down
77 changes: 75 additions & 2 deletions q2_diversity_lib/tests/test_alpha.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from subprocess import CalledProcessError

import numpy as np
import numpy.testing as npt
import pandas as pd
import pandas.testing as pdt
import biom
Expand All @@ -17,7 +18,12 @@
from qiime2 import Artifact

from ..alpha import (pielou_evenness, observed_features,
shannon_entropy, METRICS)
shannon_entropy, METRICS,
_berger_parker, _brillouin_d,
_simpsons_dominance, _esty_ci,
_goods_coverage, _margalef,
_mcintosh_d, _strong
)


class SmokeTests(TestPluginBase):
Expand Down Expand Up @@ -154,7 +160,9 @@ def test_drop_undefined_samples(self):
[0, 0, 0, 1, 0, 1]]),
['A', 'B', 'C'],
['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
expected = pd.Series({'S5': 1, 'S6': 1}, name='pielou_evenness')
# pandas supports floating point correction for float dtype only,
# these 1 ints were changed to 1.0 floats to get that support
expected = pd.Series({'S5': 1.0, 'S6': 1.0}, name='pielou_evenness')
actual = pielou_evenness(table=NaN_table, drop_undefined_samples=True)
pdt.assert_series_equal(actual, expected, check_dtype=False)

Expand Down Expand Up @@ -250,3 +258,68 @@ def test_passed_implemented_metric(self):
for metric in METRICS['NONPHYLO']['IMPL']:
with self.assertRaisesRegex(TypeError, f"{metric}.*incompatible"):
self.method(table=self.crawford_tbl, metric=metric)

# tests for passthrough metrics were sourced from skbio
def test_berger_parker_d(self):
self.assertEqual(_berger_parker(np.array([5, 5])), 0.5)
self.assertEqual(_berger_parker(np.array([1, 1, 1, 1, 0])), 0.25)

def test_brillouin_d(self):
self.assertAlmostEqual(_brillouin_d(np.array([1, 2, 0, 0, 3, 1])),
0.86289353018248782)

def test_esty_ci(self):
def _diversity(indices, f):
"""Calculate diversity index for each window of size 1.

indices: vector of indices of taxa
f: f(counts) -> diversity measure

"""
result = []
max_size = max(indices) + 1
freqs = np.zeros(max_size, dtype=int)
for i in range(len(indices)):
freqs += np.bincount(indices[i:i + 1], minlength=max_size)
try:
curr = f(freqs)
except (ZeroDivisionError, FloatingPointError):
curr = 0
result.append(curr)
return np.array(result)

data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4]

observed_lower, observed_upper = zip(*_diversity(data, _esty_ci))

expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465,
-0.15060902, -0.04386191, -0.33042054,
-0.29041008, -0.43554755, -0.33385652])
expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465,
0.55060902, 0.71052858, 0.61613483,
0.54041008, 0.43554755, 0.53385652])

npt.assert_array_almost_equal(observed_lower, expected_lower)
npt.assert_array_almost_equal(observed_upper, expected_upper)

def test_simpson(self):
self.assertAlmostEqual(_simpsons_dominance(np.array([1, 0, 2, 5, 2])),
0.66)
self.assertAlmostEqual(_simpsons_dominance(np.array([5])), 0)

def test_goods_coverage(self):
counts = [1] * 75 + [2, 2, 2, 2, 2, 2, 3, 4, 4]
obs = _goods_coverage(counts)
self.assertAlmostEqual(obs, 0.23469387755)

def test_margalef(self):

self.assertEqual(_margalef(np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])),
8 / np.log(22))

def test_mcintosh_d(self):
self.assertAlmostEqual(_mcintosh_d(np.array([1, 2, 3])),
0.636061424871458)

def test_strong(self):
self.assertAlmostEqual(_strong(np.array([1, 2, 3, 1])), 0.214285714)
Loading