Skip to content

Commit 983125e

Browse files
authored
MAINT: hardcoded skbio metrics to temporarily work around NaN to 0 implementation (#63)
1 parent 40510b5 commit 983125e

File tree

4 files changed

+185
-11
lines changed

4 files changed

+185
-11
lines changed

q2_diversity_lib/alpha.py

+107-6
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,16 @@
77
# ----------------------------------------------------------------------------
88

99
import pandas as pd
10+
import numpy as np
11+
import functools
12+
1013
import skbio.diversity
14+
from skbio.diversity._util import _validate_counts_vector
15+
import skbio.diversity.alpha
16+
17+
from scipy.special import gammaln
18+
1119
import biom
12-
import numpy as np
1320

1421
from q2_types.feature_table import BIOMV210Format
1522
from q2_types.sample_data import AlphaDiversityFormat
@@ -91,7 +98,10 @@ def transform_(v, i, m):
9198

9299
results = []
93100
for v in table.iter_data(dense=True):
94-
results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
101+
# using in-house metrics temporarily
102+
# results.append(_skbio_alpha_diversity_from_1d(v, 'pielou_e'))
103+
v = np.reshape(v, (1, len(v)))
104+
results.extend([_p_evenness(c)for c in v])
95105
results = pd.Series(results, index=table.ids(), name='pielou_evenness')
96106
return results
97107

@@ -104,16 +114,107 @@ def shannon_entropy(table: biom.Table,
104114

105115
results = []
106116
for v in table.iter_data(dense=True):
107-
results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
117+
# using in-house metrics temporarily
118+
# results.append(_skbio_alpha_diversity_from_1d(v, 'shannon'))
119+
v = np.reshape(v, (1, len(v)))
120+
results.extend([_shannon(c)for c in v])
108121
results = pd.Series(results, index=table.ids(), name='shannon_entropy')
109122
return results
110123

111124

112125
@_validate_tables
def alpha_passthrough(table: biom.Table, metric: str) -> pd.Series:
    """Compute a non-phylogenetic alpha diversity metric for every sample.

    Parameters
    ----------
    table : biom.Table
        Feature table; one value is computed per vector yielded by
        ``table.iter_data(dense=True)``.
    metric : str
        Name of the metric. Names present in the local ``method_map`` are
        computed with the in-module implementations; any other name is
        forwarded to ``_skbio_alpha_diversity_from_1d`` after the counts are
        cast to ``int``.

    Returns
    -------
    pd.Series
        Per-sample results indexed by ``table.ids()`` and named after the
        requested ``metric`` string.
    """
    # Metrics temporarily served by the in-module implementations instead of
    # being dispatched through skbio.
    method_map = {"berger_parker_d": _berger_parker,
                  "brillouin_d": _brillouin_d,
                  "simpson": _simpsons_dominance,
                  "esty_ci": _esty_ci,
                  "goods_coverage": _goods_coverage,
                  "margalef": _margalef,
                  "mcintosh_d": _mcintosh_d,
                  "strong": _strong}

    # Keep the callable in its own variable: rebinding ``metric`` would leak
    # the function object into the Series name below.
    metric_fn = method_map.get(metric)

    if metric_fn is not None:
        # Each dense vector is handed to the metric directly; wrapping it in
        # a one-row 2-D array only to iterate that single row is a no-op.
        results = [metric_fn(v) for v in table.iter_data(dense=True)]
    else:
        results = [_skbio_alpha_diversity_from_1d(v.astype(int), metric)
                   for v in table.iter_data(dense=True)]

    return pd.Series(results, index=table.ids(), name=metric)
148+
149+
150+
# Metric implementations copied and pasted from skbio (scikit-bio)
151+
def _berger_parker(counts):
    """Return the largest count divided by the sum of all counts.

    ``counts`` is first passed through skbio's ``_validate_counts_vector``.
    """
    validated = _validate_counts_vector(counts)
    largest = validated.max()
    total = validated.sum()
    return largest / total
154+
155+
156+
def _brillouin_d(counts):
    """Return ``(gammaln(N + 1) - sum(gammaln(n_i + 1))) / N``.

    Only the nonzero entries ``n_i`` of the validated counts are used, and
    ``N`` is their sum.
    """
    validated = _validate_counts_vector(counts)
    present = validated[validated.nonzero()]
    total = present.sum()
    log_total_term = gammaln(total + 1)
    log_each_term = gammaln(present + 1).sum()
    return (log_total_term - log_each_term) / total
161+
162+
163+
def _simpsons_dominance(counts):
    """Return one minus ``skbio.diversity.alpha.dominance`` of the counts.

    ``counts`` is first passed through skbio's ``_validate_counts_vector``.
    """
    validated = _validate_counts_vector(counts)
    dominance = skbio.diversity.alpha.dominance(validated)
    return 1 - dominance
166+
167+
168+
def _esty_ci(counts):
    """Return the ``(lower, upper)`` pair ``f1 / n -/+ z * sqrt(W)``.

    ``f1`` and ``f2`` come from skbio's ``singles`` and ``doubles``, ``n`` is
    the sum of the validated counts, and
    ``W = (f1 * (n - f1) + 2 * n * f2) / n ** 3``.
    """
    validated = _validate_counts_vector(counts)

    f1 = skbio.diversity.alpha.singles(validated)
    f2 = skbio.diversity.alpha.doubles(validated)
    n = validated.sum()
    # NOTE(review): presumably the two-sided 95% normal critical value --
    # confirm against skbio's esty_ci.
    z = 1.959963985
    W = (f1 * (n - f1) + 2 * n * f2) / (n ** 3)

    centre = f1 / n
    half_width = z * np.sqrt(W)
    return centre - half_width, centre + half_width
178+
179+
180+
def _goods_coverage(counts):
    """Return ``1 - f1 / N``.

    ``f1`` is skbio's ``singles`` of the validated counts and ``N`` is their
    sum.
    """
    validated = _validate_counts_vector(counts)
    singleton_count = skbio.diversity.alpha.singles(validated)
    total = validated.sum()
    singleton_fraction = singleton_count / total
    return 1 - singleton_fraction
185+
186+
187+
def _margalef(counts):
    """Return ``(sobs - 1) / ln(N)``.

    ``sobs`` is skbio's ``sobs`` of the validated counts and ``N`` is their
    sum.
    """
    validated = _validate_counts_vector(counts)
    # sobs is used here in place of the earlier observed_otus call
    richness = skbio.diversity.alpha.sobs(validated)
    log_total = np.log(validated.sum())
    return (richness - 1) / log_total
191+
192+
193+
def _mcintosh_d(counts):
    """Return ``(N - U) / (N - sqrt(N))``.

    ``U`` is the square root of the sum of squared validated counts and ``N``
    is their sum.
    """
    validated = _validate_counts_vector(counts)
    sum_of_squares = (validated * validated).sum()
    u = np.sqrt(sum_of_squares)
    n = validated.sum()
    numerator = n - u
    denominator = n - np.sqrt(n)
    return numerator / denominator
198+
199+
200+
def _strong(counts):
    """Return the maximum of ``cumsum_i / n - i / s`` over ranks ``i``.

    ``cumsum_i`` is the running total of the validated counts sorted in
    descending order, ``n`` is their sum, ``s`` is skbio's ``sobs`` and ``i``
    runs from 1 to ``len(counts)``.
    """
    validated = _validate_counts_vector(counts)
    total = validated.sum()
    # sobs is used here in place of the earlier observed_otus call
    richness = skbio.diversity.alpha.sobs(validated)
    ranks = np.arange(1, len(validated) + 1)
    descending = np.sort(validated)[::-1]
    running_total = descending.cumsum()
    differences = running_total / total - (ranks / richness)
    return differences.max()
208+
209+
210+
def _p_evenness(counts):
    """Return natural-log Shannon entropy divided by ``ln(sobs)``.

    The entropy comes from ``_shannon(counts, base=np.e)`` and ``sobs`` from
    ``skbio.diversity.alpha.sobs``, both applied to the validated counts.
    """
    validated = _validate_counts_vector(counts)
    entropy = _shannon(validated, base=np.e)
    log_richness = np.log(skbio.diversity.alpha.sobs(counts=validated))
    return entropy / log_richness
214+
215+
216+
def _shannon(counts, base=2):
    """Return ``-sum(p * ln(p)) / ln(base)`` over the nonzero frequencies.

    Frequencies ``p`` are the validated counts divided by their sum; ``base``
    (default 2) sets the logarithm base of the result.
    """
    validated = _validate_counts_vector(counts)
    proportions = validated / validated.sum()
    positive = proportions[proportions.nonzero()]
    entropy_nats = -(positive * np.log(positive)).sum()
    return entropy_nats / np.log(base)

q2_diversity_lib/beta.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
'IMPL': {'braycurtis', 'jaccard'},
3939
'UNIMPL': {'cityblock', 'euclidean', 'seuclidean', 'sqeuclidean',
4040
'cosine', 'correlation', 'hamming', 'chebyshev', 'canberra',
41-
'yule', 'matching', 'dice', 'kulsinski',
41+
'yule', 'matching', 'dice',
4242
'rogerstanimoto', 'russellrao', 'sokalmichener',
4343
'sokalsneath', 'minkowski', 'aitchison', 'canberra_adkins',
4444
'jensenshannon'}

q2_diversity_lib/examples.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,8 @@ def beta_passthrough_n_jobs_example(use):
322322
result, = use.action(
323323
use.UsageAction(plugin_id='diversity_lib',
324324
action_id='beta_passthrough'),
325-
use.UsageInputs(table=ft, metric='kulsinski', n_jobs=1),
326-
use.UsageOutputNames(distance_matrix='kulsinski_dm')
325+
use.UsageInputs(table=ft, metric='euclidean', n_jobs=1),
326+
use.UsageOutputNames(distance_matrix='euclidean_dm')
327327
)
328328
result.assert_output_type('DistanceMatrix')
329329

q2_diversity_lib/tests/test_alpha.py

+75-2
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from subprocess import CalledProcessError
1010

1111
import numpy as np
12+
import numpy.testing as npt
1213
import pandas as pd
1314
import pandas.testing as pdt
1415
import biom
@@ -17,7 +18,12 @@
1718
from qiime2 import Artifact
1819

1920
from ..alpha import (pielou_evenness, observed_features,
20-
shannon_entropy, METRICS)
21+
shannon_entropy, METRICS,
22+
_berger_parker, _brillouin_d,
23+
_simpsons_dominance, _esty_ci,
24+
_goods_coverage, _margalef,
25+
_mcintosh_d, _strong
26+
)
2127

2228

2329
class SmokeTests(TestPluginBase):
@@ -154,7 +160,9 @@ def test_drop_undefined_samples(self):
154160
[0, 0, 0, 1, 0, 1]]),
155161
['A', 'B', 'C'],
156162
['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])
157-
expected = pd.Series({'S5': 1, 'S6': 1}, name='pielou_evenness')
163+
# pandas supports floating point correction for float dtype only,
164+
# these 1 ints were changed to 1.0 floats to get that support
165+
expected = pd.Series({'S5': 1.0, 'S6': 1.0}, name='pielou_evenness')
158166
actual = pielou_evenness(table=NaN_table, drop_undefined_samples=True)
159167
pdt.assert_series_equal(actual, expected, check_dtype=False)
160168

@@ -250,3 +258,68 @@ def test_passed_implemented_metric(self):
250258
for metric in METRICS['NONPHYLO']['IMPL']:
251259
with self.assertRaisesRegex(TypeError, f"{metric}.*incompatible"):
252260
self.method(table=self.crawford_tbl, metric=metric)
261+
262+
# tests for passthrough metrics were sourced from skbio
263+
def test_berger_parker_d(self):
    """Check _berger_parker against two fixed count vectors."""
    cases = (
        (np.array([5, 5]), 0.5),
        (np.array([1, 1, 1, 1, 0]), 0.25),
    )
    for counts, expected in cases:
        self.assertEqual(_berger_parker(counts), expected)
266+
267+
def test_brillouin_d(self):
    """Check _brillouin_d against a fixed reference value."""
    counts = np.array([1, 2, 0, 0, 3, 1])
    observed = _brillouin_d(counts)
    self.assertAlmostEqual(observed, 0.86289353018248782)
270+
271+
def test_esty_ci(self):
    """Check _esty_ci bounds on cumulative counts built one item at a time."""
    def _diversity(indices, f):
        """Apply ``f`` to the running frequency vector after each index.

        indices: vector of indices of taxa
        f: f(counts) -> diversity measure

        """
        n_bins = max(indices) + 1
        running = np.zeros(n_bins, dtype=int)
        values = []
        for taxon in indices:
            # add the single newly seen index to the running frequencies
            running += np.bincount([taxon], minlength=n_bins)
            try:
                value = f(running)
            except (ZeroDivisionError, FloatingPointError):
                value = 0
            values.append(value)
        return np.array(values)

    data = [1, 1, 2, 1, 1, 3, 2, 1, 3, 4]

    bounds = _diversity(data, _esty_ci)
    observed_lower, observed_upper = zip(*bounds)

    expected_lower = np.array([1, -1.38590382, -0.73353593, -0.17434465,
                               -0.15060902, -0.04386191, -0.33042054,
                               -0.29041008, -0.43554755, -0.33385652])
    expected_upper = np.array([1, 1.38590382, 1.40020259, 0.67434465,
                               0.55060902, 0.71052858, 0.61613483,
                               0.54041008, 0.43554755, 0.53385652])

    npt.assert_array_almost_equal(observed_lower, expected_lower)
    npt.assert_array_almost_equal(observed_upper, expected_upper)
304+
305+
def test_simpson(self):
    """Check _simpsons_dominance on a mixed vector and a single-entry one."""
    mixed = np.array([1, 0, 2, 5, 2])
    self.assertAlmostEqual(_simpsons_dominance(mixed), 0.66)

    single = np.array([5])
    self.assertAlmostEqual(_simpsons_dominance(single), 0)
309+
310+
def test_goods_coverage(self):
    """Check _goods_coverage on a list with 75 singleton entries."""
    counts = [1] * 75 + [2] * 6 + [3, 4, 4]
    observed = _goods_coverage(counts)
    self.assertAlmostEqual(observed, 0.23469387755)
314+
315+
def test_margalef(self):
    """Check _margalef against 8 / ln(22) for a fixed vector."""
    counts = np.array([0, 1, 1, 4, 2, 5, 2, 4, 1, 2])
    expected = 8 / np.log(22)
    self.assertEqual(_margalef(counts), expected)
319+
320+
def test_mcintosh_d(self):
    """Check _mcintosh_d against a fixed reference value."""
    counts = np.array([1, 2, 3])
    observed = _mcintosh_d(counts)
    self.assertAlmostEqual(observed, 0.636061424871458)
323+
324+
def test_strong(self):
    """Check _strong against a fixed reference value."""
    counts = np.array([1, 2, 3, 1])
    observed = _strong(counts)
    self.assertAlmostEqual(observed, 0.214285714)

0 commit comments

Comments
 (0)