Skip to content

Commit dcbf220

Browse files
authored
Merge pull request #55 from histogrammar/performance_updates
Multiple performance updates, to Bin, SparselyBin and Categorize hists
2 parents 8c82095 + 1e303db commit dcbf220

File tree

9 files changed

+159
-67
lines changed

9 files changed

+159
-67
lines changed

CHANGES.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22
Release notes
33
=============
44

5+
Version 1.0.28, June 2022
6+
-------------------------
7+
* Multiple performance updates to Bin, SparselyBin and Categorize histograms.
8+
* SparselyBin, Categorize: optimized filling with 1-d and 2-d numpy arrays
9+
* Bin, SparselyBin, Categorize: (fast) numpy arrays for bin-centers and bin-labels.
10+
* Count: new, fast filling option when float weight is known.
11+
* util.py: faster get_datatype() and get_ndim() functions.
12+
513
Version 1.0.27, May 2022
614
------------------------
715
* Multiple performance updates, thanks to Simon Brugman.

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling
2020

2121
This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.
2222

23-
Latest Python release: v1.0.27 (May 2022).
23+
Latest Python release: v1.0.28 (June 2022).
2424

2525
Announcements
2626
=============

histogrammar/primitives/bin.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def __init__(self, num, low, high, quantity=identity, value=Count(),
148148
self.values = [None] * num
149149
self.contentType = "Count"
150150
else:
151-
self.values = [value.zero() for i in xrange(num)]
151+
self.values = [value.zero() for i in range(num)]
152152
self.contentType = value.name
153153
self.underflow = underflow.copy()
154154
self.overflow = overflow.copy()
@@ -955,6 +955,11 @@ def __hash__(self):
955955
return hash((self.low, self.high, self.quantity, self.entries, tuple(
956956
self.values), self.underflow, self.overflow, self.nanflow))
957957

958+
@property
959+
def size(self):
960+
"""Get number of bins, consistent with SparselyBin and Categorize """
961+
return self.num
962+
958963
@property
959964
def n_bins(self):
960965
"""Get number of bins, consistent with SparselyBin and Categorize """
@@ -1107,7 +1112,8 @@ def bin_centers(self, low=None, high=None):
11071112
import numpy as np
11081113
# trivial case
11091114
if low is None and high is None:
1110-
return np.array([sum(self.range(i)) / 2.0 for i in self.indexes])
1115+
bw = self.bin_width()
1116+
return np.arange(self.low + bw / 2., self.high + bw / 2., bw)
11111117
# catch weird cases
11121118
elif low is not None and high is not None:
11131119
if low > high:
@@ -1131,7 +1137,7 @@ def bin_centers(self, low=None, high=None):
11311137
if np.isclose(high, self.low + self.bin_width() * maxBin):
11321138
maxBin -= 1
11331139

1134-
return np.array([sum(self.range(i)) / 2.0 for i in range(minBin, maxBin + 1)])
1140+
return self.low + (np.linspace(minBin, maxBin, maxBin - minBin + 1) + 0.5) * self.bin_width()
11351141

11361142
def _center_from_key(self, idx):
11371143
xc = (idx + 0.5) * self.bin_width() + self.low

histogrammar/primitives/categorize.py

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -283,29 +283,51 @@ def _numpy(self, data, weights, shape):
283283
if isinstance(q, (list, tuple)):
284284
q = np.array(q)
285285
self._checkNPQuantity(q, shape)
286+
287+
if isinstance(weights, (float, int)) and weights == 1:
288+
all_weights_one = True
289+
elif isinstance(weights, np.ndarray) and np.all(weights == 1):
290+
all_weights_one = True
291+
else:
292+
all_weights_one = False
286293
self._checkNPWeights(weights, shape)
287294
weights = self._makeNPWeights(weights, shape)
288295
newentries = weights.sum()
289296

290297
subweights = weights.copy()
291298
subweights[weights < 0.0] = 0.0
292299

293-
selection = np.empty(q.shape, dtype=np.bool)
294-
uniques, inverse = np.unique(q, return_inverse=True)
295-
296-
# no possibility of exception from here on out (for rollback)
297-
for i, x in enumerate(uniques):
298-
if isinstance(x, (basestring, bool)):
299-
pass
300-
elif x is None or np.isnan(x):
301-
x = 'NaN'
302-
if x not in self.bins:
303-
self.bins[x] = self.value.zero()
300+
if self.n_dim == 1 and all_weights_one and isinstance(self.value, Count):
301+
# special case of filling single array where all weights are 1
302+
uniques, counts = np.unique(q, return_counts=True)
303+
304+
for c, x in zip(counts, uniques):
305+
if isinstance(x, (basestring, bool)):
306+
pass
307+
elif x is None or np.isnan(x):
308+
x = 'NaN'
309+
if x not in self.bins:
310+
self.bins[x] = self.value.zero()
311+
self.bins[x]._numpy(None, c, [None])
312+
else:
313+
# all other cases ...
314+
selection = np.empty(q.shape, dtype=np.bool)
315+
uniques, inverse = np.unique(q, return_inverse=True)
304316

305-
np.not_equal(inverse, i, selection)
306-
subweights[:] = weights
307-
subweights[selection] = 0.0
308-
self.bins[x]._numpy(data, subweights, shape)
317+
# no possibility of exception from here on out (for rollback)
318+
for i, x in enumerate(uniques):
319+
if isinstance(x, (basestring, bool)):
320+
pass
321+
elif x is None or np.isnan(x):
322+
x = 'NaN'
323+
if x not in self.bins:
324+
self.bins[x] = self.value.zero()
325+
326+
# passing on the full array seems faster for one- AND multi-dim histograms
327+
np.not_equal(inverse, i, selection)
328+
subweights[:] = weights
329+
subweights[selection] = 0.0
330+
self.bins[x]._numpy(data, subweights, shape)
309331

310332
self.entries += float(newentries)
311333

@@ -430,12 +452,14 @@ def bin_labels(self, max_length=-1):
430452
"""
431453
Returns bin labels
432454
433-
:param int max_length: maximum length of a label. Default if full length.
455+
:param int max_length: maximum length of a label. Default is full length.
434456
:returns: array of labels
435457
:rtype: numpy.array
436458
"""
437-
labels = []
459+
if max_length == -1:
460+
return np.array(list(self.bins.keys()))
438461

462+
labels = []
439463
for i, key in enumerate(self.bins.keys()):
440464
try:
441465
label = str(key)
@@ -444,7 +468,7 @@ def bin_labels(self, max_length=-1):
444468
except BaseException:
445469
label = 'bin_%d' % i
446470
labels.append(label)
447-
return np.asarray(labels)
471+
return np.array(labels)
448472

449473
def bin_centers(self, max_length=-1):
450474
"""

histogrammar/primitives/count.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,12 @@ def _numpy(self, _, weights, shape):
232232
assert t.shape[0] == 1
233233
self.entries += float(t[0])
234234

235+
elif isinstance(weights, (int, float, numpy.number)):
236+
if self.transform is identity:
237+
self.entries += float(weights)
238+
else:
239+
self.entries += self.transform(weights)
240+
235241
else:
236242
raise ValueError("cannot use Numpy to fill an isolated Count (unless the weights are given as an array)")
237243

histogrammar/primitives/sparselybin.py

Lines changed: 84 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
import numpy as np
1718
import math
1819
import numbers
1920

@@ -259,7 +260,32 @@ def at(self, index):
259260
@property
260261
def indexes(self):
261262
"""Get a sequence of filled indexes."""
262-
return sorted(self.keys)
263+
return sorted(self.bins.keys())
264+
265+
@property
266+
def binsMap(self):
267+
"""Input ``bins`` as a key-value map."""
268+
return self.bins
269+
270+
@property
271+
def size(self):
272+
"""Number of ``bins``."""
273+
return len(self.bins)
274+
275+
@property
276+
def keys(self):
277+
"""Iterable over the keys of the ``bins``."""
278+
return self.bins.keys()
279+
280+
@property
281+
def values(self):
282+
"""Iterable over the values of the ``bins``."""
283+
return list(self.bins.values())
284+
285+
@property
286+
def keySet(self):
287+
"""Set of keys among the ``bins``."""
288+
return set(self.bins.keys())
263289

264290
def range(self, index):
265291
"""Get the low and high edge of a bin (given by index number)."""
@@ -432,48 +458,76 @@ def _c99StructName(self):
432458
def _numpy(self, data, weights, shape):
433459
q = self.quantity(data)
434460
self._checkNPQuantity(q, shape)
461+
462+
if isinstance(weights, (float, int)) and weights == 1:
463+
all_weights_one = True
464+
elif isinstance(weights, np.ndarray) and np.all(weights == 1):
465+
all_weights_one = True
466+
else:
467+
all_weights_one = False
435468
self._checkNPWeights(weights, shape)
436469
weights = self._makeNPWeights(weights, shape)
437470
newentries = weights.sum()
438471

439-
import numpy
440-
441-
selection = numpy.isnan(q)
442-
numpy.bitwise_not(selection, selection)
472+
selection = np.isnan(q)
473+
np.bitwise_not(selection, selection) # invert selection
443474
subweights = weights.copy()
444475
subweights[selection] = 0.0
445476
self.nanflow._numpy(data, subweights, shape)
477+
subweights[:] = weights
446478

447479
# switch to float here like in bin.py else numpy throws
448480
# TypeError on trivial integer cases such as:
449-
# >>> q = numpy.array([1,2,3,4])
481+
# >>> q = np.array([1,2,3,4])
450482
# >>> np.divide(q,1,q)
451483
# >>> np.floor(q,q)
452-
q = numpy.array(q, dtype=numpy.float64)
453-
neginfs = numpy.isneginf(q)
454-
posinfs = numpy.isposinf(q)
455-
456-
numpy.subtract(q, self.origin, q)
457-
numpy.divide(q, self.binWidth, q)
458-
numpy.floor(q, q)
459-
q = numpy.array(q, dtype=numpy.int64)
484+
q = np.array(q, dtype=np.float64)
485+
neginfs = np.isneginf(q)
486+
posinfs = np.isposinf(q)
487+
488+
np.subtract(q, self.origin, q)
489+
np.divide(q, self.binWidth, q)
490+
np.floor(q, q)
491+
q = np.array(q, dtype=np.int64)
460492
q[neginfs] = LONG_MINUSINF
461493
q[posinfs] = LONG_PLUSINF
462494

463495
selected = q[weights > 0.0]
464496

465-
selection = numpy.empty(q.shape, dtype=numpy.bool)
466-
for index in numpy.unique(selected):
467-
if index != LONG_NAN:
468-
bin = self.bins.get(index)
469-
if bin is None:
470-
bin = self.value.zero()
471-
self.bins[index] = bin
472-
473-
numpy.not_equal(q, index, selection)
474-
subweights[:] = weights
475-
subweights[selection] = 0.0
476-
bin._numpy(data, subweights, shape)
497+
# used below. bit expensive, so do here once
498+
n_dim = self.n_dim
499+
500+
if n_dim == 1 and all_weights_one and isinstance(self.value, Count):
501+
# special case: filling single array where all weights are 1
502+
# (use fast np.unique that returns counts)
503+
uniques, counts = np.unique(selected, return_counts=True)
504+
for c, index in zip(counts, uniques):
505+
if index != LONG_NAN:
506+
bin = self.bins.get(index)
507+
if bin is None:
508+
bin = self.value.zero()
509+
self.bins[index] = bin
510+
# pass counts directly to Count object
511+
self.bins[index]._numpy(None, c, [None])
512+
else:
513+
# all other cases ...
514+
selection = np.empty(q.shape, dtype=np.bool)
515+
for index in np.unique(selected):
516+
if index != LONG_NAN:
517+
bin = self.bins.get(index)
518+
if bin is None:
519+
bin = self.value.zero()
520+
self.bins[index] = bin
521+
if n_dim == 1:
522+
# passing on the full array is faster for one-dim histograms
523+
np.not_equal(q, index, selection)
524+
subweights[:] = weights
525+
subweights[selection] = 0.0
526+
self.bins[index]._numpy(data, subweights, shape)
527+
else:
528+
# in practice passing on sliced arrays is faster for multi-dim histograms
529+
np.equal(q, index, selection)
530+
self.bins[index]._numpy(data[selection], subweights[selection], [np.sum(selection)])
477531

478532
# no possibility of exception from here on out (for rollback)
479533
self.entries += float(newentries)
@@ -615,12 +669,12 @@ def __hash__(self):
615669

616670
@property
617671
def n_bins(self):
618-
"""Get number of bins, consistent with SparselyBin and Categorize """
619-
return self.size
672+
"""Get number of filled bins, consistent with SparselyBin and Categorize """
673+
return len(self.bins)
620674

621675
def num_bins(self, low=None, high=None):
622676
"""
623-
Returns number of bins
677+
Returns number of bins from low to high, including unfilled
624678
625679
Possible to set range with low and high params
626680
@@ -629,7 +683,6 @@ def num_bins(self, low=None, high=None):
629683
:returns: number of bins in range
630684
:rtype: int
631685
"""
632-
import numpy as np
633686
# sparse hist not filled
634687
if self.minBin is None or self.maxBin is None:
635688
return 0
@@ -672,7 +725,6 @@ def bin_edges(self, low=None, high=None):
672725
:returns: numpy array with bin edges for selected range
673726
:rtype: numpy.array
674727
"""
675-
import numpy as np
676728
# sparse hist not filled
677729
if self.minBin is None or self.maxBin is None:
678730
return np.array([self.origin, self.origin + 1])
@@ -715,7 +767,6 @@ def bin_entries(self, low=None, high=None, xvalues=[]):
715767
:returns: numpy array with numbers of entries for selected bins
716768
:rtype: numpy.array
717769
"""
718-
import numpy as np
719770
# sparse hist not filled
720771
if self.minBin is None or self.maxBin is None:
721772
return np.array([])
@@ -757,10 +808,8 @@ def bin_centers(self, low=None, high=None):
757808
:returns: numpy array with bin centers for selected range
758809
:rtype: numpy.array
759810
"""
760-
import numpy as np
761811
bin_edges = self.bin_edges(low, high)
762-
centers = [(bin_edges[i] + bin_edges[i + 1]) / 2. for i in range(len(bin_edges) - 1)]
763-
return np.array(centers)
812+
return (bin_edges[:-1] + bin_edges[1:]) / 2
764813

765814
@property
766815
def mpv(self):

0 commit comments

Comments
 (0)