forked from histogrammar/histogrammar-python
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil.py
754 lines (606 loc) · 26.5 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
#!/usr/bin/env python
# Copyright 2016 DIANA-HEP
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import marshal
import math
import types
import sys
import histogrammar.pycparser.c_ast
# Python 2/3 compatibility: on Python 3 the Python-2-only builtin names used
# throughout this module are bound to their Python 3 equivalents.
if sys.version_info[0] >= 3:
    basestring, xrange, long = str, range, int
def inheritdoc(cls):
    """Build a decorator that copies a docstring from ``cls``.

    The returned decorator looks up an attribute with the decorated function's
    name directly in ``cls.__dict__`` (inherited attributes are not searched).
    When found, the function's ``__doc__`` is overwritten with that attribute's
    ``__doc__``; otherwise the function is returned unchanged.

    :param cls: class whose own attributes supply the docstrings
    :return: decorator taking and returning the function
    """
    def _fn(fn):
        try:
            documented = cls.__dict__[fn.__name__]
        except KeyError:
            # nothing of that name defined directly on cls: keep fn's own doc
            return fn
        fn.__doc__ = documented.__doc__
        return fn

    return _fn
# attach sub-methods to the fill and plot methods
class FillMethod(object):
    """Callable wrapper around a container's fill function.

    Calling the instance forwards to ``fill``; the backend-specific fill
    variants of the container (``fillroot``, ``fillpycuda``, ``fillnumpy``,
    ``fillsparksql``) are exposed as the attributes ``root``, ``pycuda``,
    ``numpy`` and ``sparksql``.
    """

    def __init__(self, container, fill):
        """Store ``container`` and ``fill`` and copy the backend fill variants.

        :param container: object providing the ``fill<backend>`` attributes
        :param fill: callable invoked when the instance is called
        :raises AttributeError: if the container lacks one of the variants
        """
        self.container = container
        self.fill = fill
        for backend in ("root", "pycuda", "numpy", "sparksql"):
            setattr(self, backend, getattr(container, "fill" + backend))

    def __call__(self, *args, **kwds):
        """Forward all arguments to the wrapped ``fill`` callable."""
        return self.fill(*args, **kwds)
class PlotMethod(object):
    """Callable wrapper around a container's plot function.

    Calling the instance forwards to ``plot``. Each backend-specific plot
    variant the container provides (``plotroot``, ``plotbokeh``,
    ``plotmatplotlib``) is exposed as ``root``, ``bokeh`` or ``matplotlib``;
    variants the container does not provide are simply left unset.
    """

    def __init__(self, container, plot):
        """Store ``container`` and ``plot`` and copy the available plot variants.

        :param container: object that may provide ``plot<backend>`` attributes
        :param plot: callable invoked when the instance is called
        """
        self.container = container
        self.plot = plot
        for backend in ("root", "bokeh", "matplotlib"):
            try:
                setattr(self, backend, getattr(container, "plot" + backend))
            except (AttributeError, KeyError):
                # this backend is not available on the container: skip it
                pass

    def __call__(self, *args, **kwds):
        """Forward all arguments to the wrapped ``plot`` callable."""
        return self.plot(*args, **kwds)
# handling key set comparisons with optional keys
def hasKeys(test, required, optional=set()):
    """Check that a dict from JSON has exactly the right keys.

    :param test: keys that are present (any iterable; converted to a set)
    :param required: keys that must all be present
    :param optional: keys that may additionally be present
    :return: True if every required key is present and no key outside
        ``required`` and ``optional`` appears
    """
    present, needed, allowed = (
        keys if isinstance(keys, set) else set(keys)
        for keys in (test, required, optional)
    )
    return needed <= present and present <= (needed | allowed)
def maybeAdd(json, **pairs):
    """Add key-value pairs to a dict for JSON, skipping values that are None.

    :param json: the base mapping
    :param pairs: candidate entries; those whose value is ``None`` are dropped
    :return: ``json`` itself when no pairs are given, otherwise a new dict
        holding the entries of ``json`` plus the non-None pairs
    """
    if not pairs:
        return json
    merged = dict(json)
    merged.update((key, value) for key, value in pairs.items() if value is not None)
    return merged
# inexact floating point and NaN handling
# Module-level tolerances read by numeq; both default to exact comparison.
relativeTolerance = 0.0
absoluteTolerance = 0.0


def numeq(x, y):
    """Introduces a ``===`` operator for all ``Double`` tolerance comparisons.

    Custom equality rules:

      - nan == nan (nans are used by some primitives to indicate missing data).
      - inf == inf and -inf == -inf; an infinite value never equals a finite one
        or the opposite infinity, whatever the tolerances are.
      - if ``histogrammar.util.relativeTolerance`` is greater than zero, numbers may differ by this small ratio.
      - if ``histogrammar.util.absoluteTolerance`` is greater than zero, numbers may differ by this small difference.

    Python's math.isclose algorithm is applied for finite numbers:

      ``abs(x - y) <= max(relativeTolerance * max(abs(x), abs(y)), absoluteTolerance)``

    :param x: first number
    :param y: second number
    :return: True if the two numbers are considered equal
    """
    if math.isnan(x) and math.isnan(y):
        return True
    if math.isinf(x) or math.isinf(y):
        # Infinities must be compared exactly. With a relative tolerance the
        # generic formula would read ``inf <= rel * inf`` and wrongly accept
        # an infinite value as equal to any finite one.
        return x == y
    if relativeTolerance > 0.0 and absoluteTolerance > 0.0:
        return abs(x - y) <= max(relativeTolerance * max(abs(x), abs(y)), absoluteTolerance)
    if relativeTolerance > 0.0:
        return abs(x - y) <= relativeTolerance * max(abs(x), abs(y))
    if absoluteTolerance > 0.0:
        return abs(x - y) <= absoluteTolerance
    return x == y
def minplus(x, y):
    """Rule for finding the minimum of two numbers.

    Follows the Histogrammar convention that the minimum of no data is nan:
    a nan operand is ignored in favour of the other one, and the result is
    nan only when both operands are nan.

    :param x: first number
    :param y: second number
    :return: the smaller non-nan operand, or nan if both are nan
    """
    x_missing = math.isnan(x)
    y_missing = math.isnan(y)
    if x_missing:
        return float("nan") if y_missing else y
    if y_missing:
        return x
    return x if x < y else y
def maxplus(x, y):
    """Rule for finding the maximum of two numbers.

    Follows the Histogrammar convention that the maximum of no data is nan:
    a nan operand is ignored in favour of the other one, and the result is
    nan only when both operands are nan.

    :param x: first number
    :param y: second number
    :return: the larger non-nan operand, or nan if both are nan
    """
    x_missing = math.isnan(x)
    y_missing = math.isnan(y)
    if x_missing:
        return float("nan") if y_missing else y
    if y_missing:
        return x
    return x if x > y else y
def floatOrNan(x):
    """Convert ``x`` to a float, substituting the string ``"nan"`` for NaN.

    NaN is not a good key in a hash map because it isn't equal to itself.
    histogrammar.primitives.Bag therefore uses the string ``"nan"`` as a
    substitute. This converts to the right JSON string representation.

    :param x: anything accepted by ``float()``
    :return: ``"nan"`` if the converted value is NaN, otherwise the float
    """
    number = float(x)
    return "nan" if math.isnan(number) else number
def floatToJson(x):
    """Convert non-finite numbers to the quoted strings ``"inf"``, ``"-inf"``, and ``"nan"``.

    This avoids Python's bad habit of putting literal ``Infinity``,
    ``-Infinity``, and ``NaN`` in the JSON (without quotes).

    :param x: a number, or one of the strings ``"nan"``, ``"inf"``, ``"-inf"``
    :return: the matching string for non-finite input (strings already in
        that form pass through), otherwise ``x`` unchanged
    """
    if x in ("nan", "inf", "-inf"):
        # already in its JSON string form
        return x
    if math.isnan(x):
        return "nan"
    if math.isinf(x):
        return "inf" if x > 0.0 else "-inf"
    return x
def floatToC99(x):
    """Render a number as text, using C99 macro names for non-finite values.

    :param x: the number to render
    :return: ``"NAN"``, ``"INFINITY"`` or ``"-INFINITY"`` for non-finite
        input, otherwise ``str(x)``
    """
    if math.isnan(x):
        return "NAN"
    if math.isinf(x):
        return "INFINITY" if x > 0.0 else "-INFINITY"
    return str(x)
def rangeToJson(x):
    """Convert numbers, one-dimensional vectors of numbers, and strings to JSON.

    Non-finite numbers become ``"inf"``, ``"-inf"``, and ``"nan"``, which
    avoids Python's bad habit of putting literal ``Infinity``, ``-Infinity``,
    and ``NaN`` in the JSON (without quotes).

    :param x: a string, a list/tuple of numbers, or a single number
    :return: the string unchanged, a list of converted elements, or the
        converted number
    """
    if isinstance(x, basestring):
        return x
    if isinstance(x, (list, tuple)):
        return list(map(floatToJson, x))
    return floatToJson(x)
# function tools
class UserFcn(object):
    """Base trait for user functions.

    All functions passed to Histogrammar primitives get wrapped as UserFcn objects.
    Functions (instances of ``types.FunctionType``, not any callable) are used as-is and strings are deferred for
    later evaluation. If a string-based UserFcn is used in a normal ``fill`` operation, it gets compiled (once) as
    a Python function of the input structure's fields or a single-argument function for unstructured data.

    The string need not be interpreted this way: backends targeting JIT compilation can interpret the strings as C
    code; backends targeting GPUs and FPGAs can interpret them as CUDA/OpenCL or pin-out names. As usual with
    Histogrammar, the only platform-specific part is the user functions.

    UserFcns have a ``name`` parameter that may not be set. The user would ordinarily use the histogrammar.util.named
    function to give a function a name. Similarly, histogrammar.util.cached adds caching. (Naming and caching
    commute: they can be applied in either order.)

    UserFcns are also 100% serializable, so that Histogrammar trees can be pickled and they can be passed through
    PySpark.

    Note that the histogrammar.util.serializable function creates a UserFcn, avoids duplication, and commutes with
    histogrammar.util.named and histogrammar.util.cached (they can be applied in any order).
    """

    def __init__(self, expr, name=None):
        """Wrap ``expr`` and determine its name.

        :param expr: ``None``, an expression string, a ``types.FunctionType`` or a SparkSQL ``Column``
        :param name: optional name; when omitted, a string expression names itself and a non-lambda function
            uses its ``__name__``
        :raises TypeError: if ``expr`` has an unsupported type or ``name`` is not a string
        """
        self.expr = expr

        if isinstance(expr, basestring) and name is None:
            self.name = expr
        elif isinstance(expr, types.FunctionType) and expr.__name__ != "<lambda>" and name is None:
            self.name = expr.__name__
        else:
            self.name = name

        # Start from "rejected" so that every path below leaves ``ok`` bound; an unsupported expr must end in
        # the TypeError below even when pyspark is importable and expr is not a Column.
        ok = False
        if expr is None or isinstance(expr, (basestring, types.FunctionType)):
            ok = True
        else:
            try:
                from pyspark.sql.column import Column
            except ImportError:
                pass
            else:
                if isinstance(expr, Column):
                    if self.name is None:
                        # drop the first 7 and the last character of str(expr)
                        # presumably the "Column<...>" wrapper of its repr; confirm against pyspark
                        self.name = str(expr)[7:-1]
                    ok = True

        if not ok:
            raise TypeError("quantity ({0}) must be a string, function, or SparkSQL Column".format(expr))

        if name is not None and not isinstance(name, basestring):
            raise TypeError(
                "function name must be a string, not {0} (perhaps your arguments are reversed)".format(name))

    def asSparkSQL(self):
        """Return the underlying Java column (``expr._jc``) of a SparkSQL ``Column`` expression.

        :raises TypeError: if the wrapped expression is not a SparkSQL ``Column``
        """
        from pyspark.sql.column import Column

        if isinstance(self.expr, Column):
            return self.expr._jc
        else:
            raise TypeError("UserFcn is not a SparkSQL Column: " + repr(self))

    def __call__(self, *args, **kwds):
        """Evaluate the wrapped function, building the callable on first use.

        A function expression is called directly. A string expression is compiled once and evaluated against a
        namespace built from the datum (see the inline comments).

        NOTE: string expressions are run through ``eval``; only use expression strings from trusted sources.

        :raises TypeError: if the expression is ``None``, a SparkSQL ``Column`` or of an unrecognized type
        """
        if not hasattr(self, "fcn"):
            if isinstance(self.expr, types.FunctionType):
                self.fcn = self.expr

            elif isinstance(self.expr, basestring):
                c = compile(self.expr, "<string>", "eval")

                # close over this state
                varname = [None]

                try:
                    import numpy
                except ImportError:
                    numpy = None

                try:
                    import pandas
                except ImportError:
                    pandas = None

                # Resolve the record-array class once instead of on every datum; it lives in numpy._core
                # for numpy major versions above 1 and in numpy.core otherwise.
                recarray = None
                if numpy is not None:
                    major = int(numpy.__version__.split('.')[0])
                    npcore = numpy._core if major > 1 else numpy.core
                    recarray = npcore.records.recarray

                def function(datum):
                    context = dict(globals())
                    # fill the namespace with math.* functions
                    context.update(math.__dict__)

                    # if you have Numpy, include numpy.* functions
                    if numpy is not None:
                        context["numpy"] = numpy
                        context["np"] = numpy

                    # if the datum is a dict, override the namespace with its dict keys
                    if isinstance(datum, dict):
                        context.update(datum)

                    # if the datum is a Numpy record array, override the namespace with its field names
                    elif numpy is not None and isinstance(datum, recarray):
                        context.update(dict((n, datum[n]) for n in datum.dtype.names))

                    # if the datum is a Pandas DataFrame, override the namespace with its column names
                    elif pandas is not None and isinstance(datum, pandas.core.frame.DataFrame):
                        context.update(dict((n, datum[n].values) for n in datum.columns))

                    else:
                        try:
                            context.update(datum.__dict__)    # try to use its attributes as variables
                        except AttributeError:
                            v, = varname                      # otherwise, use the one and only variable
                            if v is None:                     # as the object (only discover it once)
                                v = set(c.co_names) - set(context.keys())
                                if len(v) > 1:
                                    raise NameError(
                                        "more than one unrecognized variable names in single-argument "
                                        "function: {0}".format(set(c.co_names) - set(context.keys())))
                                elif len(v) == 0:
                                    v = None
                                else:
                                    v = list(v)[0]
                                    varname[0] = v

                            if v is not None:
                                context.update({v: datum})

                    return eval(c, context)

                self.fcn = function

            elif self.expr is None:
                raise TypeError("immutable container (created from JSON or .ed) cannot be filled")

            else:
                try:
                    from pyspark.sql.column import Column
                except ImportError:
                    pass
                else:
                    if isinstance(self.expr, Column):
                        raise TypeError("cannot use SparkSQL Column with the normal fill method; use fill.sparksql")
                raise TypeError("unrecognized type for function: {0}".format(type(self.expr)))

        return self.fcn(*args, **kwds)

    def __reduce__(self):
        """Pickle support: describe how to rebuild this object.

        String-based (or empty) instances are rebuilt by ``deserializeString``; function-based instances by
        ``deserializeFunction`` from the marshalled code object plus the globals the code refers to.

        :raises TypeError: for any other expression type
        """
        if isinstance(self.expr, basestring) or self.expr is None:
            return (deserializeString, (self.__class__, self.expr, self.name))

        elif isinstance(self.expr, types.FunctionType):
            # only the globals actually named by the code object are carried along
            refs = dict((n, self.expr.__globals__[n])
                        for n in self.expr.__code__.co_names if n in self.expr.__globals__)
            # NOTE(review): __closure__ is passed through as-is; confirm that functions with captured
            # variables survive pickling with the pickler in use.
            return (deserializeFunction, (self.__class__, marshal.dumps(self.expr.__code__), self.expr.__name__,
                                          self.expr.__defaults__, self.expr.__closure__, refs, self.name))

        else:
            raise TypeError("unrecognized type for function: {0}".format(type(self.expr)))

    def __repr__(self):
        return "UserFcn({0}, {1})".format(self.expr, self.name)

    def __eq__(self, other):
        """Two UserFcns are equal if their names match and their expressions match.

        Function expressions are compared by bytecode (``__code__.co_code``), everything else with ``==``.
        Anything that is not a UserFcn compares unequal.
        """
        # Check the type first: ``other.expr`` must not be touched on foreign objects.
        if not isinstance(other, UserFcn):
            return False
        if not (self.name == other.name):
            return False
        if isinstance(self.expr, types.FunctionType) and isinstance(other.expr, types.FunctionType):
            return self.expr.__code__.co_code == other.expr.__code__.co_code
        return self.expr == other.expr

    def __hash__(self):
        # mirrors __eq__: functions hash by bytecode, everything else by the expression itself
        if isinstance(self.expr, types.FunctionType):
            return hash((None, self.expr.__code__.co_code, self.name))
        else:
            return hash((self.expr, self.name))
class CachedFcn(UserFcn):
    """Represents a cached UserFcn.

    Only the most recent call is remembered: calling again with equivalent arguments returns the stored result
    without re-evaluating the function. Arguments are equivalent when they are the same objects or, failing that,
    compare equal (with ``numpy.array_equal`` if Numpy is available, with ``==`` otherwise).

    Note that the histogrammar.util.cached function creates a CachedFcn, avoids duplication, and commutes
    with histogrammar.util.named and histogrammar.util.serializable (they can be applied in any order).

    **Example:**

    ::

        f = cached(lambda x: complexFunction(x))
        f(3.14)   # computes the function
        f(3.14)   # re-uses the old value
        f(4.56)   # computes the function again at a new point
    """

    # Numpy is optional; ``np`` is None when it cannot be imported.
    try:
        import numpy
        np = numpy
    except ImportError:
        np = None

    def __call__(self, *args, **kwds):
        """Return the cached result for a repeated call, otherwise evaluate and remember the new result."""
        np = self.np

        def equivalent(pairs):
            # identity first (cheap); otherwise value equality
            if all(new is old for new, old in pairs):
                return True
            if np is not None:
                return all(np.array_equal(new, old) for new, old in pairs)
            return all(new == old for new, old in pairs)

        if hasattr(self, "lastArgs") and \
           len(args) == len(self.lastArgs) and \
           equivalent(list(zip(args, self.lastArgs))) and \
           set(kwds.keys()) == set(self.lastKwds.keys()) and \
           equivalent([(kwds[k], self.lastKwds[k]) for k in kwds]):
            return self.lastReturn

        # Evaluate before touching the cache: if the function raises, the previous entry stays intact
        # instead of pairing the new arguments with a stale (or missing) result.
        result = super(CachedFcn, self).__call__(*args, **kwds)
        self.lastArgs = args
        self.lastKwds = kwds
        self.lastReturn = result
        return result

    def __repr__(self):
        return "CachedFcn({0}, {1})".format(self.expr, self.name)
def deserializeString(cls, expr, name):
    """Used by Pickle to reconstruct a string-based histogrammar.util.UserFcn from Pickle data.

    The instance is created with ``cls.__new__`` (``__init__`` is not run) and
    its ``expr`` and ``name`` attributes are set directly.

    :param cls: class to instantiate
    :param expr: value for the ``expr`` attribute
    :param name: value for the ``name`` attribute
    :return: the new instance
    """
    restored = cls.__new__(cls)
    restored.expr, restored.name = expr, name
    return restored
def deserializeFunction(cls, __code__, __name__, __defaults__, __closure__, refs, name):
    """Used by Pickle to reconstruct a function-based histogrammar.util.UserFcn from Pickle data.

    The function is rebuilt from its marshalled code object; its globals are
    this module's globals overlaid with ``refs``.

    :param cls: class to instantiate (via ``cls.__new__``; ``__init__`` is not run)
    :param __code__: marshalled code object
    :param __name__: name of the rebuilt function
    :param __defaults__: default argument values of the function
    :param __closure__: closure of the function
    :param refs: dict of global names the function needs
    :param name: value for the ``name`` attribute
    :return: the new instance
    """
    namespace = dict(globals())
    namespace.update(refs)
    code = marshal.loads(__code__)
    restored = cls.__new__(cls)
    restored.expr = types.FunctionType(code, namespace, __name__, __defaults__, __closure__)
    restored.name = name
    return restored
def serializable(fcn):
    """Create a serializable version of fcn (histogrammar.util.UserFcn).

    ``fcn`` can be a types.FunctionType or a string.

    Unlike the histogrammar.util.UserFcn constructor, this function avoids duplication (doubly wrapped objects)
    and commutes with histogrammar.util.cached and histogrammar.util.named (they can be applied in any order).

    :param fcn: the function, string, or already wrapped UserFcn
    :return: ``fcn`` itself if it already is a UserFcn, otherwise a new UserFcn wrapping it
    """
    return fcn if isinstance(fcn, UserFcn) else UserFcn(fcn)
def cached(fcn):
    """Create a cached version of this function.

    Unlike the histogrammar.util.CachedFcn constructor, this function avoids duplication (doubly wrapped objects)
    and commutes with histogrammar.util.named and histogrammar.util.serializable (they can be applied in either
    order).

    **Example:**

    ::

        f = cached(lambda x: complexFunction(x))
        f(3.14)   # computes the function
        f(3.14)   # re-uses the old value
        f(4.56)   # computes the function again at a new point

    :param fcn: the function, string, or already wrapped UserFcn
    :return: ``fcn`` itself if it already is a CachedFcn, otherwise a CachedFcn with the same expression
        (and, for a UserFcn, the same name)
    """
    if isinstance(fcn, CachedFcn):
        return fcn
    if isinstance(fcn, UserFcn):
        # re-wrap, carrying the existing name over
        return CachedFcn(fcn.expr, fcn.name)
    return CachedFcn(fcn)
def named(name, fcn):
    """Create a named, serializable version of fcn (histogrammar.util.UserFcn).

    ``fcn`` can be a types.FunctionType or a string.

    Unlike the histogrammar.util.UserFcn constructor, this function avoids duplication (doubly wrapped objects) and
    commutes with histogrammar.util.cached and histogrammar.util.serializable (they can be applied in any order).

    :param name: the name to attach
    :param fcn: the function, string, or already wrapped UserFcn
    :return: a UserFcn (a CachedFcn if ``fcn`` was one) carrying ``name``
    :raises ValueError: if ``fcn`` is a UserFcn that already has a name
    """
    if not isinstance(fcn, UserFcn):
        return UserFcn(fcn, name)
    if fcn.name is not None:
        raise ValueError("two names applied to the same function: {0} and {1}".format(fcn.name, name))
    # keep the caching flavour of the wrapper that was passed in
    wrapper = CachedFcn if isinstance(fcn, CachedFcn) else UserFcn
    return wrapper(fcn.expr, name)
def get_n_dim(hist, itr=0):
    """Histogram dimension

    Counts one dimension per nested binning level by recursing into the
    first sub-histogram.

    :param hist: input histogram
    :param int itr: recursion depth; 0 for the outermost histogram
    :returns: dimension of the histogram
    :rtype: int
    """
    # anything that is not a container, and plain counters, add no dimension
    if not isinstance(hist, histogrammar.Container) or isinstance(hist, histogrammar.Count):
        return 0

    if isinstance(hist, histogrammar.Bag):
        dimension = hist.dimension
        return dimension if dimension > 0 else 1

    aggregators = (histogrammar.Maximize, histogrammar.Minimize, histogrammar.Average,
                   histogrammar.Deviate, histogrammar.Sum)
    if isinstance(hist, aggregators):
        # counts as one dimension only when it is the outermost histogram
        return 0 if itr != 0 else 1

    # histogram has a sub-histogram: one dimension for this level plus the rest
    return 1 + get_n_dim(_get_sub_hist(hist), itr + 1)
def get_datatype(hist, itr=0):
    """Get histogrammar histogram datatype(s) of its axes

    Returns the data type of the variable represented by each axis of the
    histogram, making an educated guess where necessary (numbers that look
    like nanosecond timestamps are reported as ``np.datetime64``).

    :param hist: input histogram
    :param int itr: recursion depth; 0 for the outermost histogram
    :returns: list with datatypes of all dimensions of the histogram
    :rtype: list
    """
    import numpy as np

    # no sub-histogram possible for these:
    if not isinstance(hist, histogrammar.Container) or isinstance(hist, histogrammar.Count):
        return []

    if isinstance(hist, histogrammar.Bag):
        if hist.dimension > 0:
            return [np.number] * hist.dimension
        return [str] if hist.range == 'S' else [np.number]

    aggregators = (histogrammar.Maximize, histogrammar.Minimize, histogrammar.Average,
                   histogrammar.Deviate, histogrammar.Sum)
    if isinstance(hist, aggregators):
        # nested below another histogram: the parent already reported the data type
        if itr > 0:
            return []
        # no parent histogram: the input must be a number; use the aggregated
        # value to guess whether it is a converted timestamp
        if isinstance(hist, histogrammar.Maximize):
            value = hist.max
        elif isinstance(hist, histogrammar.Minimize):
            value = hist.min
        elif isinstance(hist, (histogrammar.Average, histogrammar.Deviate)):
            value = hist.mean
        elif isinstance(hist, histogrammar.Sum):
            value = hist.sum
        else:
            value = np.nan
        looks_like_time = hist.entries > 0 and _is_probable_timestamp(value)
        return [np.datetime64] if looks_like_time else [np.number]

    # from here on the histogram has a sub-histogram
    if isinstance(hist, histogrammar.Categorize):
        # keys are strings or booleans
        if len(hist.bins) > 0:
            keys = list(hist.bins.keys())
            key_type = type(keys[0]) if keys else str
            key_type = np.dtype(key_type).type
            if any(key_type is textual for textual in (np.str_, np.bytes_, np.object_)):
                key_type = str
            axis_type = [key_type]
        else:
            # unfilled, default to string (but who knows?)
            axis_type = [str]
    else:
        # the input must be a number; use the axis range to guess whether it
        # is a converted timestamp
        if isinstance(hist, (histogrammar.Bin, histogrammar.SparselyBin)):
            probes = [hist.low, hist.high]
        elif isinstance(hist, histogrammar.CentrallyBin):
            probes = hist.centers
        elif isinstance(hist, (histogrammar.IrregularlyBin, histogrammar.Stack)):
            # throw away inf
            probes = hist.thresholds[1:]
        else:
            # e.g. Fraction and Select: nothing to probe, assumed numeric
            probes = []
        looks_like_time = (len(probes) > 0 and _is_probable_timestamp(probes[0])
                           and _is_probable_timestamp(probes[-1]))
        axis_type = [np.datetime64] if looks_like_time else [np.number]

    # append the data types of the nested levels
    return axis_type + get_datatype(_get_sub_hist(hist), itr + 1)
def _get_sub_hist(hist):
    """Get sub-histogram of input histogram

    In case input is multi-dimensional, get the first sub-histogram of the
    input histogram.

    :param hist: input histogram
    :return: sub-histogram, else None
    """
    # sub-histograms are only possible for the histogram types handled below
    if isinstance(hist, histogrammar.Categorize):
        return hist.values[0] if hist.values else hist.value

    if isinstance(hist, histogrammar.Bin):
        bins = hist.values
        if len(bins) == 0:
            return histogrammar.Count()
        if hist.entries > 0:
            # first filled bin; if none is filled, fall back to the first bin
            return next((b for b in bins if b.entries > 0), bins[0])
        return bins[0]

    if isinstance(hist, (histogrammar.SparselyBin, histogrammar.CentrallyBin)):
        return next(iter(dict(hist.bins).values())) if hist.bins else hist.value

    if isinstance(hist, (histogrammar.IrregularlyBin, histogrammar.Stack)):
        return next(iter(dict(hist.bins).values())) if hist.bins else histogrammar.Count()

    if isinstance(hist, histogrammar.Fraction):
        return hist.denominator if hist.denominator else histogrammar.Count()

    if isinstance(hist, histogrammar.Select):
        return hist.cut if hist.cut else histogrammar.Count()

    return None
def _is_probable_timestamp(value, DATE_LOW=5e16, DATE_HIGH=9.9e18):
"""function to check if input number is probably a timestamp in nanoseconds
:param value: input value
:return: True if timestamp
"""
import numpy as np
# HACK: making an educated guess for timestamp
# large numbers (time in ns since 1970) used to determine if float corresponds to a timestamp
# DATE_LOW = 5e16 = 1971-08-02 16:53:20 in nanosec
# DATE_HIGH = 9.9e18 = 2260-1-1 in nanosec
# timestamp is in ns since 1970, so a huge number.
is_ts = False
if isinstance(value, (np.number,float,int)) and not np.isnan(value):
is_ts = DATE_LOW < value < DATE_HIGH
return is_ts
@property
def n_dim(self):  # noqa
    """Histogram dimension

    Delegates to ``get_n_dim`` for this histogram.

    :returns: dimension of the histogram
    :rtype: int
    """
    dimension = get_n_dim(self)
    return dimension
@property
def datatype(self):  # noqa
    """Data type of histogram variable.

    Uses ``get_datatype`` to make an educated guess of the data type of the
    variable(s) represented by the histogram.

    :returns: the single data type for a one-axis histogram, ``type(None)``
        when no data type could be determined, otherwise the list of data
        types (one per axis)
    :rtype: type or list(type)
    """
    inferred = get_datatype(self)
    if not isinstance(inferred, list):
        return inferred
    if len(inferred) == 1:
        return inferred[0]
    if len(inferred) == 0:
        return type(None)
    return inferred
def get_hist_props(hist):
    """Get histogram datatype properties.

    For a multi-axis histogram (``hist.datatype`` is a list) the properties
    describe the first axis.

    :param hist: input histogram; only its ``datatype`` attribute is read
    :returns dict: Column properties: ``dtype`` (numpy dtype) and the flags
        ``is_num``, ``is_int``, ``is_ts`` and ``is_bool``
    """
    import numpy as np

    # ``datatype`` may be a computed property, so read it exactly once
    datatype = hist.datatype
    var_type = datatype[0] if isinstance(datatype, list) else datatype
    npdtype = np.dtype(var_type)

    # determine data-type categories from one default-constructed scalar
    sample = npdtype.type()
    is_int = isinstance(sample, np.integer)
    is_ts = isinstance(sample, np.datetime64)
    is_num = is_ts or isinstance(sample, np.number)
    is_bool = isinstance(sample, np.bool_)

    return dict(
        dtype=npdtype, is_num=is_num, is_int=is_int, is_ts=is_ts, is_bool=is_bool
    )
def dumper(obj):
    """Utility function to convert objects to json

    From: https://stackoverflow.com/questions/3768895/how-to-make-a-class-json-serializable

    E.g. use to convert dict of histogrammar objects to json

    Use as:

    .. code-block:: python

        js = json.dumps(hists, default=dumper)
        with open(filename, 'w') as f:
            json.dump(hists, f, default=dumper)

    The object's ``toJSON()`` is preferred, then ``toJson()``, then its
    ``__dict__``.

    :param obj: input object
    :return: output json object
    :raises RuntimeError: if the object offers none of the three
    """
    for converter in ("toJSON", "toJson"):
        if hasattr(obj, converter):
            return getattr(obj, converter)()
    if hasattr(obj, "__dict__"):
        return obj.__dict__
    raise RuntimeError(f"Do not know how to serialize object type {type(obj)}")