 """

 import re
-from functools import reduce
-from itertools import chain
+import math

 from influxdb_client.client.write.point import _ESCAPE_KEY, _ESCAPE_STRING, _ESCAPE_MEASUREMENT


-def _replace(data_frame):
-    from ...extras import np
-
-    # string columns
-    obj_cols = {k for k, v in dict(data_frame.dtypes).items() if v is np.dtype('O')}
-
-    # number columns
-    other_cols = set(data_frame.columns) - obj_cols
-
-    obj_nans = (f'{k}=nan' for k in obj_cols)
-    other_nans = (f'{k}=nani?' for k in other_cols)
-
-    replacements = [
-        ('|'.join(chain(obj_nans, other_nans)), ''),
-        (',{2,}', ','),
-        ('|'.join([', ,', ', ', ' ,']), ' '),
-    ]
-
-    return replacements
-
-
 def _itertuples(data_frame):
     cols = [data_frame.iloc[:, k] for k in range(len(data_frame.columns))]
     return zip(data_frame.index, *cols)


-def _is_nan(x):
-    return x != x
+def _not_nan(x):
+    return x == x


 def _any_not_nan(p, indexes):
-    return any(map(lambda inx: not _is_nan(p[inx]), indexes))
+    return any(map(lambda x: _not_nan(p[x]), indexes))

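Side note: the `x == x` test is a deliberate NaN check. NaN is the only value that compares unequal to itself, so no numpy or math import is needed here:

    nan = float('nan')
    nan == nan   # False: NaN never equals itself
    1.5 == 1.5   # True: every other value does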

 def data_frame_to_list_of_points(data_frame, point_settings, **kwargs):
     """Serialize DataFrame into LineProtocols."""
+    # This function is hard to understand, but for good reason:
+    # the approach used here is considerably more efficient
+    # than the alternatives.
+    #
+    # We build up a Python expression that efficiently converts a data point
+    # tuple into a line-protocol entry, and then evaluate the expression
+    # as a lambda so that we can call it. This avoids the overhead of
+    # invoking a function on every data value - we only have one function
+    # call per row instead. The expression consists of exactly
+    # one f-string, so we build up the parts of it as segments
+    # that are concatenated together to make the full f-string inside
+    # the lambda.
+    #
+    # Things are made a little more complex because fields and tags with NaN
+    # values and empty tags are omitted from the generated line-protocol
+    # output.
+    #
+    # As an example, say we have a data frame with two value columns:
+    #     a float
+    #     b int
+    #
+    # This will generate a lambda expression to be evaluated that looks like
+    # this:
+    #
+    #     lambda p: f"""{measurement_name} {keys[0]}={p[1]},{keys[1]}={p[2]}i {p[0].value}"""
+    #
+    # This lambda is then executed for each row p.
+    #
+    # When NaNs are present, the expression looks like this (split
+    # across two lines to satisfy the code-style checker):
+    #
+    #     lambda p: f"""{measurement_name} {"" if math.isnan(p[1])
+    #     else f"{keys[0]}={p[1]}"},{keys[1]}={p[2]}i {p[0].value}"""
+    #
+    # When there's a NaN value in column a, we'll end up with a comma at the
+    # start of the fields, so we run a regexp substitution after generating
+    # the line-protocol entries to remove it.
+    #
+    # We're careful to run these potentially costly extra steps only when NaN
+    # values actually exist in the data.
+
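To make the technique concrete, here is a minimal standalone sketch of the build-then-eval approach (the names and sample values are illustrative, not part of this module):

    # Build the lambda's source as a string, then eval it exactly once.
    # 'keys' is supplied through the globals dict; 'p' is the lambda's parameter.
    keys = ['a', 'b']
    source = 'lambda p: f"m {keys[0]}={p[1]},{keys[1]}={p[2]}i {p[0]}"'
    to_line = eval(source, {'keys': keys})
    to_line((1609459200000000000, 1.5, 7))
    # -> 'm a=1.5,b=7i 1609459200000000000'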
     from ...extras import pd, np
     if not isinstance(data_frame, pd.DataFrame):
         raise TypeError('Must be DataFrame, but type was: {0}.'
                         .format(type(data_frame)))

-    if 'data_frame_measurement_name' not in kwargs:
+    data_frame_measurement_name = kwargs.get('data_frame_measurement_name')
+    if data_frame_measurement_name is None:
         raise TypeError('"data_frame_measurement_name" is a Required Argument')

+    data_frame = data_frame.copy(deep=False)
     if isinstance(data_frame.index, pd.PeriodIndex):
         data_frame.index = data_frame.index.to_timestamp()
     else:
+        # TODO: this is almost certainly not what you want
+        # when the index is the default RangeIndex.
+        # Instead, it would probably be better to leave
+        # out the timestamp unless a time column is explicitly
+        # enabled.
         data_frame.index = pd.to_datetime(data_frame.index)

     if data_frame.index.tzinfo is None:
         data_frame.index = data_frame.index.tz_localize('UTC')

-    measurement_name = str(kwargs.get('data_frame_measurement_name')).translate(_ESCAPE_MEASUREMENT)
     data_frame_tag_columns = kwargs.get('data_frame_tag_columns')
     data_frame_tag_columns = set(data_frame_tag_columns or [])

+    # keys holds a list of string keys.
+    keys = []
+    # tags holds a list of tag f-string segments ordered alphabetically by tag key.
     tags = []
+    # fields holds a list of field f-string segments ordered alphabetically by field key.
     fields = []
-    fields_indexes = []
-    keys = []
+    # field_indexes holds the index into each row of all the fields.
+    field_indexes = []

     if point_settings.defaultTags:
         for key, value in point_settings.defaultTags.items():
-            data_frame[key] = value
-            data_frame_tag_columns.add(key)
-
-    for index, (key, value) in enumerate(data_frame.dtypes.items()):
+            # Avoid overwriting existing data if there's a column
+            # that already exists with the default tag's name.
+            # Note: when a new column is added, the old DataFrame
+            # that we've made a shallow copy of is unaffected.
+            # TODO: when there are NaN or empty values in
+            # the column, we could make a deep copy of the
+            # data and fill in those values with the default tag value.
+            if key not in data_frame.columns:
+                data_frame[key] = value
+                data_frame_tag_columns.add(key)
+
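The shallow copy taken above is what makes this safe: assigning a new column to the copy leaves the caller's DataFrame untouched. A small illustration (the 'host' tag is made up):

    df = pd.DataFrame({'a': [1.0]})
    shallow = df.copy(deep=False)
    shallow['host'] = 'server01'   # the new column exists only on the copy
    'host' in df.columns           # -> False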
+    # Get a list of all the columns sorted by field/tag key.
+    # We want to iterate through the columns in sorted order
+    # so that we know when we're on the first field and hence
+    # whether a comma separator is needed for it.
+    columns = sorted(enumerate(data_frame.dtypes.items()), key=lambda col: col[1][0])
+
+    # null_columns has a bool value for each column holding
+    # whether that column contains any null (NaN or None) values.
+    null_columns = data_frame.isnull().any()
+
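For reference, `isnull().any()` reduces the frame column-wise, yielding one boolean per column:

    df = pd.DataFrame({'a': [1.0, float('nan')], 'b': [1, 2]})
    df.isnull().any()
    # a     True
    # b    False
    # dtype: bool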
+    # Iterate through the columns, building up the expression for each column.
+    for index, (key, value) in columns:
         key = str(key)
+        key_format = f'{{keys[{len(keys)}]}}'
         keys.append(key.translate(_ESCAPE_KEY))
-        key_format = f'{{keys[{index}]}}'
+        # The field index is one more than the column index because the
+        # time index is at column zero in the final zipped-together
+        # result columns.
+        field_index = index + 1
+        val_format = f'p[{field_index}]'

-        index_value = index + 1
         if key in data_frame_tag_columns:
-            tags.append({'key': key, 'value': f"{key_format}={{str(p[{index_value}]).translate(_ESCAPE_KEY)}}"})
-        elif issubclass(value.type, np.integer):
-            fields.append(f"{key_format}={{p[{index_value}]}}i")
-            fields_indexes.append(index_value)
-        elif issubclass(value.type, (np.float, np.bool_)):
-            fields.append(f"{key_format}={{p[{index_value}]}}")
-            fields_indexes.append(index_value)
+            # This column is a tag column.
+            if null_columns[index]:
+                key_value = f"""{{
+                        '' if {val_format} == '' or type({val_format}) == float and math.isnan({val_format}) else
+                        f',{key_format}={{str({val_format}).translate(_ESCAPE_STRING)}}'
+                    }}"""
+            else:
+                key_value = f',{key_format}={{str({val_format}).translate(_ESCAPE_KEY)}}'
+            tags.append(key_value)
+            continue
+
+        # This column is a field column.
+        # Note: no comma separator is needed for the first field.
+        # It's important to omit it because when the first
+        # field column has no nulls, we don't run the comma-removal
+        # regexp substitution step.
+        sep = '' if len(field_indexes) == 0 else ','
+        if issubclass(value.type, np.integer):
+            field_value = f"{sep}{key_format}={{{val_format}}}i"
+        elif issubclass(value.type, np.bool_):
+            field_value = f'{sep}{key_format}={{{val_format}}}'
+        elif issubclass(value.type, np.float):
+            if null_columns[index]:
+                field_value = f"""{{"" if math.isnan({val_format}) else f"{sep}{key_format}={{{val_format}}}"}}"""
+            else:
+                field_value = f'{sep}{key_format}={{{val_format}}}'
         else:
-            fields.append(f"{key_format}=\"{{str(p[{index_value}]).translate(_ESCAPE_STRING)}}\"")
-            fields_indexes.append(index_value)
-
-    tags.sort(key=lambda x: x['key'])
-    tags = ','.join(map(lambda y: y['value'], tags))
-
-    fmt = ('{measurement_name}', f'{"," if tags else ""}', tags,
-           ' ', ','.join(fields), ' {p[0].value}')
-    f = eval("lambda p: f'{}'".format(''.join(fmt)),
-             {'measurement_name': measurement_name, '_ESCAPE_KEY': _ESCAPE_KEY, '_ESCAPE_STRING': _ESCAPE_STRING,
-              'keys': keys})
+            if null_columns[index]:
+                field_value = f"""{{
+                        '' if type({val_format}) == float and math.isnan({val_format}) else
+                        f'{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'
+                    }}"""
+            else:
+                field_value = f'''{sep}{key_format}="{{str({val_format}).translate(_ESCAPE_STRING)}}"'''
+        field_indexes.append(field_index)
+        fields.append(field_value)
+
+    measurement_name = str(data_frame_measurement_name).translate(_ESCAPE_MEASUREMENT)
+
+    tags = ''.join(tags)
+    fields = ''.join(fields)
+    timestamp = '{p[0].value}'
+
+    f = eval(f'lambda p: f"""{{measurement_name}}{tags} {fields} {timestamp}"""', {
+        'measurement_name': measurement_name,
+        '_ESCAPE_KEY': _ESCAPE_KEY,
+        '_ESCAPE_STRING': _ESCAPE_STRING,
+        'keys': keys,
+        'math': math,
+    })
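The `{p[0].value}` segment relies on each row's index entry being a pandas `Timestamp`, whose `value` attribute is the epoch time in nanoseconds, the default line-protocol precision:

    pd.Timestamp('2021-01-01', tz='UTC').value
    # -> 1609459200000000000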

     for k, v in dict(data_frame.dtypes).items():
         if k in data_frame_tag_columns:
             data_frame[k].replace('', np.nan, inplace=True)

-    isnull = data_frame.isnull().any(axis=1)
-
-    if isnull.any():
-        rep = _replace(data_frame)
-        lp = (reduce(lambda a, b: re.sub(*b, a), rep, f(p))
-              for p in filter(lambda x: _any_not_nan(x, fields_indexes), _itertuples(data_frame)))
+    first_field_maybe_null = null_columns[field_indexes[0] - 1]
+    if first_field_maybe_null:
+        # When the first field is null (None/NaN), we'll have
+        # a spurious leading comma which needs to be removed.
+        lp = (re.sub('^((\\ |[^ ])* ),', '\\1', f(p))
+              for p in filter(lambda x: _any_not_nan(x, field_indexes), _itertuples(data_frame)))
         return list(lp)
     else:
         return list(map(f, _itertuples(data_frame)))
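For reference, the leading-comma cleanup in isolation: the pattern captures everything up to and including the space that ends the measurement/tag section and drops the comma left behind by an omitted first field (the sample line is illustrative):

    re.sub('^((\\ |[^ ])* ),', '\\1', 'm,tag=v ,b=7i 1609459200000000000')
    # -> 'm,tag=v b=7i 1609459200000000000'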