Skip to content
forked from pydata/xarray

Commit fbcac76

Browse files
kmuehlbauer, dcherian, pre-commit-ci[bot]
authored
correctly encode/decode _FillValues/missing_values/dtypes for packed data (pydata#8713)
* correctly encode/decode _FillValues * fix mypy * fix CFMaskCode test * fix scale/offset * avert zarr issue * add whats-new.rst entry * refactor fillvalue/missing value check to catch non-conforming values, apply review suggestions * fix typing * suppress warning in doc * FIX: silence SerializationWarnings * FIX: silence mypy by casting to string early * Update xarray/tests/test_conventions.py Co-authored-by: Deepak Cherian <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Shorten test, add comment checking for two warnings * make test even shorter --------- Co-authored-by: Deepak Cherian <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent c9d3084 commit fbcac76

File tree

6 files changed

+192
-99
lines changed

6 files changed

+192
-99
lines changed

Diff for: doc/user-guide/indexing.rst

+1
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,7 @@ __ https://numpy.org/doc/stable/user/basics.indexing.html#assigning-values-to-in
549549
You can also assign values to all variables of a :py:class:`Dataset` at once:
550550

551551
.. ipython:: python
552+
:okwarning:
552553
553554
ds_org = xr.tutorial.open_dataset("eraint_uvz").isel(
554555
latitude=slice(56, 59), longitude=slice(255, 258), level=0

Diff for: doc/whats-new.rst

+4
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ Bug fixes
5353
when used in :py:meth:`DataArray.expand_dims` and
5454
:py:meth:`Dataset.expand_dims` (:pull:`8781`). By `Spencer
5555
Clark <https://github.com/spencerkclark>`_.
56+
- CF conform handling of `_FillValue`/`missing_value` and `dtype` in
57+
`CFMaskCoder`/`CFScaleOffsetCoder` (:issue:`2304`, :issue:`5597`,
58+
:issue:`7691`, :pull:`8713`, see also discussion in :pull:`7654`).
59+
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
5660

5761
Documentation
5862
~~~~~~~~~~~~~

Diff for: xarray/coding/variables.py

+126-57
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,44 @@ def _is_time_like(units):
262262
return any(tstr == units for tstr in time_strings)
263263

264264

265+
def _check_fill_values(attrs, name, dtype):
266+
"""Check _FillValue and missing_value if available.
267+
268+
Return dictionary with raw fill values and set with encoded fill values.
269+
270+
Issue SerializationWarning if appropriate.
271+
"""
272+
raw_fill_dict = {}
273+
[
274+
pop_to(attrs, raw_fill_dict, attr, name=name)
275+
for attr in ("missing_value", "_FillValue")
276+
]
277+
encoded_fill_values = set()
278+
for k in list(raw_fill_dict):
279+
v = raw_fill_dict[k]
280+
kfill = {fv for fv in np.ravel(v) if not pd.isnull(fv)}
281+
if not kfill and np.issubdtype(dtype, np.integer):
282+
warnings.warn(
283+
f"variable {name!r} has non-conforming {k!r} "
284+
f"{v!r} defined, dropping {k!r} entirely.",
285+
SerializationWarning,
286+
stacklevel=3,
287+
)
288+
del raw_fill_dict[k]
289+
else:
290+
encoded_fill_values |= kfill
291+
292+
if len(encoded_fill_values) > 1:
293+
warnings.warn(
294+
f"variable {name!r} has multiple fill values "
295+
f"{encoded_fill_values} defined, decoding all values to NaN.",
296+
SerializationWarning,
297+
stacklevel=3,
298+
)
299+
300+
return raw_fill_dict, encoded_fill_values
301+
302+
265303
class CFMaskCoder(VariableCoder):
266304
"""Mask or unmask fill values according to CF conventions."""
267305

@@ -283,67 +321,56 @@ def encode(self, variable: Variable, name: T_Name = None):
283321
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
284322
)
285323

286-
# special case DateTime to properly handle NaT
287-
is_time_like = _is_time_like(attrs.get("units"))
288-
289324
if fv_exists:
290325
# Ensure _FillValue is cast to same dtype as data's
291326
encoding["_FillValue"] = dtype.type(fv)
292327
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
293-
if not pd.isnull(fill_value):
294-
if is_time_like and data.dtype.kind in "iu":
295-
data = duck_array_ops.where(
296-
data != np.iinfo(np.int64).min, data, fill_value
297-
)
298-
else:
299-
data = duck_array_ops.fillna(data, fill_value)
300328

301329
if mv_exists:
302-
# Ensure missing_value is cast to same dtype as data's
303-
encoding["missing_value"] = dtype.type(mv)
330+
# try to use _FillValue, if it exists to align both values
331+
# or use missing_value and ensure it's cast to same dtype as data's
332+
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
304333
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
305-
if not pd.isnull(fill_value) and not fv_exists:
306-
if is_time_like and data.dtype.kind in "iu":
307-
data = duck_array_ops.where(
308-
data != np.iinfo(np.int64).min, data, fill_value
309-
)
310-
else:
311-
data = duck_array_ops.fillna(data, fill_value)
334+
335+
# apply fillna
336+
if not pd.isnull(fill_value):
337+
# special case DateTime to properly handle NaT
338+
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
339+
data = duck_array_ops.where(
340+
data != np.iinfo(np.int64).min, data, fill_value
341+
)
342+
else:
343+
data = duck_array_ops.fillna(data, fill_value)
312344

313345
return Variable(dims, data, attrs, encoding, fastpath=True)
314346

315347
def decode(self, variable: Variable, name: T_Name = None):
316-
dims, data, attrs, encoding = unpack_for_decoding(variable)
317-
318-
raw_fill_values = [
319-
pop_to(attrs, encoding, attr, name=name)
320-
for attr in ("missing_value", "_FillValue")
321-
]
322-
if raw_fill_values:
323-
encoded_fill_values = {
324-
fv
325-
for option in raw_fill_values
326-
for fv in np.ravel(option)
327-
if not pd.isnull(fv)
328-
}
329-
330-
if len(encoded_fill_values) > 1:
331-
warnings.warn(
332-
"variable {!r} has multiple fill values {}, "
333-
"decoding all values to NaN.".format(name, encoded_fill_values),
334-
SerializationWarning,
335-
stacklevel=3,
336-
)
348+
raw_fill_dict, encoded_fill_values = _check_fill_values(
349+
variable.attrs, name, variable.dtype
350+
)
337351

338-
# special case DateTime to properly handle NaT
339-
dtype: np.typing.DTypeLike
340-
decoded_fill_value: Any
341-
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
342-
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
343-
else:
344-
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
352+
if raw_fill_dict:
353+
dims, data, attrs, encoding = unpack_for_decoding(variable)
354+
[
355+
safe_setitem(encoding, attr, value, name=name)
356+
for attr, value in raw_fill_dict.items()
357+
]
345358

346359
if encoded_fill_values:
360+
# special case DateTime to properly handle NaT
361+
dtype: np.typing.DTypeLike
362+
decoded_fill_value: Any
363+
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu":
364+
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min
365+
else:
366+
if "scale_factor" not in attrs and "add_offset" not in attrs:
367+
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
368+
else:
369+
dtype, decoded_fill_value = (
370+
_choose_float_dtype(data.dtype, attrs),
371+
np.nan,
372+
)
373+
347374
transform = partial(
348375
_apply_mask,
349376
encoded_fill_values=encoded_fill_values,
@@ -366,20 +393,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
366393
return data
367394

368395

369-
def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]:
396+
def _choose_float_dtype(
397+
dtype: np.dtype, mapping: MutableMapping
398+
) -> type[np.floating[Any]]:
370399
"""Return a float dtype that can losslessly represent `dtype` values."""
371-
# Keep float32 as-is. Upcast half-precision to single-precision,
400+
# check scale/offset first to derive wanted float dtype
401+
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
402+
scale_factor = mapping.get("scale_factor")
403+
add_offset = mapping.get("add_offset")
404+
if scale_factor is not None or add_offset is not None:
405+
# get the type from scale_factor/add_offset to determine
406+
# the needed floating point type
407+
if scale_factor is not None:
408+
scale_type = np.dtype(type(scale_factor))
409+
if add_offset is not None:
410+
offset_type = np.dtype(type(add_offset))
411+
# CF conforming, both scale_factor and add-offset are given and
412+
# of same floating point type (float32/64)
413+
if (
414+
add_offset is not None
415+
and scale_factor is not None
416+
and offset_type == scale_type
417+
and scale_type in [np.float32, np.float64]
418+
):
419+
# in case of int32 -> we need upcast to float64
420+
# due to precision issues
421+
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
422+
return np.float64
423+
return scale_type.type
424+
# Not CF conforming and add_offset given:
425+
# A scale factor is entirely safe (vanishing into the mantissa),
426+
# but a large integer offset could lead to loss of precision.
427+
# Sensitivity analysis can be tricky, so we just use a float64
428+
# if there's any offset at all - better unoptimised than wrong!
429+
if add_offset is not None:
430+
return np.float64
431+
# return dtype depending on given scale_factor
432+
return scale_type.type
433+
# If no scale_factor or add_offset is given, use some general rules.
434+
# Keep float32 as-is. Upcast half-precision to single-precision,
372435
# because float16 is "intended for storage but not computation"
373436
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
374437
return np.float32
375438
# float32 can exactly represent all integers up to 24 bits
376439
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
377-
# A scale factor is entirely safe (vanishing into the mantissa),
378-
# but a large integer offset could lead to loss of precision.
379-
# Sensitivity analysis can be tricky, so we just use a float64
380-
# if there's any offset at all - better unoptimised than wrong!
381-
if not has_offset:
382-
return np.float32
440+
return np.float32
383441
# For all other types and circumstances, we just use float64.
384442
# (safe because eg. complex numbers are not supported in NetCDF)
385443
return np.float64
@@ -396,7 +454,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
396454
dims, data, attrs, encoding = unpack_for_encoding(variable)
397455

398456
if "scale_factor" in encoding or "add_offset" in encoding:
399-
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
457+
# if we have a _FillValue/masked_value we do not want to cast now
458+
# but leave that to CFMaskCoder
459+
dtype = data.dtype
460+
if "_FillValue" not in encoding and "missing_value" not in encoding:
461+
dtype = _choose_float_dtype(data.dtype, encoding)
462+
# but still we need a copy prevent changing original data
400463
data = duck_array_ops.astype(data, dtype=dtype, copy=True)
401464
if "add_offset" in encoding:
402465
data -= pop_to(encoding, attrs, "add_offset", name=name)
@@ -412,11 +475,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
412475

413476
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name)
414477
add_offset = pop_to(attrs, encoding, "add_offset", name=name)
415-
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding)
416478
if np.ndim(scale_factor) > 0:
417479
scale_factor = np.asarray(scale_factor).item()
418480
if np.ndim(add_offset) > 0:
419481
add_offset = np.asarray(add_offset).item()
482+
# if we have a _FillValue/masked_value we already have the wanted
483+
# floating point dtype here (via CFMaskCoder), so no check is necessary
484+
# only check in other cases
485+
dtype = data.dtype
486+
if "_FillValue" not in encoding and "missing_value" not in encoding:
487+
dtype = _choose_float_dtype(dtype, encoding)
488+
420489
transform = partial(
421490
_scale_offset_decoding,
422491
scale_factor=scale_factor,

0 commit comments

Comments
 (0)