@@ -262,6 +262,44 @@ def _is_time_like(units):
262
262
return any (tstr == units for tstr in time_strings )
263
263
264
264
265
+ def _check_fill_values (attrs , name , dtype ):
266
+ """ "Check _FillValue and missing_value if available.
267
+
268
+ Return dictionary with raw fill values and set with encoded fill values.
269
+
270
+ Issue SerializationWarning if appropriate.
271
+ """
272
+ raw_fill_dict = {}
273
+ [
274
+ pop_to (attrs , raw_fill_dict , attr , name = name )
275
+ for attr in ("missing_value" , "_FillValue" )
276
+ ]
277
+ encoded_fill_values = set ()
278
+ for k in list (raw_fill_dict ):
279
+ v = raw_fill_dict [k ]
280
+ kfill = {fv for fv in np .ravel (v ) if not pd .isnull (fv )}
281
+ if not kfill and np .issubdtype (dtype , np .integer ):
282
+ warnings .warn (
283
+ f"variable { name !r} has non-conforming { k !r} "
284
+ f"{ v !r} defined, dropping { k !r} entirely." ,
285
+ SerializationWarning ,
286
+ stacklevel = 3 ,
287
+ )
288
+ del raw_fill_dict [k ]
289
+ else :
290
+ encoded_fill_values |= kfill
291
+
292
+ if len (encoded_fill_values ) > 1 :
293
+ warnings .warn (
294
+ f"variable { name !r} has multiple fill values "
295
+ f"{ encoded_fill_values } defined, decoding all values to NaN." ,
296
+ SerializationWarning ,
297
+ stacklevel = 3 ,
298
+ )
299
+
300
+ return raw_fill_dict , encoded_fill_values
301
+
302
+
265
303
class CFMaskCoder (VariableCoder ):
266
304
"""Mask or unmask fill values according to CF conventions."""
267
305
@@ -283,67 +321,56 @@ def encode(self, variable: Variable, name: T_Name = None):
283
321
f"Variable { name !r} has conflicting _FillValue ({ fv } ) and missing_value ({ mv } ). Cannot encode data."
284
322
)
285
323
286
- # special case DateTime to properly handle NaT
287
- is_time_like = _is_time_like (attrs .get ("units" ))
288
-
289
324
if fv_exists :
290
325
# Ensure _FillValue is cast to same dtype as data's
291
326
encoding ["_FillValue" ] = dtype .type (fv )
292
327
fill_value = pop_to (encoding , attrs , "_FillValue" , name = name )
293
- if not pd .isnull (fill_value ):
294
- if is_time_like and data .dtype .kind in "iu" :
295
- data = duck_array_ops .where (
296
- data != np .iinfo (np .int64 ).min , data , fill_value
297
- )
298
- else :
299
- data = duck_array_ops .fillna (data , fill_value )
300
328
301
329
if mv_exists :
302
- # Ensure missing_value is cast to same dtype as data's
303
- encoding ["missing_value" ] = dtype .type (mv )
330
+ # try to use _FillValue, if it exists to align both values
331
+ # or use missing_value and ensure it's cast to same dtype as data's
332
+ encoding ["missing_value" ] = attrs .get ("_FillValue" , dtype .type (mv ))
304
333
fill_value = pop_to (encoding , attrs , "missing_value" , name = name )
305
- if not pd .isnull (fill_value ) and not fv_exists :
306
- if is_time_like and data .dtype .kind in "iu" :
307
- data = duck_array_ops .where (
308
- data != np .iinfo (np .int64 ).min , data , fill_value
309
- )
310
- else :
311
- data = duck_array_ops .fillna (data , fill_value )
334
+
335
+ # apply fillna
336
+ if not pd .isnull (fill_value ):
337
+ # special case DateTime to properly handle NaT
338
+ if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
339
+ data = duck_array_ops .where (
340
+ data != np .iinfo (np .int64 ).min , data , fill_value
341
+ )
342
+ else :
343
+ data = duck_array_ops .fillna (data , fill_value )
312
344
313
345
return Variable (dims , data , attrs , encoding , fastpath = True )
314
346
315
347
def decode (self , variable : Variable , name : T_Name = None ):
316
- dims , data , attrs , encoding = unpack_for_decoding (variable )
317
-
318
- raw_fill_values = [
319
- pop_to (attrs , encoding , attr , name = name )
320
- for attr in ("missing_value" , "_FillValue" )
321
- ]
322
- if raw_fill_values :
323
- encoded_fill_values = {
324
- fv
325
- for option in raw_fill_values
326
- for fv in np .ravel (option )
327
- if not pd .isnull (fv )
328
- }
329
-
330
- if len (encoded_fill_values ) > 1 :
331
- warnings .warn (
332
- "variable {!r} has multiple fill values {}, "
333
- "decoding all values to NaN." .format (name , encoded_fill_values ),
334
- SerializationWarning ,
335
- stacklevel = 3 ,
336
- )
348
+ raw_fill_dict , encoded_fill_values = _check_fill_values (
349
+ variable .attrs , name , variable .dtype
350
+ )
337
351
338
- # special case DateTime to properly handle NaT
339
- dtype : np .typing .DTypeLike
340
- decoded_fill_value : Any
341
- if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
342
- dtype , decoded_fill_value = np .int64 , np .iinfo (np .int64 ).min
343
- else :
344
- dtype , decoded_fill_value = dtypes .maybe_promote (data .dtype )
352
+ if raw_fill_dict :
353
+ dims , data , attrs , encoding = unpack_for_decoding (variable )
354
+ [
355
+ safe_setitem (encoding , attr , value , name = name )
356
+ for attr , value in raw_fill_dict .items ()
357
+ ]
345
358
346
359
if encoded_fill_values :
360
+ # special case DateTime to properly handle NaT
361
+ dtype : np .typing .DTypeLike
362
+ decoded_fill_value : Any
363
+ if _is_time_like (attrs .get ("units" )) and data .dtype .kind in "iu" :
364
+ dtype , decoded_fill_value = np .int64 , np .iinfo (np .int64 ).min
365
+ else :
366
+ if "scale_factor" not in attrs and "add_offset" not in attrs :
367
+ dtype , decoded_fill_value = dtypes .maybe_promote (data .dtype )
368
+ else :
369
+ dtype , decoded_fill_value = (
370
+ _choose_float_dtype (data .dtype , attrs ),
371
+ np .nan ,
372
+ )
373
+
347
374
transform = partial (
348
375
_apply_mask ,
349
376
encoded_fill_values = encoded_fill_values ,
@@ -366,20 +393,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp
366
393
return data
367
394
368
395
369
- def _choose_float_dtype (dtype : np .dtype , has_offset : bool ) -> type [np .floating [Any ]]:
396
+ def _choose_float_dtype (
397
+ dtype : np .dtype , mapping : MutableMapping
398
+ ) -> type [np .floating [Any ]]:
370
399
"""Return a float dtype that can losslessly represent `dtype` values."""
371
- # Keep float32 as-is. Upcast half-precision to single-precision,
400
+ # check scale/offset first to derive wanted float dtype
401
+ # see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954
402
+ scale_factor = mapping .get ("scale_factor" )
403
+ add_offset = mapping .get ("add_offset" )
404
+ if scale_factor is not None or add_offset is not None :
405
+ # get the type from scale_factor/add_offset to determine
406
+ # the needed floating point type
407
+ if scale_factor is not None :
408
+ scale_type = np .dtype (type (scale_factor ))
409
+ if add_offset is not None :
410
+ offset_type = np .dtype (type (add_offset ))
411
+ # CF conforming, both scale_factor and add-offset are given and
412
+ # of same floating point type (float32/64)
413
+ if (
414
+ add_offset is not None
415
+ and scale_factor is not None
416
+ and offset_type == scale_type
417
+ and scale_type in [np .float32 , np .float64 ]
418
+ ):
419
+ # in case of int32 -> we need upcast to float64
420
+ # due to precision issues
421
+ if dtype .itemsize == 4 and np .issubdtype (dtype , np .integer ):
422
+ return np .float64
423
+ return scale_type .type
424
+ # Not CF conforming and add_offset given:
425
+ # A scale factor is entirely safe (vanishing into the mantissa),
426
+ # but a large integer offset could lead to loss of precision.
427
+ # Sensitivity analysis can be tricky, so we just use a float64
428
+ # if there's any offset at all - better unoptimised than wrong!
429
+ if add_offset is not None :
430
+ return np .float64
431
+ # return dtype depending on given scale_factor
432
+ return scale_type .type
433
+ # If no scale_factor or add_offset is given, use some general rules.
434
+ # Keep float32 as-is. Upcast half-precision to single-precision,
372
435
# because float16 is "intended for storage but not computation"
373
436
if dtype .itemsize <= 4 and np .issubdtype (dtype , np .floating ):
374
437
return np .float32
375
438
# float32 can exactly represent all integers up to 24 bits
376
439
if dtype .itemsize <= 2 and np .issubdtype (dtype , np .integer ):
377
- # A scale factor is entirely safe (vanishing into the mantissa),
378
- # but a large integer offset could lead to loss of precision.
379
- # Sensitivity analysis can be tricky, so we just use a float64
380
- # if there's any offset at all - better unoptimised than wrong!
381
- if not has_offset :
382
- return np .float32
440
+ return np .float32
383
441
# For all other types and circumstances, we just use float64.
384
442
# (safe because eg. complex numbers are not supported in NetCDF)
385
443
return np .float64
@@ -396,7 +454,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
396
454
dims , data , attrs , encoding = unpack_for_encoding (variable )
397
455
398
456
if "scale_factor" in encoding or "add_offset" in encoding :
399
- dtype = _choose_float_dtype (data .dtype , "add_offset" in encoding )
457
+ # if we have a _FillValue/masked_value we do not want to cast now
458
+ # but leave that to CFMaskCoder
459
+ dtype = data .dtype
460
+ if "_FillValue" not in encoding and "missing_value" not in encoding :
461
+ dtype = _choose_float_dtype (data .dtype , encoding )
462
+ # but still we need a copy prevent changing original data
400
463
data = duck_array_ops .astype (data , dtype = dtype , copy = True )
401
464
if "add_offset" in encoding :
402
465
data -= pop_to (encoding , attrs , "add_offset" , name = name )
@@ -412,11 +475,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
412
475
413
476
scale_factor = pop_to (attrs , encoding , "scale_factor" , name = name )
414
477
add_offset = pop_to (attrs , encoding , "add_offset" , name = name )
415
- dtype = _choose_float_dtype (data .dtype , "add_offset" in encoding )
416
478
if np .ndim (scale_factor ) > 0 :
417
479
scale_factor = np .asarray (scale_factor ).item ()
418
480
if np .ndim (add_offset ) > 0 :
419
481
add_offset = np .asarray (add_offset ).item ()
482
+ # if we have a _FillValue/masked_value we already have the wanted
483
+ # floating point dtype here (via CFMaskCoder), so no check is necessary
484
+ # only check in other cases
485
+ dtype = data .dtype
486
+ if "_FillValue" not in encoding and "missing_value" not in encoding :
487
+ dtype = _choose_float_dtype (dtype , encoding )
488
+
420
489
transform = partial (
421
490
_scale_offset_decoding ,
422
491
scale_factor = scale_factor ,
0 commit comments