Skip to content

Commit 3c50da4

Browse files
committed
Fix label data decoding, for when coded bytes contain zero-bytes: WIP=problems.
1 parent 4653710 commit 3c50da4

File tree

1 file changed

+37
-6
lines changed
  • lib/iris/fileformats

1 file changed

+37
-6
lines changed

lib/iris/fileformats/cf.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""
1616

1717
from abc import ABCMeta, abstractmethod
18+
import codecs
1819
from collections.abc import Iterable, MutableMapping
1920
import os
2021
import re
@@ -807,13 +808,42 @@ def cf_label_data(self, cf_data_var):
807808
label_data = self[:]
808809

809810
if ma.isMaskedArray(label_data):
810-
label_data = label_data.filled()
811+
label_data = label_data.filled(b"\0")
812+
813+
default_encoding = "utf-8"
814+
encoding = getattr(self, "_Encoding", None)
815+
if encoding is None:
816+
# utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
817+
encoding = default_encoding
818+
else:
819+
try:
820+
# Accept + normalise naming of encodings
821+
encoding = codecs.lookup(encoding).name
822+
# NOTE: if encoding does not suit data, errors can occur.
823+
# For example, _Encoding = "ascii", with non-ascii content.
824+
except LookupError:
825+
# Replace some invalid setting with "safe"(ish) fallback.
826+
encoding = default_encoding
827+
828+
def string_from_1d_bytearray(array, encoding):
    r"""Decode a 1-D array of single bytes into a stripped string.

    Numpy reports an "S1" element that holds a zero byte b'\0' as the
    *empty* bytes object b'', so ``b"".join(array)`` silently drops every
    zero byte.  That corrupts multi-byte encodings such as utf-16, where
    zero bytes are significant.  Reading the array's raw buffer instead
    keeps every byte in place.

    Parameters
    ----------
    array : numpy.ndarray
        One-dimensional array with a single-byte string dtype ("S1").
    encoding : str
        Name of the codec used to decode the bytes.

    Returns
    -------
    str
        The decoded text, with NUL padding and then whitespace removed
        from both ends.  NUL characters inside the text are kept.

    Raises
    ------
    ValueError
        If ``array`` is not a one-dimensional "S1" array.
    UnicodeDecodeError
        If the bytes are not valid in ``encoding``.

    """
    # Explicit checks rather than asserts: asserts vanish under "python -O",
    # and tobytes() would otherwise flatten a multi-dimensional array silently.
    if array.dtype.kind != "S" or array.dtype.itemsize != 1:
        raise ValueError(
            f"Expected an array of single bytes (dtype 'S1'), got {array.dtype!r}."
        )
    if array.ndim != 1:
        raise ValueError(
            f"Expected a 1-dimensional array, got {array.ndim} dimensions."
        )

    # tobytes() copies the underlying buffer in order, zero bytes included.
    raw_bytes = array.tobytes()
    string = raw_bytes.decode(encoding=encoding)

    # str.strip() alone leaves "\0" in place, so remove NUL padding first
    # and only then trim ordinary whitespace.
    return string.strip("\0").strip()
811842

812843
# Determine whether we have a string-valued scalar label
813844
# i.e. a character variable that only has one dimension (the length of the string).
814845
if self.ndim == 1:
815-
label_string = b"".join(label_data).strip()
816-
label_string = label_string.decode("utf8")
846+
label_string = string_from_1d_bytearray(label_data, encoding)
817847
data = np.array([label_string])
818848
else:
819849
# Determine the index of the string dimension.
@@ -834,9 +864,10 @@ def cf_label_data(self, cf_data_var):
834864
else:
835865
label_index = index + (slice(None, None),)
836866

837-
label_string = b"".join(label_data[label_index]).strip()
838-
label_string = label_string.decode("utf8")
839-
data[index] = label_string
867+
label_string = string_from_1d_bytearray(
868+
label_data[label_index], encoding
869+
)
870+
data[index] = label_string.strip()
840871

841872
return data
842873

0 commit comments

Comments
 (0)