1515"""
1616
1717from abc import ABCMeta , abstractmethod
18+ import codecs
1819from collections .abc import Iterable , MutableMapping
1920import os
2021import re
@@ -807,13 +808,42 @@ def cf_label_data(self, cf_data_var):
807808 label_data = self [:]
808809
809810 if ma .isMaskedArray (label_data ):
810- label_data = label_data .filled ()
811+ label_data = label_data .filled (b"\0 " )
812+
813+ default_encoding = "utf-8"
814+ encoding = getattr (self , "_Encoding" , None )
815+ if encoding is None :
816+ # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
817+ encoding = default_encoding
818+ else :
819+ try :
820+ # Accept + normalise naming of encodings
821+ encoding = codecs .lookup (encoding ).name
822+ # NOTE: if encoding does not suit data, errors can occur.
823+ # For example, _Encoding = "ascii", with non-ascii content.
824+ except LookupError :
825+ # Replace some invalid setting with "safe"(ish) fallback.
826+ encoding = default_encoding
827+
def string_from_1d_bytearray(array, encoding):
    r"""Decode a 1-D array of single bytes into a stripped string.

    Iterating a numpy "S1" array yields an *empty* byte b'' for any element
    that holds a zero byte b'\0', so a plain ``b"".join(array)`` silently
    drops zero bytes.  That corrupts multi-byte encodings (e.g. utf-16),
    where zero bytes are part of the code units.  The raw buffer is therefore
    taken with ``tobytes()``, which keeps every byte exactly as stored.

    Parameters
    ----------
    array : numpy array
        One-dimensional array whose dtype is a 1-byte string ("S1").
    encoding : str
        Name of the codec used to decode the bytes.

    Returns
    -------
    str
        The decoded text, with NUL padding and surrounding whitespace
        removed from both ends.

    Raises
    ------
    ValueError
        If ``array`` is not a 1-D array of single bytes.
    UnicodeDecodeError
        If the bytes are not valid in ``encoding``.
    """
    # Explicit checks rather than asserts, so they still run under "python -O".
    if array.dtype.kind != "S" or array.dtype.itemsize != 1:
        raise ValueError(
            f"Expected an array of single bytes (dtype 'S1'), got dtype {array.dtype!r}."
        )
    if array.ndim != 1:
        raise ValueError(f"Expected a 1-D array, got {array.ndim} dimensions.")

    # One byte per element, zero bytes included: len(raw) == array.shape[0].
    raw = array.tobytes()
    text = raw.decode(encoding=encoding)
    # str.strip() alone leaves NUL characters in place, so remove the NUL
    # padding first and then any surrounding whitespace.
    return text.strip("\0").strip()
811842
812843 # Determine whether we have a string-valued scalar label
813844 # i.e. a character variable that only has one dimension (the length of the string).
814845 if self .ndim == 1 :
815- label_string = b"" .join (label_data ).strip ()
816- label_string = label_string .decode ("utf8" )
846+ label_string = string_from_1d_bytearray (label_data , encoding )
817847 data = np .array ([label_string ])
818848 else :
819849 # Determine the index of the string dimension.
@@ -834,9 +864,10 @@ def cf_label_data(self, cf_data_var):
834864 else :
835865 label_index = index + (slice (None , None ),)
836866
837- label_string = b"" .join (label_data [label_index ]).strip ()
838- label_string = label_string .decode ("utf8" )
839- data [index ] = label_string
867+ label_string = string_from_1d_bytearray (
868+ label_data [label_index ], encoding
869+ )
870+ data [index ] = label_string .strip ()
840871
841872 return data
842873
0 commit comments