diff --git a/lib/iris/fileformats/_nc_load_rules/helpers.py b/lib/iris/fileformats/_nc_load_rules/helpers.py
index 35c2e96924..50e282db5f 100644
--- a/lib/iris/fileformats/_nc_load_rules/helpers.py
+++ b/lib/iris/fileformats/_nc_load_rules/helpers.py
@@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine):
             ),
         )
         if problem is not None:
-            stack_notes = problem.stack_trace.__notes__
+            stack_notes = problem.stack_trace.__notes__  # type: ignore[attr-defined]
             if stack_notes is None:
                 stack_notes = []
             stack_notes.append(
                 f"Skipping disallowed global attribute '{attr_name}' (see above error)"
             )
-            problem.stack_trace.__notes__ = stack_notes
+            problem.stack_trace.__notes__ = stack_notes  # type: ignore[attr-defined]
 
 
 ################################################################################
@@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate(
         )
         if problem is not None:
             coord_var_name = str(cf_coord_var.cf_name)
-            stack_notes = problem.stack_trace.__notes__
+            stack_notes = problem.stack_trace.__notes__  # type: ignore[attr-defined]
             if stack_notes is None:
                 stack_notes = []
             stack_notes.append(
                 f"Failed to create {coord_var_name} dimension coordinate:\n"
                 f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead."
             )
-            problem.stack_trace.__notes__ = stack_notes
+            problem.stack_trace.__notes__ = stack_notes  # type: ignore[attr-defined]
             problem.handled = True
 
         _ = _add_or_capture(
diff --git a/lib/iris/fileformats/cf.py b/lib/iris/fileformats/cf.py
index 2b6568c315..e734ac04bd 100644
--- a/lib/iris/fileformats/cf.py
+++ b/lib/iris/fileformats/cf.py
@@ -15,6 +15,7 @@
 """
 
 from abc import ABCMeta, abstractmethod
+import codecs
 from collections.abc import Iterable, MutableMapping
 import os
 import re
@@ -89,6 +90,11 @@ def __init__(self, name, data):
         self.cf_data = data
         """NetCDF4 Variable data instance."""
 
+        # Note: *always* disable netcdf4-python's encoding/decoding translations,
+        # to avoid currently known problems.
+        # See https://github.com/Unidata/netcdf4-python/issues/1440
+        data.set_auto_chartostring(False)
+        # ALSO NOTE: not stored, so NetCDFDataProxy must re-assert this when re-loading.
 
         """File source of the NetCDF content."""
         try:
@@ -802,13 +808,42 @@ def cf_label_data(self, cf_data_var):
 
         label_data = self[:]
         if ma.isMaskedArray(label_data):
-            label_data = label_data.filled()
+            label_data = label_data.filled(b"\0")
+
+        default_encoding = "utf-8"
+        encoding = getattr(self, "_Encoding", None)
+        if encoding is None:
+            # utf-8 is a reasonable "safe" default, equivalent to "ascii" for ascii data.
+            encoding = default_encoding
+        else:
+            try:
+                # Accept + normalise the naming of encodings.
+                encoding = codecs.lookup(encoding).name
+                # NOTE: if the encoding does not suit the data, errors can still occur:
+                # for example, _Encoding = "ascii" with non-ascii content.
+            except LookupError:
+                # Replace an invalid setting with the "safe"(ish) fallback.
+                encoding = default_encoding
+
+        def string_from_1d_bytearray(array, encoding):
+            r"""Join + decode a 1-d character array, since numpy bytes arrays behave oddly.
+
+            Elements which "should" contain a zero byte b'\0' instead appear to contain
+            an *empty* byte b'', so a plain "b''.join()" would *omit* any zero bytes.
+            """
+            assert array.dtype.kind == "S" and array.dtype.itemsize == 1
+            assert array.ndim == 1
+            bytelist = [b"\0" if byte == b"" else byte for byte in array]
+            byte_string = b"".join(bytelist)
+            assert len(byte_string) == array.shape[0]
+            # N.B. this raises UnicodeDecodeError if the encoding does not suit the data.
+            string = byte_string.decode(encoding=encoding)
+            return string.strip()
 
         # Determine whether we have a string-valued scalar label
         # i.e. a character variable that only has one dimension (the length of the string).
         if self.ndim == 1:
-            label_string = b"".join(label_data).strip()
-            label_string = label_string.decode("utf8")
+            label_string = string_from_1d_bytearray(label_data, encoding)
             data = np.array([label_string])
         else:
             # Determine the index of the string dimension.
@@ -829,9 +864,10 @@ def cf_label_data(self, cf_data_var):
                 label_index = (slice(None, None),) + index
             else:
                 label_index = index + (slice(None, None),)
 
-            label_string = b"".join(label_data[label_index]).strip()
-            label_string = label_string.decode("utf8")
-            data[index] = label_string
+            label_string = string_from_1d_bytearray(
+                label_data[label_index], encoding
+            )
+            data[index] = label_string.strip()
 
     return data
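Aside: the numpy behaviour that `string_from_1d_bytearray` works around is easy to reproduce in isolation. A minimal sketch, plain numpy with no Iris involved:

    import numpy as np

    # In a single-character bytes array, numpy reports zero bytes b"\0" as
    # *empty* bytes b"" (trailing nulls are trimmed from "S"-kind elements).
    chars = np.array([b"M", b"\0", b"x"], dtype="S1")
    print(list(chars))  # -> [b'M', b'', b'x']

    # So a plain join silently drops them, shortening the result ...
    print(b"".join(chars))  # -> b'Mx'

    # ... whereas re-instating b"\0" preserves every byte position, which is
    # what the helper relies on before decoding.
    print(b"".join(b"\0" if c == b"" else c for c in chars))  # -> b'M\x00x'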
diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
index 35588eb2c4..e982cb3acd 100644
--- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py
+++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -336,6 +336,11 @@ def __getitem__(self, keys):
         dataset = netCDF4.Dataset(self.path)
         try:
             variable = dataset.variables[self.variable_name]
+            # ALWAYS disable byte encoding/decoding, to avoid currently known
+            # problems.
+            # See https://github.com/Unidata/netcdf4-python/issues/1440
+            variable.set_auto_chartostring(False)
+
             # Get the NetCDF variable data and slice.
             var = variable[keys]
         finally:
diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py
index 5177749c07..129427289c 100644
--- a/lib/iris/fileformats/netcdf/saver.py
+++ b/lib/iris/fileformats/netcdf/saver.py
@@ -990,12 +990,12 @@ def _add_aux_coords(
         ]
 
         # Include any relevant mesh location coordinates.
-        mesh: MeshXY | None = getattr(cube, "mesh")
-        mesh_location: str | None = getattr(cube, "location")
+        mesh: MeshXY | None = getattr(cube, "mesh")  # type: ignore[annotation-unchecked]
+        mesh_location: str | None = getattr(cube, "location")  # type: ignore[annotation-unchecked]
         if mesh and mesh_location:
             location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr(
                 mesh, f"{mesh_location}_coords"
-            )
+            )  # type: ignore[annotation-unchecked]
             coords_to_add.extend(list(location_coords))
 
         return self._add_inner_related_vars(
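Aside: the same `set_auto_chartostring(False)` call appears both in `CFVariable.__init__` (above, in cf.py) and here in `NetCDFDataProxy.__getitem__`, because the setting lives on the `netCDF4.Variable` object rather than in the file: a proxy that re-opens the dataset gets a fresh variable with the default behaviour. A minimal sketch of the intended read path, assuming a hypothetical file `example.nc` with a char variable `v` carrying an `_Encoding` attribute (names illustrative only):

    import netCDF4

    ds = netCDF4.Dataset("example.nc")
    v = ds.variables["v"]

    # By default, netcdf4-python auto-decodes char arrays to strings via the
    # _Encoding attribute, which can raise UnicodeDecodeError on mismatched
    # content (https://github.com/Unidata/netcdf4-python/issues/1440).
    # With the conversion disabled, reads return the raw byte array instead,
    # leaving Iris to decode via cf_label_data:
    v.set_auto_chartostring(False)
    raw = v[:]  # numpy character array, dtype "S1"
    ds.close()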
diff --git a/lib/iris/tests/integration/netcdf/test_chararrays.py b/lib/iris/tests/integration/netcdf/test_chararrays.py
new file mode 100644
index 0000000000..a3ce9f9128
--- /dev/null
+++ b/lib/iris/tests/integration/netcdf/test_chararrays.py
@@ -0,0 +1,82 @@
+"""Integration tests: loading netCDF character arrays with assorted encodings."""
+
+import netCDF4 as nc
+import numpy as np
+import pytest
+
+import iris
+
+NX, N_STRLEN = 3, 64
+TEST_STRINGS = ["Münster", "London", "Amsterdam"]
+TEST_COORD_VALS = ["bun", "éclair", "sandwich"]
+
+# Toggle to exercise loading with / without a character auxiliary coordinate.
+INCLUDE_COORD = True
+
+
+def convert_chararray(string_array_1d, maxlen, encoding="utf-8"):
+    """Encode strings + pack them into a (n-strings, maxlen) character array."""
+    bbytes = [text.encode(encoding) for text in string_array_1d]
+    pad = b"\0" * maxlen
+    bbytes = [(x + pad)[:maxlen] for x in bbytes]
+    chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes])
+    return chararray
+
+
+def make_testfile(filepath, chararray, coordarray, encoding_str=None):
+    """Create a netCDF file with a char data variable + optional char coord."""
+    with nc.Dataset(filepath, "w") as ds:
+        ds.createDimension("x", NX)
+        ds.createDimension("nstr", N_STRLEN)
+        vx = ds.createVariable("x", int, dimensions=("x",))
+        vx[:] = np.arange(NX)
+        if INCLUDE_COORD:
+            ds.createDimension("nstr2", N_STRLEN)
+            v_co = ds.createVariable("v_co", "S1", dimensions=("x", "nstr2"))
+            v_co[:] = coordarray
+            if encoding_str is not None:
+                v_co._Encoding = encoding_str
+        v = ds.createVariable("v", "S1", dimensions=("x", "nstr"))
+        v[:] = chararray
+        if encoding_str is not None:
+            v._Encoding = encoding_str
+        if INCLUDE_COORD:
+            v.coordinates = "v_co"
+
+
+def show_result(filepath):
+    """Load the file as an Iris cube and show the outcome (smoke test only)."""
+    print(f"File {filepath}")
+    print("\nAs iris cube...")
+    try:
+        cube = iris.load_cube(filepath)
+        print(cube)
+        if iris.loading.LOAD_PROBLEMS._problems:
+            print(iris.loading.LOAD_PROBLEMS)
+            print(
+                "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format())
+            )
+        print("-data-")
+        print(repr(cube.data))
+        if INCLUDE_COORD:
+            print("-coord data-")
+            try:
+                print(repr(cube.coord("v_co").points))
+            except Exception as err2:
+                print(repr(err2))
+    except UnicodeDecodeError as err:
+        print(repr(err))
+
+
+@pytest.mark.parametrize("encoding", (None, "ascii", "utf-8", "utf-32"))
+def test_encodings(encoding, tmp_path):
+    print(f"\n=========\nTesting encoding: {encoding}")
+    filepath = str(tmp_path / f"tmp_{encoding}.nc")
+    # Only utf-32 content needs its own byte encoding: all others store utf-8 bytes.
+    do_as = encoding
+    if encoding != "utf-32":
+        do_as = "utf-8"
+    char_array = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as)
+    coord_array = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as)
+    make_testfile(filepath, char_array, coord_array, encoding_str=encoding)
+    show_result(filepath)
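Aside: for reference, the character-array layout that `convert_chararray` builds, sketched for a single string with an illustrative width of 10 (the test itself uses N_STRLEN = 64):

    import numpy as np

    encoded = "Münster".encode("utf-8")  # 8 bytes: "ü" occupies two
    padded = (encoded + b"\0" * 10)[:10]  # pad with zero bytes to fixed width
    row = np.array([padded[i : i + 1] for i in range(10)])
    print(row.dtype, row.shape)  # -> |S1 (10,)
    print(list(row))
    # -> [b'M', b'\xc3', b'\xbc', b'n', b's', b't', b'e', b'r', b'', b'']

The two trailing pad bytes print as b'' — the same numpy quirk that the loader-side helper in cf.py compensates for.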