Draft
Changes from all commits
8 changes: 4 additions & 4 deletions lib/iris/fileformats/_nc_load_rules/helpers.py
@@ -708,13 +708,13 @@ def build_and_add_global_attributes(engine: Engine):
             ),
         )
         if problem is not None:
-            stack_notes = problem.stack_trace.__notes__
+            stack_notes = problem.stack_trace.__notes__  # type: ignore[attr-defined]
             if stack_notes is None:
                 stack_notes = []
             stack_notes.append(
                 f"Skipping disallowed global attribute '{attr_name}' (see above error)"
             )
-            problem.stack_trace.__notes__ = stack_notes
+            problem.stack_trace.__notes__ = stack_notes  # type: ignore[attr-defined]
 
 
 ################################################################################
@@ -1536,14 +1536,14 @@ def build_and_add_dimension_coordinate(
     )
     if problem is not None:
         coord_var_name = str(cf_coord_var.cf_name)
-        stack_notes = problem.stack_trace.__notes__
+        stack_notes = problem.stack_trace.__notes__  # type: ignore[attr-defined]
         if stack_notes is None:
             stack_notes = []
         stack_notes.append(
             f"Failed to create {coord_var_name} dimension coordinate:\n"
             f"Gracefully creating {coord_var_name!r} auxiliary coordinate instead."
         )
-        problem.stack_trace.__notes__ = stack_notes
+        problem.stack_trace.__notes__ = stack_notes  # type: ignore[attr-defined]
         problem.handled = True
 
     _ = _add_or_capture(
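For context on the two hunks above: problem.stack_trace is a traceback.TracebackException (the new test file below calls .format() on it), and __notes__ is the PEP 678 notes list, which .format() renders after the traceback text but which typeshed does not declare, hence the new type: ignore[attr-defined] comments. A minimal sketch of the mechanism, assuming Python 3.11+ (names and the attribute string are illustrative, not PR code):

import traceback

try:
    raise ValueError("disallowed attribute")  # stand-in for a captured load failure
except ValueError as exc:
    tb_exc = traceback.TracebackException.from_exception(exc)

# Append a note in the same guarded way as the hunks above:
notes = getattr(tb_exc, "__notes__", None)
if notes is None:
    notes = []
notes.append("Skipping disallowed global attribute 'conventions' (see above error)")
tb_exc.__notes__ = notes

print("".join(tb_exc.format()))  # the note appears after the traceback text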
55 changes: 49 additions & 6 deletions lib/iris/fileformats/cf.py
@@ -15,6 +15,7 @@
 """
 
 from abc import ABCMeta, abstractmethod
+import codecs
 from collections.abc import Iterable, MutableMapping
 import os
 import re
@@ -89,6 +90,11 @@ def __init__(self, name, data):
 
         self.cf_data = data
         """NetCDF4 Variable data instance."""
+        # Note: *always* disable char/string encoding+decoding translations,
+        # to avoid currently-known problems.
+        # See https://github.com/Unidata/netcdf4-python/issues/1440
+        data.set_auto_chartostring(False)
+        # ALSO NOTE: not stored, so NetCDFDataProxy must re-assert this when re-loading.
 
         """File source of the NetCDF content."""
         try:
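For context: with netCDF4-python's default auto-conversion, reading an "S1" char variable collapses the trailing string dimension and decodes via the variable's _Encoding attribute, which is where the linked issue apparently causes trouble. A rough sketch of reading the raw bytes instead (the file and variable names are hypothetical):

import netCDF4

ds = netCDF4.Dataset("chars.nc")  # hypothetical file containing a char variable "v"
var = ds.variables["v"]
var.set_auto_chartostring(False)  # keep the raw "S1" bytes, skip _Encoding handling
raw = var[:]
print(raw.dtype, raw.shape)  # e.g. |S1 (3, 64): one byte per element
ds.close()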
@@ -802,13 +808,49 @@ def cf_label_data(self, cf_data_var):
         label_data = self[:]
 
         if ma.isMaskedArray(label_data):
-            label_data = label_data.filled()
+            label_data = label_data.filled(b"\0")
 
+        default_encoding = "utf-8"
+        encoding = getattr(self, "_Encoding", None)
+        if encoding is None:
+            # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data.
+            encoding = default_encoding
+        else:
+            try:
+                # Accept + normalise the naming of encodings.
+                encoding = codecs.lookup(encoding).name
+                # NOTE: if the encoding does not suit the data, errors can still occur:
+                # for example, _Encoding = "ascii" with non-ascii content.
+            except LookupError:
+                # Replace an invalid setting with the "safe"(ish) fallback.
+                encoding = default_encoding
+
+        def string_from_1d_bytearray(array, encoding):
+            r"""Join a 1-D "S1" byte array into a decoded, stripped string.
+
+            Needed because numpy bytes arrays behave very oddly: elements which
+            "should" contain a zero byte b'\0' instead appear to contain an
+            *empty* byte b'', so a plain b"".join() would *omit* any zero bytes.
+            """
+            assert array.dtype.kind == "S" and array.dtype.itemsize == 1
+            assert array.ndim == 1
+            bytelist = [b"\0" if byte == b"" else byte for byte in array]
+            joined = b"".join(bytelist)
+            assert len(joined) == array.shape[0]
+            return joined.decode(encoding=encoding).strip()
+
         # Determine whether we have a string-valued scalar label,
         # i.e. a character variable that only has one dimension (the length of the string).
         if self.ndim == 1:
-            label_string = b"".join(label_data).strip()
-            label_string = label_string.decode("utf8")
+            label_string = string_from_1d_bytearray(label_data, encoding)
             data = np.array([label_string])
         else:
             # Determine the index of the string dimension.
@@ -829,9 +871,10 @@ def cf_label_data(self, cf_data_var):
                 else:
                     label_index = index + (slice(None, None),)
 
-                label_string = b"".join(label_data[label_index]).strip()
-                label_string = label_string.decode("utf8")
-                data[index] = label_string
+                label_string = string_from_1d_bytearray(
+                    label_data[label_index], encoding
+                )
+                data[index] = label_string.strip()
 
         return data

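Two behaviours handled by the new code above, illustrated as standalone sketches (not PR code). codecs.lookup() normalises encoding aliases and raises LookupError for unknown names, which triggers the utf-8 fallback; and numpy "S1" arrays report a stored zero byte as an empty byte, which is why string_from_1d_bytearray re-pads before joining:

import codecs
import numpy as np

# Alias normalisation, and the LookupError fallback path:
print(codecs.lookup("UTF8").name)  # prints "utf-8"
try:
    codecs.lookup("not-a-codec")
except LookupError:
    print("unknown encoding: fall back to utf-8")

# The numpy oddity: a zero byte reads back as an empty byte, so a
# plain join silently drops it and shifts the following characters.
arr = np.array([b"a", b"\0", b"b"], dtype="S1")
print(arr[1])  # b'' : the zero byte is not preserved
print(b"".join(arr))  # b'ab' : two bytes where there should be three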
5 changes: 5 additions & 0 deletions lib/iris/fileformats/netcdf/_thread_safe_nc.py
@@ -336,6 +336,11 @@ def __getitem__(self, keys):
         dataset = netCDF4.Dataset(self.path)
         try:
             variable = dataset.variables[self.variable_name]
+            # ALWAYS disable char/string encoding+decoding, to avoid
+            # currently-known problems.
+            # See https://github.com/Unidata/netcdf4-python/issues/1440
+            variable.set_auto_chartostring(False)
+
             # Get the NetCDF variable data and slice.
             var = variable[keys]
         finally:
6 changes: 3 additions & 3 deletions lib/iris/fileformats/netcdf/saver.py
@@ -990,12 +990,12 @@ def _add_aux_coords(
         ]
 
         # Include any relevant mesh location coordinates.
-        mesh: MeshXY | None = getattr(cube, "mesh")
-        mesh_location: str | None = getattr(cube, "location")
+        mesh: MeshXY | None = getattr(cube, "mesh")  # type: ignore[annotation-unchecked]
+        mesh_location: str | None = getattr(cube, "location")  # type: ignore[annotation-unchecked]
         if mesh and mesh_location:
             location_coords: MeshNodeCoords | MeshEdgeCoords | MeshFaceCoords = getattr(
                 mesh, f"{mesh_location}_coords"
-            )
+            )  # type: ignore[annotation-unchecked]
             coords_to_add.extend(list(location_coords))
 
         return self._add_inner_related_vars(
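For reference, mypy's annotation-unchecked code, which the new ignores above target: when a function signature carries no annotations, mypy does not type-check the body, and it emits a note for any annotated locals inside it. A hedged illustration (the function is hypothetical, not iris code):

def load_mesh(cube):  # unannotated signature, so the body is not type-checked
    # mypy note: "By default the bodies of untyped functions are not checked"
    # with error code [annotation-unchecked]
    mesh: object | None = getattr(cube, "mesh")
    return mesh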
113 changes: 113 additions & 0 deletions lib/iris/tests/integration/netcdf/test_chararrays.py
@@ -0,0 +1,113 @@
import netCDF4 as nc
import numpy as np
import pytest

import iris

NX, N_STRLEN = 3, 64
TEST_STRINGS = ["Münster", "London", "Amsterdam"]
TEST_COORD_VALS = ["bun", "éclair", "sandwich"]


def convert_chararray(string_array_1d, maxlen, encoding="utf-8"):
    """Encode each string and NUL-pad it, giving an (n, maxlen) "S1" char array."""
bbytes = [text.encode(encoding) for text in string_array_1d]
pad = b"\0" * maxlen
bbytes = [(x + pad)[:maxlen] for x in bbytes]
chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes])
return chararray


# Toggle to exercise loading with/without the auxiliary string coordinate.
INCLUDE_COORD = True


def make_testfile(filepath, chararray, coordarray, encoding_str=None):
with nc.Dataset(filepath, "w") as ds:
ds.createDimension("x", NX)
ds.createDimension("nstr", N_STRLEN)
vx = ds.createVariable("x", int, dimensions=("x",))
vx[:] = np.arange(NX)
if INCLUDE_COORD:
ds.createDimension("nstr2", N_STRLEN)
v_co = ds.createVariable(
"v_co",
"S1",
dimensions=(
"x",
"nstr2",
),
)
v_co[:] = coordarray
if encoding_str is not None:
v_co._Encoding = encoding_str
v = ds.createVariable(
"v",
"S1",
dimensions=(
"x",
"nstr",
),
)
v[:] = chararray
if encoding_str is not None:
v._Encoding = encoding_str
if INCLUDE_COORD:
v.coordinates = "v_co"


def show_result(filepath):
    # NOTE: pp_utils is a local debugging helper, not part of iris.
    from pp_utils import ncdump

print(f"File {filepath}")
print("NCDUMP:")
ncdump(filepath, "")
print("\nAs iris cube..")
try:
cube = iris.load_cube(filepath)
print(cube)
if iris.loading.LOAD_PROBLEMS._problems:
print(iris.loading.LOAD_PROBLEMS)
print(
"\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format())
)
print("-data-")
print(repr(cube.data))
if INCLUDE_COORD:
print("-coord data-")
try:
print(repr(cube.coord("v_co").points))
except Exception as err2:
print(repr(err2))
except UnicodeDecodeError as err:
print(repr(err))


ENCODINGS = ("utf-8", "ascii", "utf-8")


@pytest.mark.parametrize("encoding", ENCODINGS)
def test_encodings(encoding, tmp_path):
    print(f"\n=========\nTesting encoding: {encoding}")
    filepath = str(tmp_path / f"tmp_{encoding}.nc")
    # Content is encoded as utf-8 unless the target encoding is utf-32.
    do_as = encoding if encoding == "utf-32" else "utf-8"
    chararray = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as)
    coordarray = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as)
    make_testfile(filepath, chararray, coordarray, encoding_str=encoding)
    show_result(filepath)