From 8534cadfb0d3184f1faa9ce8dbe0ea69fdca4f84 Mon Sep 17 00:00:00 2001 From: Joshua Gould Date: Thu, 11 Sep 2025 10:14:58 -0400 Subject: [PATCH 1/4] Fix for encoding objects for zarr<3 --- xarray/backends/zarr.py | 14 ++++++++++++++ xarray/tests/test_backends.py | 12 +++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f0578ca9352..773f9f218e9 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1130,6 +1130,20 @@ def _create_new_array( if c in encoding: encoding["config"][c] = encoding.pop(c) + else: + from zarr.util import normalize_dtype + + _, object_codec = normalize_dtype(dtype, None) + + if object_codec is not None: + existing_filters = encoding.get("filters") + if ( + existing_filters is not None + and len(existing_filters) == 1 + and existing_filters[0] == object_codec + ): + del encoding["filters"] + zarr_array = self.zarr_group.create( name, shape=shape, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 0f4debada29..0d8c1dd01c9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -121,7 +121,6 @@ except ImportError: pass - if has_zarr: import zarr import zarr.codecs @@ -3075,6 +3074,17 @@ def test_zarr_mode_w_overwrites_encoding(self) -> None: zarr.open_group(store, **self.version_kwargs)["foo"], data.foo.data ) + def test_object_codec(self) -> None: + data = xr.DataArray( + data=np.zeros((2, 2)), + dims=["x", "y"], + coords=dict(y=np.array(["a", "b"], dtype=object)), + ) + with create_tmp_file() as path1: + data.to_zarr(path1, mode="w") + data = xr.open_zarr(path1) + data.to_zarr(path1, mode="w") + def test_encoding_kwarg_fixed_width_string(self) -> None: # not relevant for zarr, since we don't use EncodedStringCoder pass From 9b9ca2b14831b1ccd9637098fbcdb17438705042 Mon Sep 17 00:00:00 2001 From: Joshua Gould Date: Mon, 6 Oct 2025 13:53:34 -0400 Subject: [PATCH 2/4] Fix for encoding objects for zarr<3 Resolved conflicts with main --- xarray/tests/test_backends.py | 326 +++++++++++++--------------------- 1 file changed, 119 insertions(+), 207 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 41454f668ba..40484ec7142 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -89,7 +89,6 @@ requires_fsspec, requires_h5netcdf, requires_h5netcdf_1_4_0_or_above, - requires_h5netcdf_or_netCDF4, requires_h5netcdf_ros3, requires_iris, requires_netcdf, @@ -116,12 +115,12 @@ with contextlib.suppress(ImportError): import netCDF4 as nc4 -with contextlib.suppress(ImportError): +try: import dask import dask.array as da +except ImportError: + pass -with contextlib.suppress(ImportError): - import fsspec if has_zarr: import zarr @@ -193,6 +192,7 @@ def _check_compression_codec_available(codec: str | None) -> bool: try: import os + import tempfile import netCDF4 @@ -626,17 +626,22 @@ def test_dataset_compute(self) -> None: def test_pickle(self) -> None: expected = Dataset({"foo": ("x", [42])}) with self.roundtrip(expected, allow_cleanup_failure=ON_WINDOWS) as roundtripped: - # Windows doesn't like reopening an already open file - raw_pickle = pickle.dumps(roundtripped) + with roundtripped: + # Windows doesn't like reopening an already open file + raw_pickle = pickle.dumps(roundtripped) with pickle.loads(raw_pickle) as unpickled_ds: assert_identical(expected, unpickled_ds) + @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_pickle_dataarray(self) -> 
None: expected = Dataset({"foo": ("x", [42])}) with self.roundtrip(expected, allow_cleanup_failure=ON_WINDOWS) as roundtripped: - raw_pickle = pickle.dumps(roundtripped["foo"]) - with pickle.loads(raw_pickle) as unpickled: - assert_identical(expected["foo"], unpickled) + with roundtripped: + raw_pickle = pickle.dumps(roundtripped["foo"]) + # TODO: figure out how to explicitly close the file for the + # unpickled DataArray? + unpickled = pickle.loads(raw_pickle) + assert_identical(expected["foo"], unpickled) def test_dataset_caching(self) -> None: expected = Dataset({"foo": ("x", [5, 6, 7])}) @@ -652,6 +657,7 @@ def test_dataset_caching(self) -> None: _ = actual.foo.values # no caching assert not actual.foo.variable._in_memory + @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_roundtrip_None_variable(self) -> None: expected = Dataset({None: (("x", "y"), [[0, 1], [2, 3]])}) with self.roundtrip(expected) as actual: @@ -2111,7 +2117,7 @@ def test_encoding_enum__no_fill_value(self, recwarn): fill_value=None, ) v[:] = 1 - with open_dataset(tmp_file, engine="netcdf4") as original: + with open_dataset(tmp_file) as original: save_kwargs = {} # We don't expect any errors. # This is effectively a void context manager @@ -2163,7 +2169,7 @@ def test_encoding_enum__multiple_variable_with_enum(self): "time", fill_value=255, ) - with open_dataset(tmp_file, engine="netcdf4") as original: + with open_dataset(tmp_file) as original: save_kwargs = {} if self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above: save_kwargs["invalid_netcdf"] = True @@ -2212,7 +2218,7 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): "time", fill_value=255, ) - with open_dataset(tmp_file, engine="netcdf4") as original: + with open_dataset(tmp_file) as original: assert ( original.clouds.encoding["dtype"].metadata == original.tifa.encoding["dtype"].metadata @@ -2487,70 +2493,6 @@ def test_deepcopy(self) -> None: assert_identical(expected, copied) -class InMemoryNetCDF: - engine: T_NetcdfEngine | None - - def test_roundtrip_via_memoryview(self) -> None: - original = create_test_data() - result = original.to_netcdf(engine=self.engine) - roundtrip = load_dataset(result, engine=self.engine) - assert_identical(roundtrip, original) - - def test_roundtrip_via_bytes(self) -> None: - original = create_test_data() - result = bytes(original.to_netcdf(engine=self.engine)) - roundtrip = load_dataset(result, engine=self.engine) - assert_identical(roundtrip, original) - - def test_pickle_open_dataset_from_bytes(self) -> None: - original = Dataset({"foo": ("x", [1, 2, 3])}) - netcdf_bytes = bytes(original.to_netcdf(engine=self.engine)) - with open_dataset(netcdf_bytes, engine=self.engine) as roundtrip: - with pickle.loads(pickle.dumps(roundtrip)) as unpickled: - assert_identical(unpickled, original) - - def test_compute_false(self) -> None: - original = create_test_data() - with pytest.raises( - NotImplementedError, - match=re.escape("to_netcdf() with compute=False is not yet implemented"), - ): - original.to_netcdf(engine=self.engine, compute=False) - - -class InMemoryNetCDFWithGroups(InMemoryNetCDF): - def test_roundtrip_group_via_memoryview(self) -> None: - original = create_test_data() - netcdf_bytes = original.to_netcdf(group="sub", engine=self.engine) - roundtrip = load_dataset(netcdf_bytes, group="sub", engine=self.engine) - assert_identical(roundtrip, original) - - -class FileObjectNetCDF: - engine: T_NetcdfEngine - - def test_file_remains_open(self) -> None: - data = 
Dataset({"foo": ("x", [1, 2, 3])}) - f = BytesIO() - data.to_netcdf(f, engine=self.engine) - assert not f.closed - restored = open_dataset(f, engine=self.engine) - assert not f.closed - assert_identical(restored, data) - restored.close() - assert not f.closed - - -@requires_h5netcdf_or_netCDF4 -class TestGenericNetCDF4InMemory(InMemoryNetCDFWithGroups): - engine = None - - -@requires_netCDF4 -class TestNetCDF4InMemory(InMemoryNetCDFWithGroups): - engine: T_NetcdfEngine = "netcdf4" - - @requires_netCDF4 @requires_dask @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @@ -3133,17 +3075,6 @@ def test_zarr_mode_w_overwrites_encoding(self) -> None: zarr.open_group(store, **self.version_kwargs)["foo"], data.foo.data ) - def test_object_codec(self) -> None: - data = xr.DataArray( - data=np.zeros((2, 2)), - dims=["x", "y"], - coords=dict(y=np.array(["a", "b"], dtype=object)), - ) - with create_tmp_file() as path1: - data.to_zarr(path1, mode="w") - data = xr.open_zarr(path1) - data.to_zarr(path1, mode="w") - def test_encoding_kwarg_fixed_width_string(self) -> None: # not relevant for zarr, since we don't use EncodedStringCoder pass @@ -3454,14 +3385,6 @@ def test_write_region(self, consolidated, compute, use_dask, write_empty) -> Non ) as actual: assert_identical(actual, nonzeros) - def test_region_scalar(self) -> None: - ds = Dataset({"x": 0}) - with self.create_zarr_target() as store: - ds.to_zarr(store) - ds.to_zarr(store, region={}, mode="r+") - with xr.open_zarr(store) as actual: - assert_identical(actual, ds) - @pytest.mark.parametrize("mode", [None, "r+", "a"]) def test_write_region_mode(self, mode) -> None: zeros = Dataset({"u": (("x",), np.zeros(10))}) @@ -3806,6 +3729,17 @@ def test_zarr_fill_value_setting(self, dtype): # ``raise_on_invalid=vn in check_encoding_set`` line in zarr.py # ds.foo.encoding["fill_value"] = fv + def test_object_codec(self) -> None: + data = xr.DataArray( + data=np.zeros((2, 2)), + dims=["x", "y"], + coords=dict(y=np.array(["a", "b"], dtype=object)), + ) + with create_tmp_file() as path1: + data.to_zarr(path1, mode="w") + data = xr.open_zarr(path1) + data.to_zarr(path1, mode="w") + @requires_zarr @pytest.mark.skipif( @@ -4399,23 +4333,6 @@ def roundtrip_dir( ) as ds: yield ds - @requires_dask - def test_default_zarr_fill_value(self): - inputs = xr.Dataset({"floats": ("x", [1.0]), "ints": ("x", [1])}).chunk() - expected = xr.Dataset({"floats": ("x", [np.nan]), "ints": ("x", [0])}) - with self.temp_dir() as (d, store): - inputs.to_zarr(store, compute=False) - with open_dataset(store) as on_disk: - assert np.isnan(on_disk.variables["floats"].encoding["_FillValue"]) - assert ( - "_FillValue" not in on_disk.variables["ints"].encoding - ) # use default - if not has_zarr_v3: - # zarr-python v2 interprets fill_value=None inconsistently - del on_disk["ints"] - del expected["ints"] - assert_identical(expected, on_disk) - @pytest.mark.parametrize("consolidated", [True, False, None]) @pytest.mark.parametrize("write_empty", [True, False, None]) def test_write_empty( @@ -4454,13 +4371,14 @@ def assert_expected_files(expected: list[str], store: str) -> None: "0.1.1", ] - # use nan for default fill_value behaviour - data = np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)) - if zarr_format_3: + data = np.array([0.0, 0, 1.0, 0]).reshape((1, 2, 2)) # transform to the path style of zarr 3 # e.g. 
0/0/1 expected = [e.replace(".", "/") for e in expected] + else: + # use nan for default fill_value behaviour + data = np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)) ds = xr.Dataset(data_vars={"test": (("Z", "Y", "X"), data)}) @@ -4589,7 +4507,7 @@ def test_zarr_version_deprecated() -> None: @requires_scipy -class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only, InMemoryNetCDF): +class TestScipyInMemoryData(NetCDF3Only, CFEncodedBase): engine: T_NetcdfEngine = "scipy" @contextlib.contextmanager @@ -4597,26 +4515,38 @@ def create_store(self): fobj = BytesIO() yield backends.ScipyDataStore(fobj, "w") - @contextlib.contextmanager - def roundtrip( - self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False - ): - if save_kwargs is None: - save_kwargs = {} - if open_kwargs is None: - open_kwargs = {} - saved = self.save(data, path=None, **save_kwargs) - with self.open(saved, **open_kwargs) as ds: - yield ds - @pytest.mark.asyncio @pytest.mark.skip(reason="NetCDF backends don't support async loading") async def test_load_async(self) -> None: await super().test_load_async() + def test_to_netcdf_explicit_engine(self) -> None: + Dataset({"foo": 42}).to_netcdf(engine="scipy") + + def test_roundtrip_via_bytes(self) -> None: + original = create_test_data() + netcdf_bytes = original.to_netcdf(engine="scipy") + roundtrip = open_dataset(netcdf_bytes, engine="scipy") + assert_identical(roundtrip, original) + + def test_to_bytes_compute_false(self) -> None: + original = create_test_data() + with pytest.raises( + NotImplementedError, + match=re.escape("to_netcdf() with compute=False is not yet implemented"), + ): + original.to_netcdf(engine="scipy", compute=False) + + def test_bytes_pickle(self) -> None: + data = Dataset({"foo": ("x", [1, 2, 3])}) + fobj = data.to_netcdf(engine="scipy") + with self.open(fobj) as ds: + unpickled = pickle.loads(pickle.dumps(ds)) + assert_identical(unpickled, data) + @requires_scipy -class TestScipyFileObject(CFEncodedBase, NetCDF3Only, FileObjectNetCDF): +class TestScipyFileObject(NetCDF3Only, CFEncodedBase): # TODO: Consider consolidating some of these cases (e.g., # test_file_remains_open) with TestH5NetCDFFileObject engine: T_NetcdfEngine = "scipy" @@ -4641,18 +4571,27 @@ def roundtrip( with self.open(f, **open_kwargs) as ds: yield ds - @pytest.mark.asyncio - @pytest.mark.skip(reason="NetCDF backends don't support async loading") - async def test_load_async(self) -> None: - await super().test_load_async() + @pytest.mark.xfail( + reason="scipy.io.netcdf_file closes files upon garbage collection" + ) + def test_file_remains_open(self) -> None: + data = Dataset({"foo": ("x", [1, 2, 3])}) + f = BytesIO() + data.to_netcdf(f, engine="scipy") + assert not f.closed + restored = open_dataset(f, engine="scipy") + assert not f.closed + assert_identical(restored, data) + restored.close() + assert not f.closed @pytest.mark.skip(reason="cannot pickle file objects") def test_pickle(self) -> None: - super().test_pickle() + pass @pytest.mark.skip(reason="cannot pickle file objects") def test_pickle_dataarray(self) -> None: - super().test_pickle_dataarray() + pass @pytest.mark.parametrize("create_default_indexes", [True, False]) def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: @@ -4765,23 +4704,19 @@ def test_engine(self) -> None: with pytest.raises(ValueError, match=r"unrecognized engine"): data.to_netcdf("foo.nc", engine="foobar") # type: ignore[call-overload] + with pytest.raises( + ValueError, + match=re.escape( + "can only 
read bytes or file-like objects with engine='scipy' or 'h5netcdf'" + ), + ): + data.to_netcdf(engine="netcdf4") + with create_tmp_file() as tmp_file: data.to_netcdf(tmp_file) with pytest.raises(ValueError, match=r"unrecognized engine"): open_dataset(tmp_file, engine="foobar") - with pytest.raises( - TypeError, - match=re.escape("file objects are not supported by the netCDF4 backend"), - ): - data.to_netcdf(BytesIO(), engine="netcdf4") - - with pytest.raises( - TypeError, - match=re.escape("file objects are not supported by the netCDF4 backend"), - ): - open_dataset(BytesIO(), engine="netcdf4") - bytes_io = BytesIO() data.to_netcdf(bytes_io, engine="scipy") with pytest.raises(ValueError, match=r"unrecognized engine"): @@ -5119,7 +5054,7 @@ def test_deepcopy(self) -> None: @requires_h5netcdf -class TestH5NetCDFFileObject(TestH5NetCDFData, FileObjectNetCDF): +class TestH5NetCDFFileObject(TestH5NetCDFData): engine: T_NetcdfEngine = "h5netcdf" def test_open_badbytes(self) -> None: @@ -5128,10 +5063,8 @@ def test_open_badbytes(self) -> None: ): with open_dataset(b"garbage"): pass - with pytest.raises( - ValueError, match=r"not the signature of a valid netCDF4 file" - ): - with open_dataset(b"garbage", engine="h5netcdf"): + with pytest.raises(ValueError, match=r"can only read bytes"): + with open_dataset(b"garbage", engine="netcdf4"): pass with pytest.raises( ValueError, match=r"not the signature of a valid netCDF4 file" @@ -5141,12 +5074,13 @@ def test_open_badbytes(self) -> None: def test_open_twice(self) -> None: expected = create_test_data() + expected.attrs["foo"] = "bar" with create_tmp_file() as tmp_file: - expected.to_netcdf(tmp_file, engine=self.engine) + expected.to_netcdf(tmp_file, engine="h5netcdf") with open(tmp_file, "rb") as f: - with open_dataset(f, engine=self.engine): - with open_dataset(f, engine=self.engine): - pass # should not crash + with open_dataset(f, engine="h5netcdf"): + with open_dataset(f, engine="h5netcdf"): + pass @requires_scipy def test_open_fileobj(self) -> None: @@ -5181,24 +5115,31 @@ def test_open_fileobj(self) -> None: with open_dataset(f): # ensure file gets closed pass - @requires_fsspec - def test_fsspec(self) -> None: - expected = create_test_data() - with create_tmp_file() as tmp_file: - expected.to_netcdf(tmp_file, engine="h5netcdf") - - with fsspec.open(tmp_file, "rb") as f: - with open_dataset(f, engine="h5netcdf") as actual: - assert_identical(actual, expected) - - # fsspec.open() creates a pickleable file, unlike open() - with pickle.loads(pickle.dumps(actual)) as unpickled: - assert_identical(unpickled, expected) + def test_file_remains_open(self) -> None: + data = Dataset({"foo": ("x", [1, 2, 3])}) + f = BytesIO() + data.to_netcdf(f, engine="h5netcdf") + assert not f.closed + restored = open_dataset(f, engine="h5netcdf") + assert not f.closed + assert_identical(restored, data) + restored.close() + assert not f.closed @requires_h5netcdf -class TestH5NetCDFInMemoryData(InMemoryNetCDFWithGroups): - engine: T_NetcdfEngine = "h5netcdf" +class TestH5NetCDFInMemoryData: + def test_roundtrip_via_bytes(self) -> None: + original = create_test_data() + netcdf_bytes = original.to_netcdf(engine="h5netcdf") + roundtrip = open_dataset(netcdf_bytes, engine="h5netcdf") + assert_identical(roundtrip, original) + + def test_roundtrip_group_via_bytes(self) -> None: + original = create_test_data() + netcdf_bytes = original.to_netcdf(group="sub", engine="h5netcdf") + roundtrip = open_dataset(netcdf_bytes, group="sub", engine="h5netcdf") + assert_identical(roundtrip, 
original) @requires_h5netcdf @@ -5241,25 +5182,6 @@ def test_write_inconsistent_chunks(self) -> None: assert actual["y"].encoding["chunksizes"] == (100, 50) -@requires_netCDF4 -@requires_h5netcdf -def test_memoryview_write_h5netcdf_read_netcdf4() -> None: - original = create_test_data() - result = original.to_netcdf(engine="h5netcdf") - roundtrip = load_dataset(result, engine="netcdf4") - assert_identical(roundtrip, original) - - -@requires_netCDF4 -@requires_h5netcdf -def test_memoryview_write_netcdf4_read_h5netcdf() -> None: - original = create_test_data() - result = original.to_netcdf(engine="netcdf4") - roundtrip = load_dataset(result, engine="h5netcdf") - assert_identical(roundtrip, original) - - -@network @requires_h5netcdf_ros3 class TestH5NetCDFDataRos3Driver(TestCommon): engine: T_NetcdfEngine = "h5netcdf" @@ -5393,9 +5315,11 @@ def test_open_mfdataset_list_attr() -> None: """ Case when an attribute of type list differs across the multiple files """ + from netCDF4 import Dataset + with create_tmp_files(2) as nfiles: for i in range(2): - with nc4.Dataset(nfiles[i], "w") as f: + with Dataset(nfiles[i], "w") as f: f.createDimension("x", 3) vlvar = f.createVariable("test_var", np.int32, ("x")) # here create an attribute as a list @@ -6808,7 +6732,6 @@ def _assert_no_dates_out_of_range_warning(record): @requires_scipy_or_netCDF4 -@pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) def test_use_cftime_standard_calendar_default_in_range(calendar) -> None: x = [0, 1] @@ -6931,7 +6854,6 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: @requires_scipy_or_netCDF4 -@pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @pytest.mark.parametrize("calendar", ["standard", "gregorian"]) def test_use_cftime_false_standard_calendar_out_of_range(calendar) -> None: x = [0, 1] @@ -6944,13 +6866,12 @@ def test_use_cftime_false_standard_calendar_out_of_range(calendar) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) - decoder = CFDatetimeCoder(use_cftime=False) with pytest.raises((OutOfBoundsDatetime, ValueError)): + decoder = CFDatetimeCoder(use_cftime=False) open_dataset(tmp_file, decode_times=decoder) @requires_scipy_or_netCDF4 -@pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2000, 2500]) def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None: @@ -6964,8 +6885,8 @@ def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) - decoder = CFDatetimeCoder(use_cftime=False) with pytest.raises((OutOfBoundsDatetime, ValueError)): + decoder = CFDatetimeCoder(use_cftime=False) open_dataset(tmp_file, decode_times=decoder) @@ -7184,9 +7105,6 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.cdf") assert not entrypoint.guess_can_open("not-found-and-no-extension") - contents = ds.to_netcdf(engine="netcdf4") - _check_guess_can_open_and_open(entrypoint, contents, engine="netcdf4", expected=ds) - path = tmp_path / "baz" with open(path, "wb") as f: f.write(b"not-a-netcdf-file") @@ -7237,9 +7155,6 @@ def test_h5netcdf_entrypoint(tmp_path: Path) -> None: with open(path, "rb") as f: _check_guess_can_open_and_open(entrypoint, f, engine="h5netcdf", expected=ds) - contents = 
ds.to_netcdf(engine="h5netcdf")
-    _check_guess_can_open_and_open(entrypoint, contents, engine="h5netcdf", expected=ds)
-
     assert entrypoint.guess_can_open("something-local.nc")
     assert entrypoint.guess_can_open("something-local.nc4")
     assert entrypoint.guess_can_open("something-local.cdf")
@@ -7312,7 +7227,6 @@ def _create_nczarr(self, filename):
             # https://github.com/Unidata/netcdf-c/issues/2259
             ds = ds.drop_vars("dim3")

-        # engine="netcdf4" is not required for backwards compatibility
         ds.to_netcdf(f"file://{filename}#mode=nczarr")
         return ds

@@ -7366,6 +7280,8 @@ def test_zarr_closing_internal_zip_store():
 @requires_zarr
 @pytest.mark.parametrize("create_default_indexes", [True, False])
 def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
+    from xarray.core.indexes import PandasIndex
+
     store_path = tmp_path / "tmp.zarr"
     original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
     original_ds.to_zarr(store_path, mode="w")
@@ -7555,10 +7471,6 @@ def test_zarr_append_chunk_partial(self):
             mode="a",
         )

-    @pytest.mark.xfail(
-        ON_WINDOWS,
-        reason="Permission errors from Zarr: https://github.com/pydata/xarray/pull/10793",
-    )
     @requires_dask
     def test_zarr_region_chunk_partial_offset(self):
         # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545

From 007fbd75def99882c689bfecb1a9c3b46622d8e6 Mon Sep 17 00:00:00 2001
From: Joshua Gould
Date: Mon, 6 Oct 2025 13:57:58 -0400
Subject: [PATCH 3/4] use create_zarr_target

---
 xarray/tests/test_backends.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 40484ec7142..46198ccdf28 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -3735,10 +3735,10 @@ def test_object_codec(self) -> None:
             dims=["x", "y"],
             coords=dict(y=np.array(["a", "b"], dtype=object)),
         )
-        with create_tmp_file() as path1:
-            data.to_zarr(path1, mode="w")
-            data = xr.open_zarr(path1)
-            data.to_zarr(path1, mode="w")
+        with self.create_zarr_target() as store_target:
+            data.to_zarr(store_target, **self.version_kwargs)
+            data = xr.open_zarr(store_target, **self.version_kwargs)
+            data.to_zarr(store_target, **self.version_kwargs, mode="w")

From 356d37fac3ceb655e9db0d56276716c227b70e4f Mon Sep 17 00:00:00 2001
From: Joshua Gould
Date: Mon, 6 Oct 2025 14:23:05 -0400
Subject: [PATCH 4/4] Restore test_backends.py changes from main

The previous conflict resolution unintentionally reverted unrelated
test changes from main; this patch restores them.

---
 xarray/tests/test_backends.py | 304 ++++++++++++++++++++++------------
 1 file changed, 196 insertions(+), 108 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 46198ccdf28..811154e2451 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -89,6 +89,7 @@
     requires_fsspec,
     requires_h5netcdf,
     requires_h5netcdf_1_4_0_or_above,
+    requires_h5netcdf_or_netCDF4,
     requires_h5netcdf_ros3,
     requires_iris,
     requires_netcdf,
@@ -115,12 +116,12 @@
 with contextlib.suppress(ImportError):
     import netCDF4 as nc4

-try:
+with contextlib.suppress(ImportError):
     import dask
     import dask.array as da
-except ImportError:
-    pass

+with contextlib.suppress(ImportError):
+    import fsspec

 if has_zarr:
     import zarr
@@ -192,7 +193,6 @@ def _check_compression_codec_available(codec: str | None) -> bool:
     try:
         import os
-        import tempfile

         import netCDF4
@@ -626,22 +626,17 @@ def test_dataset_compute(self) -> None:
     def test_pickle(self) -> None:
         expected = Dataset({"foo": ("x", [42])})
         with self.roundtrip(expected, allow_cleanup_failure=ON_WINDOWS) as roundtripped:
-            with 
roundtripped: - # Windows doesn't like reopening an already open file - raw_pickle = pickle.dumps(roundtripped) + # Windows doesn't like reopening an already open file + raw_pickle = pickle.dumps(roundtripped) with pickle.loads(raw_pickle) as unpickled_ds: assert_identical(expected, unpickled_ds) - @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_pickle_dataarray(self) -> None: expected = Dataset({"foo": ("x", [42])}) with self.roundtrip(expected, allow_cleanup_failure=ON_WINDOWS) as roundtripped: - with roundtripped: - raw_pickle = pickle.dumps(roundtripped["foo"]) - # TODO: figure out how to explicitly close the file for the - # unpickled DataArray? - unpickled = pickle.loads(raw_pickle) - assert_identical(expected["foo"], unpickled) + raw_pickle = pickle.dumps(roundtripped["foo"]) + with pickle.loads(raw_pickle) as unpickled: + assert_identical(expected["foo"], unpickled) def test_dataset_caching(self) -> None: expected = Dataset({"foo": ("x", [5, 6, 7])}) @@ -657,7 +652,6 @@ def test_dataset_caching(self) -> None: _ = actual.foo.values # no caching assert not actual.foo.variable._in_memory - @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") def test_roundtrip_None_variable(self) -> None: expected = Dataset({None: (("x", "y"), [[0, 1], [2, 3]])}) with self.roundtrip(expected) as actual: @@ -2117,7 +2111,7 @@ def test_encoding_enum__no_fill_value(self, recwarn): fill_value=None, ) v[:] = 1 - with open_dataset(tmp_file) as original: + with open_dataset(tmp_file, engine="netcdf4") as original: save_kwargs = {} # We don't expect any errors. # This is effectively a void context manager @@ -2169,7 +2163,7 @@ def test_encoding_enum__multiple_variable_with_enum(self): "time", fill_value=255, ) - with open_dataset(tmp_file) as original: + with open_dataset(tmp_file, engine="netcdf4") as original: save_kwargs = {} if self.engine == "h5netcdf" and not has_h5netcdf_1_4_0_or_above: save_kwargs["invalid_netcdf"] = True @@ -2218,7 +2212,7 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): "time", fill_value=255, ) - with open_dataset(tmp_file) as original: + with open_dataset(tmp_file, engine="netcdf4") as original: assert ( original.clouds.encoding["dtype"].metadata == original.tifa.encoding["dtype"].metadata @@ -2493,6 +2487,70 @@ def test_deepcopy(self) -> None: assert_identical(expected, copied) +class InMemoryNetCDF: + engine: T_NetcdfEngine | None + + def test_roundtrip_via_memoryview(self) -> None: + original = create_test_data() + result = original.to_netcdf(engine=self.engine) + roundtrip = load_dataset(result, engine=self.engine) + assert_identical(roundtrip, original) + + def test_roundtrip_via_bytes(self) -> None: + original = create_test_data() + result = bytes(original.to_netcdf(engine=self.engine)) + roundtrip = load_dataset(result, engine=self.engine) + assert_identical(roundtrip, original) + + def test_pickle_open_dataset_from_bytes(self) -> None: + original = Dataset({"foo": ("x", [1, 2, 3])}) + netcdf_bytes = bytes(original.to_netcdf(engine=self.engine)) + with open_dataset(netcdf_bytes, engine=self.engine) as roundtrip: + with pickle.loads(pickle.dumps(roundtrip)) as unpickled: + assert_identical(unpickled, original) + + def test_compute_false(self) -> None: + original = create_test_data() + with pytest.raises( + NotImplementedError, + match=re.escape("to_netcdf() with compute=False is not yet implemented"), + ): + original.to_netcdf(engine=self.engine, compute=False) + + +class 
InMemoryNetCDFWithGroups(InMemoryNetCDF): + def test_roundtrip_group_via_memoryview(self) -> None: + original = create_test_data() + netcdf_bytes = original.to_netcdf(group="sub", engine=self.engine) + roundtrip = load_dataset(netcdf_bytes, group="sub", engine=self.engine) + assert_identical(roundtrip, original) + + +class FileObjectNetCDF: + engine: T_NetcdfEngine + + def test_file_remains_open(self) -> None: + data = Dataset({"foo": ("x", [1, 2, 3])}) + f = BytesIO() + data.to_netcdf(f, engine=self.engine) + assert not f.closed + restored = open_dataset(f, engine=self.engine) + assert not f.closed + assert_identical(restored, data) + restored.close() + assert not f.closed + + +@requires_h5netcdf_or_netCDF4 +class TestGenericNetCDF4InMemory(InMemoryNetCDFWithGroups): + engine = None + + +@requires_netCDF4 +class TestNetCDF4InMemory(InMemoryNetCDFWithGroups): + engine: T_NetcdfEngine = "netcdf4" + + @requires_netCDF4 @requires_dask @pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @@ -3385,6 +3443,14 @@ def test_write_region(self, consolidated, compute, use_dask, write_empty) -> Non ) as actual: assert_identical(actual, nonzeros) + def test_region_scalar(self) -> None: + ds = Dataset({"x": 0}) + with self.create_zarr_target() as store: + ds.to_zarr(store) + ds.to_zarr(store, region={}, mode="r+") + with xr.open_zarr(store) as actual: + assert_identical(actual, ds) + @pytest.mark.parametrize("mode", [None, "r+", "a"]) def test_write_region_mode(self, mode) -> None: zeros = Dataset({"u": (("x",), np.zeros(10))}) @@ -4333,6 +4399,23 @@ def roundtrip_dir( ) as ds: yield ds + @requires_dask + def test_default_zarr_fill_value(self): + inputs = xr.Dataset({"floats": ("x", [1.0]), "ints": ("x", [1])}).chunk() + expected = xr.Dataset({"floats": ("x", [np.nan]), "ints": ("x", [0])}) + with self.temp_dir() as (d, store): + inputs.to_zarr(store, compute=False) + with open_dataset(store) as on_disk: + assert np.isnan(on_disk.variables["floats"].encoding["_FillValue"]) + assert ( + "_FillValue" not in on_disk.variables["ints"].encoding + ) # use default + if not has_zarr_v3: + # zarr-python v2 interprets fill_value=None inconsistently + del on_disk["ints"] + del expected["ints"] + assert_identical(expected, on_disk) + @pytest.mark.parametrize("consolidated", [True, False, None]) @pytest.mark.parametrize("write_empty", [True, False, None]) def test_write_empty( @@ -4371,14 +4454,13 @@ def assert_expected_files(expected: list[str], store: str) -> None: "0.1.1", ] + # use nan for default fill_value behaviour + data = np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)) + if zarr_format_3: - data = np.array([0.0, 0, 1.0, 0]).reshape((1, 2, 2)) # transform to the path style of zarr 3 # e.g. 
0/0/1 expected = [e.replace(".", "/") for e in expected] - else: - # use nan for default fill_value behaviour - data = np.array([np.nan, np.nan, 1.0, np.nan]).reshape((1, 2, 2)) ds = xr.Dataset(data_vars={"test": (("Z", "Y", "X"), data)}) @@ -4507,7 +4589,7 @@ def test_zarr_version_deprecated() -> None: @requires_scipy -class TestScipyInMemoryData(NetCDF3Only, CFEncodedBase): +class TestScipyInMemoryData(CFEncodedBase, NetCDF3Only, InMemoryNetCDF): engine: T_NetcdfEngine = "scipy" @contextlib.contextmanager @@ -4515,38 +4597,26 @@ def create_store(self): fobj = BytesIO() yield backends.ScipyDataStore(fobj, "w") + @contextlib.contextmanager + def roundtrip( + self, data, save_kwargs=None, open_kwargs=None, allow_cleanup_failure=False + ): + if save_kwargs is None: + save_kwargs = {} + if open_kwargs is None: + open_kwargs = {} + saved = self.save(data, path=None, **save_kwargs) + with self.open(saved, **open_kwargs) as ds: + yield ds + @pytest.mark.asyncio @pytest.mark.skip(reason="NetCDF backends don't support async loading") async def test_load_async(self) -> None: await super().test_load_async() - def test_to_netcdf_explicit_engine(self) -> None: - Dataset({"foo": 42}).to_netcdf(engine="scipy") - - def test_roundtrip_via_bytes(self) -> None: - original = create_test_data() - netcdf_bytes = original.to_netcdf(engine="scipy") - roundtrip = open_dataset(netcdf_bytes, engine="scipy") - assert_identical(roundtrip, original) - - def test_to_bytes_compute_false(self) -> None: - original = create_test_data() - with pytest.raises( - NotImplementedError, - match=re.escape("to_netcdf() with compute=False is not yet implemented"), - ): - original.to_netcdf(engine="scipy", compute=False) - - def test_bytes_pickle(self) -> None: - data = Dataset({"foo": ("x", [1, 2, 3])}) - fobj = data.to_netcdf(engine="scipy") - with self.open(fobj) as ds: - unpickled = pickle.loads(pickle.dumps(ds)) - assert_identical(unpickled, data) - @requires_scipy -class TestScipyFileObject(NetCDF3Only, CFEncodedBase): +class TestScipyFileObject(CFEncodedBase, NetCDF3Only, FileObjectNetCDF): # TODO: Consider consolidating some of these cases (e.g., # test_file_remains_open) with TestH5NetCDFFileObject engine: T_NetcdfEngine = "scipy" @@ -4571,27 +4641,18 @@ def roundtrip( with self.open(f, **open_kwargs) as ds: yield ds - @pytest.mark.xfail( - reason="scipy.io.netcdf_file closes files upon garbage collection" - ) - def test_file_remains_open(self) -> None: - data = Dataset({"foo": ("x", [1, 2, 3])}) - f = BytesIO() - data.to_netcdf(f, engine="scipy") - assert not f.closed - restored = open_dataset(f, engine="scipy") - assert not f.closed - assert_identical(restored, data) - restored.close() - assert not f.closed + @pytest.mark.asyncio + @pytest.mark.skip(reason="NetCDF backends don't support async loading") + async def test_load_async(self) -> None: + await super().test_load_async() @pytest.mark.skip(reason="cannot pickle file objects") def test_pickle(self) -> None: - pass + super().test_pickle() @pytest.mark.skip(reason="cannot pickle file objects") def test_pickle_dataarray(self) -> None: - pass + super().test_pickle_dataarray() @pytest.mark.parametrize("create_default_indexes", [True, False]) def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: @@ -4704,19 +4765,23 @@ def test_engine(self) -> None: with pytest.raises(ValueError, match=r"unrecognized engine"): data.to_netcdf("foo.nc", engine="foobar") # type: ignore[call-overload] - with pytest.raises( - ValueError, - match=re.escape( - "can only 
read bytes or file-like objects with engine='scipy' or 'h5netcdf'" - ), - ): - data.to_netcdf(engine="netcdf4") - with create_tmp_file() as tmp_file: data.to_netcdf(tmp_file) with pytest.raises(ValueError, match=r"unrecognized engine"): open_dataset(tmp_file, engine="foobar") + with pytest.raises( + TypeError, + match=re.escape("file objects are not supported by the netCDF4 backend"), + ): + data.to_netcdf(BytesIO(), engine="netcdf4") + + with pytest.raises( + TypeError, + match=re.escape("file objects are not supported by the netCDF4 backend"), + ): + open_dataset(BytesIO(), engine="netcdf4") + bytes_io = BytesIO() data.to_netcdf(bytes_io, engine="scipy") with pytest.raises(ValueError, match=r"unrecognized engine"): @@ -5054,7 +5119,7 @@ def test_deepcopy(self) -> None: @requires_h5netcdf -class TestH5NetCDFFileObject(TestH5NetCDFData): +class TestH5NetCDFFileObject(TestH5NetCDFData, FileObjectNetCDF): engine: T_NetcdfEngine = "h5netcdf" def test_open_badbytes(self) -> None: @@ -5063,8 +5128,10 @@ def test_open_badbytes(self) -> None: ): with open_dataset(b"garbage"): pass - with pytest.raises(ValueError, match=r"can only read bytes"): - with open_dataset(b"garbage", engine="netcdf4"): + with pytest.raises( + ValueError, match=r"not the signature of a valid netCDF4 file" + ): + with open_dataset(b"garbage", engine="h5netcdf"): pass with pytest.raises( ValueError, match=r"not the signature of a valid netCDF4 file" @@ -5074,13 +5141,12 @@ def test_open_badbytes(self) -> None: def test_open_twice(self) -> None: expected = create_test_data() - expected.attrs["foo"] = "bar" with create_tmp_file() as tmp_file: - expected.to_netcdf(tmp_file, engine="h5netcdf") + expected.to_netcdf(tmp_file, engine=self.engine) with open(tmp_file, "rb") as f: - with open_dataset(f, engine="h5netcdf"): - with open_dataset(f, engine="h5netcdf"): - pass + with open_dataset(f, engine=self.engine): + with open_dataset(f, engine=self.engine): + pass # should not crash @requires_scipy def test_open_fileobj(self) -> None: @@ -5115,31 +5181,24 @@ def test_open_fileobj(self) -> None: with open_dataset(f): # ensure file gets closed pass - def test_file_remains_open(self) -> None: - data = Dataset({"foo": ("x", [1, 2, 3])}) - f = BytesIO() - data.to_netcdf(f, engine="h5netcdf") - assert not f.closed - restored = open_dataset(f, engine="h5netcdf") - assert not f.closed - assert_identical(restored, data) - restored.close() - assert not f.closed + @requires_fsspec + def test_fsspec(self) -> None: + expected = create_test_data() + with create_tmp_file() as tmp_file: + expected.to_netcdf(tmp_file, engine="h5netcdf") + with fsspec.open(tmp_file, "rb") as f: + with open_dataset(f, engine="h5netcdf") as actual: + assert_identical(actual, expected) -@requires_h5netcdf -class TestH5NetCDFInMemoryData: - def test_roundtrip_via_bytes(self) -> None: - original = create_test_data() - netcdf_bytes = original.to_netcdf(engine="h5netcdf") - roundtrip = open_dataset(netcdf_bytes, engine="h5netcdf") - assert_identical(roundtrip, original) + # fsspec.open() creates a pickleable file, unlike open() + with pickle.loads(pickle.dumps(actual)) as unpickled: + assert_identical(unpickled, expected) - def test_roundtrip_group_via_bytes(self) -> None: - original = create_test_data() - netcdf_bytes = original.to_netcdf(group="sub", engine="h5netcdf") - roundtrip = open_dataset(netcdf_bytes, group="sub", engine="h5netcdf") - assert_identical(roundtrip, original) + +@requires_h5netcdf +class TestH5NetCDFInMemoryData(InMemoryNetCDFWithGroups): + engine: 
T_NetcdfEngine = "h5netcdf" @requires_h5netcdf @@ -5182,6 +5241,25 @@ def test_write_inconsistent_chunks(self) -> None: assert actual["y"].encoding["chunksizes"] == (100, 50) +@requires_netCDF4 +@requires_h5netcdf +def test_memoryview_write_h5netcdf_read_netcdf4() -> None: + original = create_test_data() + result = original.to_netcdf(engine="h5netcdf") + roundtrip = load_dataset(result, engine="netcdf4") + assert_identical(roundtrip, original) + + +@requires_netCDF4 +@requires_h5netcdf +def test_memoryview_write_netcdf4_read_h5netcdf() -> None: + original = create_test_data() + result = original.to_netcdf(engine="netcdf4") + roundtrip = load_dataset(result, engine="h5netcdf") + assert_identical(roundtrip, original) + + +@network @requires_h5netcdf_ros3 class TestH5NetCDFDataRos3Driver(TestCommon): engine: T_NetcdfEngine = "h5netcdf" @@ -5315,11 +5393,9 @@ def test_open_mfdataset_list_attr() -> None: """ Case when an attribute of type list differs across the multiple files """ - from netCDF4 import Dataset - with create_tmp_files(2) as nfiles: for i in range(2): - with Dataset(nfiles[i], "w") as f: + with nc4.Dataset(nfiles[i], "w") as f: f.createDimension("x", 3) vlvar = f.createVariable("test_var", np.int32, ("x")) # here create an attribute as a list @@ -6732,6 +6808,7 @@ def _assert_no_dates_out_of_range_warning(record): @requires_scipy_or_netCDF4 +@pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) def test_use_cftime_standard_calendar_default_in_range(calendar) -> None: x = [0, 1] @@ -6854,6 +6931,7 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: @requires_scipy_or_netCDF4 +@pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @pytest.mark.parametrize("calendar", ["standard", "gregorian"]) def test_use_cftime_false_standard_calendar_out_of_range(calendar) -> None: x = [0, 1] @@ -6866,12 +6944,13 @@ def test_use_cftime_false_standard_calendar_out_of_range(calendar) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) + decoder = CFDatetimeCoder(use_cftime=False) with pytest.raises((OutOfBoundsDatetime, ValueError)): - decoder = CFDatetimeCoder(use_cftime=False) open_dataset(tmp_file, decode_times=decoder) @requires_scipy_or_netCDF4 +@pytest.mark.filterwarnings("ignore:deallocating CachingFileManager") @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2000, 2500]) def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None: @@ -6885,8 +6964,8 @@ def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) + decoder = CFDatetimeCoder(use_cftime=False) with pytest.raises((OutOfBoundsDatetime, ValueError)): - decoder = CFDatetimeCoder(use_cftime=False) open_dataset(tmp_file, decode_times=decoder) @@ -7105,6 +7184,9 @@ def test_netcdf4_entrypoint(tmp_path: Path) -> None: assert entrypoint.guess_can_open("something-local.cdf") assert not entrypoint.guess_can_open("not-found-and-no-extension") + contents = ds.to_netcdf(engine="netcdf4") + _check_guess_can_open_and_open(entrypoint, contents, engine="netcdf4", expected=ds) + path = tmp_path / "baz" with open(path, "wb") as f: f.write(b"not-a-netcdf-file") @@ -7155,6 +7237,9 @@ def test_h5netcdf_entrypoint(tmp_path: Path) -> None: with open(path, "rb") as f: _check_guess_can_open_and_open(entrypoint, f, engine="h5netcdf", expected=ds) + contents = 
ds.to_netcdf(engine="h5netcdf") + _check_guess_can_open_and_open(entrypoint, contents, engine="h5netcdf", expected=ds) + assert entrypoint.guess_can_open("something-local.nc") assert entrypoint.guess_can_open("something-local.nc4") assert entrypoint.guess_can_open("something-local.cdf") @@ -7227,6 +7312,7 @@ def _create_nczarr(self, filename): # https://github.com/Unidata/netcdf-c/issues/2259 ds = ds.drop_vars("dim3") + # engine="netcdf4" is not required for backwards compatibility ds.to_netcdf(f"file://{filename}#mode=nczarr") return ds @@ -7280,8 +7366,6 @@ def test_zarr_closing_internal_zip_store(): @requires_zarr @pytest.mark.parametrize("create_default_indexes", [True, False]) def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None: - from xarray.core.indexes import PandasIndex - store_path = tmp_path / "tmp.zarr" original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}) original_ds.to_zarr(store_path, mode="w") @@ -7471,6 +7555,10 @@ def test_zarr_append_chunk_partial(self): mode="a", ) + @pytest.mark.xfail( + ON_WINDOWS, + reason="Permission errors from Zarr: https://github.com/pydata/xarray/pull/10793", + ) @requires_dask def test_zarr_region_chunk_partial_offset(self): # https://github.com/pydata/xarray/pull/8459#issuecomment-1819417545
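
A note on PATCH 1 for context: with zarr-python < 3, the object codec of an
object-dtype array (for example numcodecs.VLenUTF8) is persisted at the head of
the array's "filters", and xr.open_zarr surfaces it in the variable's encoding.
Writing the opened dataset back therefore handed the codec to zarr twice, once
via encoding["filters"] and once via the object codec zarr infers from the
dtype; the guard around zarr.util.normalize_dtype in the zarr.py hunk prevents
this by dropping encoding["filters"] when it is exactly that single inferred
codec. The failure can be reproduced outside the test suite with the minimal
sketch below, which mirrors test_object_codec (the store path "example.zarr" is
illustrative, and zarr-python < 3 is assumed):

    import numpy as np
    import xarray as xr

    data = xr.DataArray(
        data=np.zeros((2, 2)),
        dims=["x", "y"],
        coords=dict(y=np.array(["a", "b"], dtype=object)),
    )
    data.to_zarr("example.zarr", mode="w")  # first write: zarr<3 records the object codec in "filters"
    data = xr.open_zarr("example.zarr")     # the round-trip carries that filter into encoding
    data.to_zarr("example.zarr", mode="w")  # failed with zarr<3 before this series; succeeds with it

PATCH 3 then routes the regression test through self.create_zarr_target() so it
runs against every zarr store flavor the suite covers, not just a temporary
file path.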