diff --git a/pyproject.toml b/pyproject.toml
index 15b9ff3..b2e3a06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
 dynamic = ["version"]
 dependencies = [
     "numpy>=1.24",
-    "zarr>=3.0.3,<3.1",
+    "zarr>=3.1",
 ]
 
 [dependency-groups]
diff --git a/python/zarrs/pipeline.py b/python/zarrs/pipeline.py
index 6ebc24d..2efce5e 100644
--- a/python/zarrs/pipeline.py
+++ b/python/zarrs/pipeline.py
@@ -21,6 +21,7 @@
     from zarr.core.chunk_grids import ChunkGrid
     from zarr.core.common import ChunkCoords
     from zarr.core.indexing import SelectorTuple
+    from zarr.dtype import ZDType
 
 from ._internal import CodecPipelineImpl, codec_metadata_v2_to_v3
 from .utils import (
@@ -134,7 +135,7 @@ def __iter__(self) -> Iterator[Codec]:
         yield from self.codecs
 
     def validate(
-        self, *, shape: ChunkCoords, dtype: np.dtype[Any], chunk_grid: ChunkGrid
+        self, *, shape: ChunkCoords, dtype: ZDType, chunk_grid: ChunkGrid
     ) -> None:
         raise NotImplementedError("validate")
 
@@ -236,7 +237,7 @@ def _raise_error_on_unsupported_batch_dtype(
     # https://github.com/LDeakin/zarrs/blob/0532fe983b7b42b59dbf84e50a2fe5e6f7bad4ce/zarrs_metadata/src/v2_to_v3.rs#L289-L293 for VSUMm
     # Further, our pipeline does not support variable-length objects due to limitations on decode_into, so object/np.dtypes.StringDType is also out
     if any(
-        info.dtype.kind in {"V", "S", "U", "M", "m", "O", "T"}
+        info.dtype.to_native_dtype().kind in {"V", "S", "U", "M", "m", "O", "T"}
         for (_, info, _, _, _) in batch_info
     ):
         raise UnsupportedDataTypeError()
diff --git a/python/zarrs/utils.py b/python/zarrs/utils.py
index 35743b8..81d5977 100644
--- a/python/zarrs/utils.py
+++ b/python/zarrs/utils.py
@@ -8,7 +8,6 @@
 import numpy as np
 from zarr.core.array_spec import ArraySpec
 from zarr.core.indexing import SelectorTuple, is_integer
-from zarr.core.metadata.v2 import _default_fill_value
 
 from zarrs._internal import Basic, WithSubset
 
@@ -17,6 +16,7 @@
     from types import EllipsisType
 
     from zarr.abc.store import ByteGetter, ByteSetter
+    from zarr.dtype import ZDType
 
 
 # adapted from https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor
@@ -139,9 +139,9 @@ def get_shape_for_selector(
     return resulting_shape_from_index(shape, selector_tuple, drop_axes, pad=pad)
 
 
-def get_implicit_fill_value(dtype: np.dtype, fill_value: Any) -> Any:
+def get_implicit_fill_value(dtype: ZDType, fill_value: Any) -> Any:
     if fill_value is None:
-        fill_value = _default_fill_value(dtype)
+        fill_value = dtype.default_scalar()
     return fill_value
 
 
diff --git a/src/chunk_item.rs b/src/chunk_item.rs
index feac5cb..da0d9f8 100644
--- a/src/chunk_item.rs
+++ b/src/chunk_item.rs
@@ -68,6 +68,7 @@ impl Basic {
         let chunk_shape = chunk_spec.getattr("shape")?.extract()?;
         let mut dtype: String = chunk_spec
            .getattr("dtype")?
+           .call_method0("to_native_dtype")?
            .call_method0("__str__")?
            .extract()?;
        if dtype == "object" {
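For context: zarr-python 3.1 passes ZDType wrapper objects where 3.0.x passed raw numpy dtypes, which is what every hunk above adapts to. A minimal sketch of the three calls this migration leans on (assuming zarr >= 3.1; get_data_type_from_native_dtype is the constructor this same diff imports in tests/test_vlen.py):

    import numpy as np
    from zarr.core.dtype import get_data_type_from_native_dtype

    zdtype = get_data_type_from_native_dtype(np.dtype("int32"))  # a ZDType, not a np.dtype
    assert zdtype.to_native_dtype().kind == "i"   # .kind checks now go through to_native_dtype()
    assert zdtype.default_scalar() == 0           # replaces the removed _default_fill_value(dtype)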
diff --git a/tests/test_blosc.py b/tests/test_blosc.py
deleted file mode 100644
index 86e82dd..0000000
--- a/tests/test_blosc.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import json
-
-import numpy as np
-import pytest
-from zarr import AsyncArray
-from zarr.abc.store import Store
-from zarr.codecs import BloscCodec, BytesCodec, ShardingCodec
-from zarr.core.buffer import default_buffer_prototype
-from zarr.storage import StorePath
-
-
-@pytest.mark.parametrize("dtype", ["uint8", "uint16"])
-async def test_blosc_evolve(store: Store, dtype: str) -> None:
-    typesize = np.dtype(dtype).itemsize
-    path = "blosc_evolve"
-    spath = StorePath(store, path)
-    await AsyncArray.create(
-        spath,
-        shape=(16, 16),
-        chunk_shape=(16, 16),
-        dtype=dtype,
-        fill_value=0,
-        codecs=[BytesCodec(), BloscCodec()],
-    )
-    buf = await store.get(f"{path}/zarr.json", prototype=default_buffer_prototype())
-    assert buf is not None
-    zarr_json = json.loads(buf.to_bytes())
-    blosc_configuration_json = zarr_json["codecs"][1]["configuration"]
-    assert blosc_configuration_json["typesize"] == typesize
-    if typesize == 1:
-        assert blosc_configuration_json["shuffle"] == "bitshuffle"
-    else:
-        assert blosc_configuration_json["shuffle"] == "shuffle"
-
-    path2 = "blosc_evolve_sharding"
-    spath2 = StorePath(store, path2)
-    await AsyncArray.create(
-        spath2,
-        shape=(16, 16),
-        chunk_shape=(16, 16),
-        dtype=dtype,
-        fill_value=0,
-        codecs=[
-            ShardingCodec(chunk_shape=(16, 16), codecs=[BytesCodec(), BloscCodec()])
-        ],
-    )
-    buf = await store.get(f"{path2}/zarr.json", prototype=default_buffer_prototype())
-    assert buf is not None
-    zarr_json = json.loads(buf.to_bytes())
-    blosc_configuration_json = zarr_json["codecs"][0]["configuration"]["codecs"][1][
-        "configuration"
-    ]
-    assert blosc_configuration_json["typesize"] == typesize
-    if typesize == 1:
-        assert blosc_configuration_json["shuffle"] == "bitshuffle"
-    else:
-        assert blosc_configuration_json["shuffle"] == "shuffle"
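The deleted file pinned behavior that now lives upstream in zarr-python: with shuffle left unset, BloscCodec resolves its mode from the dtype's item size. A plain-Python restatement of the rule the test asserted (expected_shuffle is a hypothetical helper for illustration, not zarr-python API):

    import numpy as np

    def expected_shuffle(dtype: str) -> str:
        # Rule the deleted test asserted: bit-shuffle for 1-byte types, byte-shuffle otherwise.
        return "bitshuffle" if np.dtype(dtype).itemsize == 1 else "shuffle"

    assert expected_shuffle("uint8") == "bitshuffle"
    assert expected_shuffle("uint16") == "shuffle"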
diff --git a/tests/test_codecs.py b/tests/test_codecs.py
index 42606f0..befce8e 100644
--- a/tests/test_codecs.py
+++ b/tests/test_codecs.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
@@ -13,7 +12,6 @@
     TransposeCodec,
 )
 from zarr.core.buffer import default_buffer_prototype
-from zarr.core.indexing import Selection, morton_order_iter
 from zarr.storage import StorePath
 
 if TYPE_CHECKING:
@@ -21,6 +19,7 @@
     from zarr.abc.store import Store
     from zarr.core.buffer.core import NDArrayLike
     from zarr.core.common import MemoryOrder
+    from zarr.core.indexing import Selection
 
 
 @dataclass(frozen=True)
@@ -165,51 +164,6 @@ def test_order_implicit(
         assert read_data.flags["C_CONTIGUOUS"]
 
 
-def test_open(store: Store) -> None:
-    spath = StorePath(store)
-    a = Array.create(
-        spath,
-        shape=(16, 16),
-        chunk_shape=(16, 16),
-        dtype="int32",
-        fill_value=0,
-    )
-    b = Array.open(spath)
-    assert a.metadata == b.metadata
-
-
-def test_morton() -> None:
-    assert list(morton_order_iter((2, 2))) == [(0, 0), (1, 0), (0, 1), (1, 1)]
-    assert list(morton_order_iter((2, 2, 2))) == [
-        (0, 0, 0),
-        (1, 0, 0),
-        (0, 1, 0),
-        (1, 1, 0),
-        (0, 0, 1),
-        (1, 0, 1),
-        (0, 1, 1),
-        (1, 1, 1),
-    ]
-    assert list(morton_order_iter((2, 2, 2, 2))) == [
-        (0, 0, 0, 0),
-        (1, 0, 0, 0),
-        (0, 1, 0, 0),
-        (1, 1, 0, 0),
-        (0, 0, 1, 0),
-        (1, 0, 1, 0),
-        (0, 1, 1, 0),
-        (1, 1, 1, 0),
-        (0, 0, 0, 1),
-        (1, 0, 0, 1),
-        (0, 1, 0, 1),
-        (1, 1, 0, 1),
-        (0, 0, 1, 1),
-        (1, 0, 1, 1),
-        (0, 1, 1, 1),
-        (1, 1, 1, 1),
-    ]
-
-
 def test_write_partial_chunks(store: Store) -> None:
     data = np.arange(0, 256, dtype="uint16").reshape((16, 16))
     spath = StorePath(store)
@@ -241,41 +195,6 @@ async def test_delete_empty_chunks(store: Store) -> None:
     assert await store.get(f"{path}/c0/0", prototype=default_buffer_prototype()) is None
 
 
-async def test_dimension_names(store: Store) -> None:
-    data = np.arange(0, 256, dtype="uint16").reshape((16, 16))
-    path = "dimension_names"
-    spath = StorePath(store, path)
-    await AsyncArray.create(
-        spath,
-        shape=data.shape,
-        chunk_shape=(16, 16),
-        dtype=data.dtype,
-        fill_value=0,
-        dimension_names=("x", "y"),
-    )
-
-    assert (await AsyncArray.open(spath)).metadata.dimension_names == (
-        "x",
-        "y",
-    )
-    path2 = "dimension_names2"
-    spath2 = StorePath(store, path2)
-    await AsyncArray.create(
-        spath2,
-        shape=data.shape,
-        chunk_shape=(16, 16),
-        dtype=data.dtype,
-        fill_value=0,
-    )
-
-    assert (await AsyncArray.open(spath2)).metadata.dimension_names is None
-    zarr_json_buffer = await store.get(
-        f"{path2}/zarr.json", prototype=default_buffer_prototype()
-    )
-    assert zarr_json_buffer is not None
-    assert "dimension_names" not in json.loads(zarr_json_buffer.to_bytes())
-
-
 def test_invalid_metadata(store: Store) -> None:
     # LD: Disabled for `zarrs`. Including endianness for a single-byte data type is not invalid.
     # spath2 = StorePath(store, "invalid_endian")
diff --git a/tests/test_sharding.py b/tests/test_sharding.py
index c8d8151..2ff6234 100644
--- a/tests/test_sharding.py
+++ b/tests/test_sharding.py
@@ -1,4 +1,3 @@
-import pickle
 from typing import Any
 
 import numpy as np
@@ -286,30 +285,6 @@ def test_nested_sharding(
     assert np.array_equal(data, read_data)
 
 
-def test_open_sharding(store: Store) -> None:
-    path = "open_sharding"
-    spath = StorePath(store, path)
-    a = Array.create(
-        spath,
-        shape=(16, 16),
-        chunk_shape=(16, 16),
-        dtype="int32",
-        fill_value=0,
-        codecs=[
-            ShardingCodec(
-                chunk_shape=(8, 8),
-                codecs=[
-                    TransposeCodec(order=order_from_dim("F", 2)),
-                    BytesCodec(),
-                    BloscCodec(),
-                ],
-            )
-        ],
-    )
-    b = Array.open(spath)
-    assert a.metadata == b.metadata
-
-
 def test_write_partial_sharded_chunks(store: Store) -> None:
     data = np.arange(0, 16 * 16, dtype="uint16").reshape((16, 16))
     spath = StorePath(store)
@@ -365,11 +340,6 @@ async def test_delete_empty_shards(store: Store) -> None:
     assert len(chunk_bytes) == 16 * 2 + 8 * 8 * 2 + 4
 
 
-def test_pickle() -> None:
-    codec = ShardingCodec(chunk_shape=(8, 8))
-    assert pickle.loads(pickle.dumps(codec)) == codec
-
-
 @pytest.mark.parametrize(
     "index_location", [ShardingCodecIndexLocation.start, ShardingCodecIndexLocation.end]
 )
diff --git a/tests/test_transpose.py b/tests/test_transpose.py
index e2477dc..b9a2859 100644
--- a/tests/test_transpose.py
+++ b/tests/test_transpose.py
@@ -84,21 +84,3 @@ def test_transpose_non_self_inverse(store: Store, order: list[int]) -> None:
     a[:, :] = data
     read_data = a[:, :]
     assert np.array_equal(data, read_data)
-
-
-def test_transpose_invalid(
-    store: Store,
-) -> None:
-    data = np.arange(0, 256, dtype="uint16").reshape((1, 32, 8))
-    spath = StorePath(store, "transpose_invalid")
-    for order in [(1, 0), (3, 2, 1), (3, 3, 1)]:
-        with pytest.raises(ValueError, match=r".*order"):
-            Array.create(
-                spath,
-                shape=data.shape,
-                chunk_shape=(1, 32, 8),
-                dtype=data.dtype,
-                fill_value=0,
-                chunk_key_encoding=("v2", "."),
-                codecs=[TransposeCodec(order=order), BytesCodec()],
-            )
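The test_transpose_invalid deletion above removed a check that zarr-python rejects transpose orders that are not a permutation of the array's dimensions. A plain-Python restatement of the property it exercised (is_valid_order is a hypothetical helper, not the library's validator):

    def is_valid_order(order: tuple[int, ...], ndim: int) -> bool:
        # A transpose order must be a permutation of range(ndim).
        return sorted(order) == list(range(ndim))

    assert not is_valid_order((1, 0), 3)     # wrong length for a 3-D array
    assert not is_valid_order((3, 3, 1), 3)  # repeated / out-of-range axis
    assert is_valid_order((2, 0, 1), 3)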
diff --git a/tests/test_v2.py b/tests/test_v2.py
index fd8592e..b5eb279 100644
--- a/tests/test_v2.py
+++ b/tests/test_v2.py
@@ -11,10 +11,12 @@
 import zarr.storage
 from numcodecs import Delta
 from numcodecs.blosc import Blosc
-from numcodecs.zstd import Zstd
 from zarr import config
 from zarr.abc.store import Store
 from zarr.core.buffer.core import default_buffer_prototype
+from zarr.core.dtype import FixedLengthUTF32, Structured, VariableLengthUTF8
+from zarr.core.dtype.npy.bytes import NullTerminatedBytes
+from zarr.core.dtype.wrapper import ZDType
 from zarr.core.sync import sync
 from zarr.storage import LocalStore, StorePath
 
@@ -40,33 +42,7 @@ def test_simple(store: StorePath) -> None:
     assert np.array_equal(data, a[:, :])
 
 
-@pytest.mark.parametrize(
-    ("dtype", "fill_value"),
-    [
-        ("bool", False),
-        ("int64", 0),
-        ("float64", 0.0),
-        ("|S1", b""),
-        ("|U1", ""),
-        ("object", ""),
-        (str, ""),
-    ],
-)
-def test_implicit_fill_value(store: LocalStore, dtype: str, fill_value: Any) -> None:
-    arr = zarr.create(
-        store=store, shape=(4,), fill_value=None, zarr_format=2, dtype=dtype
-    )
-    assert arr.metadata.fill_value is None
-    assert arr.metadata.to_dict()["fill_value"] is None
-    result = arr[:]
-    numpy_dtype = np.dtype(object) if dtype is str else np.dtype(dtype)
-    expected = np.full(arr.shape, fill_value, dtype=numpy_dtype)
-    np.testing.assert_array_equal(result, expected)
-
-
-def test_codec_pipeline(tmp_path) -> None:
-    # https://github.com/zarr-developers/zarr-python/issues/2243
-    store = LocalStore(tmp_path)
+def test_fill_single_value(store: Store) -> None:
     array = zarr.create(
         store=store,
         shape=(1,),
@@ -82,93 +58,69 @@
 @pytest.mark.parametrize(
-    ("dtype", "expected_dtype", "fill_value", "fill_value_encoding"),
+    ("dtype", "expected_dtype", "fill_value", "fill_value_json"),
     [
-        ("|S", "|S0", b"X", "WA=="),
-        ("|V", "|V0", b"X", "WA=="),
+        ("|S1", "|S1", b"X", "WA=="),
+        ("|V1", "|V1", b"X", "WA=="),
         ("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="),
     ],
 )
 async def test_v2_encode_decode(
-    dtype, expected_dtype, fill_value, fill_value_encoding, tmp_path
+    dtype, expected_dtype, fill_value, fill_value_json, tmp_path
 ) -> None:
-    with config.set(
-        {
-            "array.v2_default_filters.bytes": [{"id": "vlen-bytes"}],
-            "array.v2_default_compressor.bytes": None,
-        }
-    ):
-        store = zarr.storage.LocalStore(tmp_path)
-        g = zarr.group(store=store, zarr_format=2)
-        g.create_array(
-            name="foo",
-            shape=(3,),
-            chunks=(3,),
-            dtype=dtype,
-            fill_value=fill_value,
-            compressor=None,
-        )
+    store = zarr.storage.LocalStore(tmp_path)
+    g = zarr.group(store=store, zarr_format=2)
+    g.create_array(
+        name="foo",
+        shape=(3,),
+        chunks=(3,),
+        dtype=dtype,
+        fill_value=fill_value,
+        compressor=None,
+    )
 
-        result = await store.get(
-            "foo/.zarray", zarr.core.buffer.default_buffer_prototype()
-        )
-        assert result is not None
-
-        serialized = json.loads(result.to_bytes())
-        expected = {
-            "chunks": [3],
-            "compressor": None,
-            "dtype": expected_dtype,
-            "fill_value": fill_value_encoding,
-            "filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None,
-            "order": "C",
-            "shape": [3],
-            "zarr_format": 2,
-            "dimension_separator": ".",
-        }
-        assert serialized == expected
-
-        data = zarr.open_array(store=store, path="foo")[:]
-        expected = np.full((3,), b"X", dtype=dtype)
-        np.testing.assert_equal(data, expected)
-
-
-@pytest.mark.parametrize("dtype_value", [["|S", b"Y"], ["|U", "Y"], ["O", b"Y"]])
-def test_v2_encode_decode_with_data(dtype_value, tmp_path):
-    dtype, value = dtype_value
-    with config.set(
-        {
-            "array.v2_default_filters": {
-                "string": [{"id": "vlen-utf8"}],
-                "bytes": [{"id": "vlen-bytes"}],
-            },
-        }
-    ):
-        expected = np.full((3,), value, dtype=dtype)
-        a = zarr.create(
-            store=tmp_path,
-            shape=(3,),
-            zarr_format=2,
-            dtype=dtype,
-        )
-        a[:] = expected
-        data = a[:]
-        np.testing.assert_equal(data, expected)
-
-
-@pytest.mark.parametrize("dtype", [str, "str"])
-async def test_create_dtype_str(dtype: Any, tmp_path) -> None:
-    # see https://github.com/zarr-developers/zarr-python/issues/2627 for why this test
-    # is probably wrong
-    arr = zarr.create(store=tmp_path, shape=3, dtype=dtype, zarr_format=2)
-    assert arr.dtype.kind == "O"
-    assert arr.metadata.to_dict()["dtype"] == "|O"
-    assert arr.metadata.filters == (numcodecs.vlen.VLenBytes(),)
-    arr[:] = [b"a", b"bb", b"ccc"]
-    result = arr[:]
-    np.testing.assert_array_equal(
-        result, np.array([b"a", b"bb", b"ccc"], dtype="object")
+    result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
+    assert result is not None
+
+    serialized = json.loads(result.to_bytes())
+    expected = {
+        "chunks": [3],
+        "compressor": None,
+        "dtype": expected_dtype,
+        "fill_value": fill_value_json,
+        "filters": None,
+        "order": "C",
+        "shape": [3],
+        "zarr_format": 2,
+        "dimension_separator": ".",
+    }
+    assert serialized == expected
+
+    data = zarr.open_array(store=store, path="foo")[:]
+    np.testing.assert_equal(data, np.full((3,), b"X", dtype=dtype))
+
+
+@pytest.mark.parametrize(
+    ("dtype", "value"),
+    [
+        (NullTerminatedBytes(length=1), b"Y"),
+        (FixedLengthUTF32(length=1), "Y"),
+        (VariableLengthUTF8(), "Y"),
+    ],
+)
+def test_v2_encode_decode_with_data(
+    dtype: ZDType[Any, Any], value: str, tmp_path: Path
+):
+    expected = np.full((3,), value, dtype=dtype.to_native_dtype())
+    a = zarr.create(
+        store=tmp_path,
+        shape=(3,),
+        zarr_format=2,
+        dtype=dtype,
     )
+    a[:] = expected
+    data = a[:]
+    np.testing.assert_equal(data, expected)
@@ -188,48 +140,17 @@ def test_v2_filters_codecs(
     np.testing.assert_array_equal(result, array_fixture)
 
 
-@pytest.mark.filterwarnings("ignore")
-def test_create_array_defaults(store: Store):
-    """
-    Test that passing compressor=None results in no compressor. Also test that the default value of the compressor
-    parameter does produce a compressor.
-    """
-    g = zarr.open(store, mode="w", zarr_format=2)
-    arr = g.create_array("one", dtype="i8", shape=(1,), chunks=(1,), compressor=None)
-    assert arr._async_array.compressor is None
-    assert not (arr.filters)
-    arr = g.create_array("two", dtype="i8", shape=(1,), chunks=(1,))
-    assert arr._async_array.compressor is not None
-    assert not (arr.filters)
-    arr = g.create_array(
-        "three", dtype="i8", shape=(1,), chunks=(1,), compressor=Zstd()
-    )
-    assert arr._async_array.compressor is not None
-    assert not (arr.filters)
-    with pytest.raises(ValueError):  # noqa: PT011
-        g.create_array(
-            "four",
-            dtype="i8",
-            shape=(1,),
-            chunks=(1,),
-            compressor=None,
-            compressors=None,
-        )
-
-
+@pytest.mark.parametrize("numpy_order", ["C", "F"])
 @pytest.mark.parametrize(
-    "array_order", ["C", pytest.param("F", marks=[pytest.mark.xfail])]
-)
-@pytest.mark.parametrize("data_order", ["C", "F"])
-@pytest.mark.parametrize(
-    "memory_order", ["C", pytest.param("F", marks=[pytest.mark.xfail])]
+    "zarr_order", ["C", pytest.param("F", marks=pytest.mark.xfail())]
 )
 def test_v2_non_contiguous(
-    array_order: Literal["C", "F"],
-    data_order: Literal["C", "F"],
-    memory_order: Literal["C", "F"],
-    tmp_path: Path,
+    numpy_order: Literal["C", "F"], zarr_order: Literal["C", "F"], tmp_path: Path
 ) -> None:
+    """
+    Make sure zarr v2 arrays save data using the memory order given to the zarr array,
+    not the memory order of the original numpy array.
+    """
     store = LocalStore(tmp_path / "a_store")
     arr = zarr.create_array(
         store,
@@ -241,26 +162,29 @@
         filters=None,
         compressors=None,
         overwrite=True,
-        order=array_order,
-        config={"order": memory_order},
+        order=zarr_order,
     )
 
-    # Non-contiguous write
-    a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=data_order)
+    # Non-contiguous write, using numpy memory order
+    a = np.arange(arr.shape[0] * arr.shape[1]).reshape(arr.shape, order=numpy_order)
     arr[6:9, 3:6] = a[6:9, 3:6]  # The slice on the RHS is important
     np.testing.assert_array_equal(arr[6:9, 3:6], a[6:9, 3:6])
+    buf = sync(store.get("2.1", default_buffer_prototype()))
+    assert buf is not None
     np.testing.assert_array_equal(
         a[6:9, 3:6],
-        np.frombuffer(
-            sync(store.get("2.1", default_buffer_prototype())).to_bytes(),
-            dtype="float64",
-        ).reshape((3, 3), order=array_order),
+        np.frombuffer(buf.to_bytes(), dtype="float64").reshape(
+            (3, 3), order=zarr_order
+        ),
     )
-    if memory_order == "F":
-        assert (arr[6:9, 3:6]).flags.f_contiguous
+    # After writing and reading from zarr array, order should be same as zarr order
+    sub_arr = arr[6:9, 3:6]
+    assert isinstance(sub_arr, np.ndarray)
+    if zarr_order == "F":
+        assert (sub_arr).flags.f_contiguous
     else:
-        assert (arr[6:9, 3:6]).flags.c_contiguous
+        assert (sub_arr).flags.c_contiguous
 
     store = LocalStore(tmp_path / "other_store")
     arr = zarr.create_array(
@@ -273,18 +197,19 @@
         compressors=None,
         filters=None,
         overwrite=True,
-        order=array_order,
-        config={"order": memory_order},
+        order=zarr_order,
     )
 
-    # Contiguous write
-    a = np.arange(9).reshape((3, 3), order=data_order)
-    if data_order == "F":
-        assert a.flags.f_contiguous
-    else:
-        assert a.flags.c_contiguous
+    a = np.arange(9).reshape((3, 3), order=numpy_order)
     arr[6:9, 3:6] = a
     np.testing.assert_array_equal(arr[6:9, 3:6], a)
+    # After writing and reading from zarr array, order should be same as zarr order
+    sub_arr = arr[6:9, 3:6]
+    assert isinstance(sub_arr, np.ndarray)
+    if zarr_order == "F":
+        assert (sub_arr).flags.f_contiguous
+    else:
+        assert (sub_arr).flags.c_contiguous
 
 
 def test_default_compressor_deprecation_warning():
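Background for the two reworked blocks above, plus one reading of them (an interpretation of the updated tests, not an upstream API guarantee): numpy's reshape(order=...) makes C-versus-F layout observable through array flags, and dropping the config={"order": ...} knob means the single order= argument is assumed to govern both the stored chunk layout and the contiguity of what reads return. A plain-numpy illustration and a hypothetical call under that assumption:

    import numpy as np
    import zarr
    from zarr.storage import MemoryStore

    a = np.arange(9).reshape((3, 3), order="F")  # column-major layout
    assert a.flags.f_contiguous and not a.flags.c_contiguous
    # Bytes written in F order reshape back identically with order="F"
    assert (np.frombuffer(a.tobytes(order="A"), dtype=a.dtype).reshape((3, 3), order="F") == a).all()

    # Assumed: one `order` flag now covers both storage layout and read-back order.
    arr = zarr.create_array(
        MemoryStore(), shape=(16, 16), chunks=(4, 4), dtype="float64", zarr_format=2, order="F"
    )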
@@ -292,38 +217,6 @@ def test_default_compressor_deprecation_warning():
         zarr.storage.default_compressor = "zarr.codecs.zstd.ZstdCodec()"
 
 
-@pytest.mark.parametrize(
-    "dtype_expected",
-    [
-        ["b", "zstd", None],
-        ["i", "zstd", None],
-        ["f", "zstd", None],
-        ["|S1", "zstd", "vlen-bytes"],
-        ["|U1", "zstd", "vlen-utf8"],
-    ],
-)
-def test_default_filters_and_compressor(dtype_expected: Any) -> None:
-    with config.set(
-        {
-            "array.v2_default_compressor": {
-                "numeric": {"id": "zstd", "level": "0"},
-                "string": {"id": "zstd", "level": "0"},
-                "bytes": {"id": "zstd", "level": "0"},
-            },
-            "array.v2_default_filters": {
-                "numeric": [],
-                "string": [{"id": "vlen-utf8"}],
-                "bytes": [{"id": "vlen-bytes"}],
-            },
-        }
-    ):
-        dtype, expected_compressor, expected_filter = dtype_expected
-        arr = zarr.create(shape=(3,), path="foo", store={}, zarr_format=2, dtype=dtype)
-        assert arr.metadata.compressor.codec_id == expected_compressor
-        if expected_filter is not None:
-            assert arr.metadata.filters[0].codec_id == expected_filter
-
-
 @pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"])
 def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
     a = np.array(
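Two fill-value details in this file are worth pinning down. The "WA=="/"WAAAAAAAAAAAAA==" strings in the encode/decode parametrization earlier are just base64 of the raw fill bytes padded to the dtype's item size, and the structured-dtype tests added below drive the zarr 3.1 wrapper's cast_scalar directly. A standalone sketch (the base64 part is standard library; the Structured calls assume zarr >= 3.1, imported as in this file's header):

    import base64
    import numpy as np
    from zarr.core.dtype import Structured

    assert base64.b64encode(b"X").decode() == "WA=="                                 # |S1 / |V1
    assert base64.b64encode(b"X".ljust(10, b"\x00")).decode() == "WAAAAAAAAAAAAA=="  # |V10

    zdt = Structured.from_native_dtype(np.dtype([("x", "i4"), ("y", "i4")]))
    scalar = zdt.cast_scalar(b"\x01\x00\x00\x00\x02\x00\x00\x00")  # raw little-endian bytes
    assert (int(scalar["x"]), int(scalar["y"])) == (1, 2)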
= a + za = zarr.open_array(store=array_path) + assert (a == za[:]).all() diff --git a/tests/test_vlen.py b/tests/test_vlen.py index afb3610..b9e29eb 100644 --- a/tests/test_vlen.py +++ b/tests/test_vlen.py @@ -3,32 +3,44 @@ import numpy as np import pytest import zarr +from zarr import Array from zarr.abc.codec import Codec from zarr.abc.store import Store from zarr.codecs import ZstdCodec -from zarr.core.metadata.v3 import ArrayV3Metadata, DataType -from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING +from zarr.core.metadata.v3 import ArrayV3Metadata from zarr.storage import StorePath -numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType] -expected_zarr_string_dtype: np.dtype[Any] +numpy_str_dtypes: list[type | str | None] = [ + None, + str, + "str", + np.dtypes.StrDType, + "S", + "U", +] +expected_array_string_dtype: np.dtype[Any] if _NUMPY_SUPPORTS_VLEN_STRING: numpy_str_dtypes.append(np.dtypes.StringDType) - expected_zarr_string_dtype = np.dtypes.StringDType() + expected_array_string_dtype = np.dtypes.StringDType() else: - expected_zarr_string_dtype = np.dtype("O") + expected_array_string_dtype = np.dtype("O") -@pytest.mark.parametrize("store", ["local"], indirect=["store"]) +@pytest.mark.filterwarnings( + "ignore::zarr.core.dtype.common.UnstableSpecificationWarning" +) +@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) @pytest.mark.parametrize("dtype", numpy_str_dtypes) @pytest.mark.parametrize("as_object_array", [False, True]) @pytest.mark.parametrize("compressor", [None, ZstdCodec()]) def test_vlen_string( store: Store, dtype: np.dtype[Any] | None, + compressor: Codec | None, *, as_object_array: bool, - compressor: Codec | None, ) -> None: strings = ["hello", "world", "this", "is", "a", "test"] data = np.array(strings, dtype=dtype).reshape((2, 3)) @@ -47,54 +59,18 @@ def test_vlen_string( # should also work if input array is an object array, provided we explicitly specified # a stringlike dtype when creating the Array if as_object_array: - data = data.astype("O") - - a[:, :] = data - assert np.array_equal(data, a[:, :]) - assert a.metadata.data_type == DataType.string - assert a.dtype == expected_zarr_string_dtype - - # test round trip - b = zarr.open(sp) - assert isinstance(b.metadata, ArrayV3Metadata) # needed for mypy - assert np.array_equal(data, b[:, :]) - assert b.metadata.data_type == DataType.string - assert a.dtype == expected_zarr_string_dtype - - -@pytest.mark.parametrize("store", ["local"], indirect=["store"]) -@pytest.mark.parametrize("as_object_array", [False, True]) -@pytest.mark.parametrize("compressor", [None, ZstdCodec()]) -def test_vlen_bytes( - store: Store, *, as_object_array: bool, compressor: Codec | None -) -> None: - bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"] - data = np.array(bstrings).reshape((2, 3)) - assert data.dtype == "|S5" + data_obj = data.astype("O") - sp = StorePath(store, path="string") - a = zarr.create_array( - sp, - shape=data.shape, - chunks=data.shape, - dtype=data.dtype, - fill_value=b"", - compressors=compressor, - ) - assert isinstance(a.metadata, ArrayV3Metadata) # needed for mypy - - # should also work if input array is an object array, provided we explicitly specified - # a bytesting-like dtype when creating the Array - if as_object_array: - data = data.astype("O") - a[:, :] = data + a[:, :] = data_obj + else: + a[:, :] = data 
diff --git a/tests/test_vlen.py b/tests/test_vlen.py
index afb3610..b9e29eb 100644
--- a/tests/test_vlen.py
+++ b/tests/test_vlen.py
@@ -3,32 +3,44 @@
 import numpy as np
 import pytest
 
 import zarr
+from zarr import Array
 from zarr.abc.codec import Codec
 from zarr.abc.store import Store
 from zarr.codecs import ZstdCodec
-from zarr.core.metadata.v3 import ArrayV3Metadata, DataType
-from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING
+from zarr.core.dtype import get_data_type_from_native_dtype
+from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING
+from zarr.core.metadata.v3 import ArrayV3Metadata
 from zarr.storage import StorePath
 
-numpy_str_dtypes: list[type | str | None] = [None, str, "str", np.dtypes.StrDType]
-expected_zarr_string_dtype: np.dtype[Any]
+numpy_str_dtypes: list[type | str | None] = [
+    None,
+    str,
+    "str",
+    np.dtypes.StrDType,
+    "S",
+    "U",
+]
+expected_array_string_dtype: np.dtype[Any]
 if _NUMPY_SUPPORTS_VLEN_STRING:
     numpy_str_dtypes.append(np.dtypes.StringDType)
-    expected_zarr_string_dtype = np.dtypes.StringDType()
+    expected_array_string_dtype = np.dtypes.StringDType()
 else:
-    expected_zarr_string_dtype = np.dtype("O")
+    expected_array_string_dtype = np.dtype("O")
 
 
-@pytest.mark.parametrize("store", ["local"], indirect=["store"])
+@pytest.mark.filterwarnings(
+    "ignore::zarr.core.dtype.common.UnstableSpecificationWarning"
+)
+@pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
 @pytest.mark.parametrize("dtype", numpy_str_dtypes)
 @pytest.mark.parametrize("as_object_array", [False, True])
 @pytest.mark.parametrize("compressor", [None, ZstdCodec()])
 def test_vlen_string(
     store: Store,
     dtype: np.dtype[Any] | None,
+    compressor: Codec | None,
     *,
     as_object_array: bool,
-    compressor: Codec | None,
 ) -> None:
     strings = ["hello", "world", "this", "is", "a", "test"]
     data = np.array(strings, dtype=dtype).reshape((2, 3))
@@ -47,54 +59,18 @@ def test_vlen_string(
     # should also work if input array is an object array, provided we explicitly specified
     # a stringlike dtype when creating the Array
     if as_object_array:
-        data = data.astype("O")
-
-    a[:, :] = data
-    assert np.array_equal(data, a[:, :])
-    assert a.metadata.data_type == DataType.string
-    assert a.dtype == expected_zarr_string_dtype
-
-    # test round trip
-    b = zarr.open(sp)
-    assert isinstance(b.metadata, ArrayV3Metadata)  # needed for mypy
-    assert np.array_equal(data, b[:, :])
-    assert b.metadata.data_type == DataType.string
-    assert a.dtype == expected_zarr_string_dtype
-
-
-@pytest.mark.parametrize("store", ["local"], indirect=["store"])
-@pytest.mark.parametrize("as_object_array", [False, True])
-@pytest.mark.parametrize("compressor", [None, ZstdCodec()])
-def test_vlen_bytes(
-    store: Store, *, as_object_array: bool, compressor: Codec | None
-) -> None:
-    bstrings = [b"hello", b"world", b"this", b"is", b"a", b"test"]
-    data = np.array(bstrings).reshape((2, 3))
-    assert data.dtype == "|S5"
+        data_obj = data.astype("O")
 
-    sp = StorePath(store, path="string")
-    a = zarr.create_array(
-        sp,
-        shape=data.shape,
-        chunks=data.shape,
-        dtype=data.dtype,
-        fill_value=b"",
-        compressors=compressor,
-    )
-    assert isinstance(a.metadata, ArrayV3Metadata)  # needed for mypy
-
-    # should also work if input array is an object array, provided we explicitly specified
-    # a bytesting-like dtype when creating the Array
-    if as_object_array:
-        data = data.astype("O")
-    a[:, :] = data
+        a[:, :] = data_obj
+    else:
+        a[:, :] = data
 
     assert np.array_equal(data, a[:, :])
-    assert a.metadata.data_type == DataType.bytes
-    assert a.dtype == "O"
+    assert a.metadata.data_type == get_data_type_from_native_dtype(data.dtype)
+    assert a.dtype == data.dtype
 
     # test round trip
-    b = zarr.open(sp)
+    b = Array.open(sp)
     assert isinstance(b.metadata, ArrayV3Metadata)  # needed for mypy
     assert np.array_equal(data, b[:, :])
-    assert b.metadata.data_type == DataType.bytes
-    assert a.dtype == "O"
+    assert b.metadata.data_type == get_data_type_from_native_dtype(data.dtype)
+    assert a.dtype == data.dtype
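End to end, nothing about wiring the pipeline in changes with the zarr >= 3.1 pin; a usage sketch following the project README's documented config hook (store={} mirrors the in-memory store usage in the tests above):

    import zarr
    import zarrs  # noqa: F401 -- registers the Rust codec pipeline

    zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"})
    arr = zarr.create_array(store={}, shape=(8, 8), chunks=(4, 4), dtype="int32")
    arr[:] = 42
    assert int(arr[0, 0]) == 42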