diff --git a/.changeset/vast-things-design.md b/.changeset/vast-things-design.md
new file mode 100644
index 00000000..6d024472
--- /dev/null
+++ b/.changeset/vast-things-design.md
@@ -0,0 +1,5 @@
+---
+"zarrita": minor
+---
+
+Added optional chunk cache for use with zarr.get
diff --git a/packages/zarrita/__tests__/indexing/get.test.ts b/packages/zarrita/__tests__/indexing/get.test.ts
index 21b43c02..b517a194 100644
--- a/packages/zarrita/__tests__/indexing/get.test.ts
+++ b/packages/zarrita/__tests__/indexing/get.test.ts
@@ -5,7 +5,9 @@ import { describe, expect, it } from "vitest";
 
 import * as zarr from "../../src/index.js";
 import { get } from "../../src/indexing/ops.js";
+import type { ChunkCache } from "../../src/indexing/types.js";
 import { range } from "../../src/indexing/util.js";
+import type { Chunk, DataType } from "../../src/metadata.js";
 
 let __dirname = path.dirname(url.fileURLToPath(import.meta.url));
 
@@ -661,3 +663,202 @@ describe("get v3", () => {
 		expect(res.stride).toStrictEqual([1, 3, 9]);
 	});
 });
+
+describe("chunk caching", () => {
+	async function get_array_with_cache(): Promise<zarr.Array<"int16">> {
+		let root = path.resolve(__dirname, "../../../../fixtures/v3/data.zarr");
+		let store = zarr.root(new FileSystemStore(root));
+		return zarr.open.v3(store.resolve("/2d.chunked.i2"), {
+			kind: "array",
+		}) as Promise<zarr.Array<"int16">>;
+	}
+
+	it("should work without cache (default behavior)", async () => {
+		let arr = await get_array_with_cache();
+		let result = await get(arr, null);
+
+		expect(result.data).toStrictEqual(new Int16Array([1, 2, 3, 4]));
+		expect(result.shape).toStrictEqual([2, 2]);
+	});
+
+	it("should work with Map as cache", async () => {
+		let arr = await get_array_with_cache();
+		let cache = new Map();
+
+		// First call should populate cache
+		let result1 = await get(arr, null, { cache });
+		expect(result1.data).toStrictEqual(new Int16Array([1, 2, 3, 4]));
+		const cacheSize = cache.size;
+		expect(cacheSize).toBeGreaterThan(0);
+
+		// Second call should use cache
+		let result2 = await get(arr, null, { cache });
+		expect(result2).toStrictEqual(result1);
+		expect(cache.size).toBe(cacheSize);
+	});
+
+	it("should cache chunks with proper keys", async () => {
+		let arr = await get_array_with_cache();
+		let cache = new Map();
+
+		await get(arr, null, { cache });
+
+		let keys = Array.from(cache.keys());
+		expect(keys.length).toBeGreaterThan(0);
+
+		// Keys should contain store ID, array path and chunk coordinates
+		for (let key of keys) {
+			expect(key).toMatch(
+				/^store_\d+:\/2d\.chunked\.i2:c[.\\/]\d+([.\\/]\d+)*$/,
+			);
+		}
+	});
+
+	it("should reuse cached chunks across multiple calls", async () => {
+		let arr = await get_array_with_cache();
+		let cache = new Map();
+
+		let originalGetChunk = arr.getChunk;
+		let getChunkCallCount = 0;
+		arr.getChunk = async (...args) => {
+			getChunkCallCount++;
+			return originalGetChunk.call(arr, ...args);
+		};
+
+		// First call
+		await get(arr, null, { cache });
+		let firstCallCount = getChunkCallCount;
+		expect(firstCallCount).toBeGreaterThan(0);
+
+		// Second call should not fetch chunks again
+		await get(arr, null, { cache });
+		expect(getChunkCallCount).toBe(firstCallCount);
+
+		// Restore original method
+		arr.getChunk = originalGetChunk;
+	});
+
+	it("should work with custom cache implementation", async () => {
+		let arr = await get_array_with_cache();
+
+		// Custom cache that tracks operations
+		let operations: string[] = [];
+		let cache: ChunkCache = {
+			get(key: string) {
+				operations.push(`get:${key}`);
+				return undefined; // Always miss for this test
+			},
+			set(key: string, _value: Chunk<DataType>) {
+				operations.push(`set:${key}`);
+			},
+		};
+
+		let result = await get(arr, null, { cache });
+
+		expect(result.data).toStrictEqual(new Int16Array([1, 2, 3, 4]));
+		expect(operations.length).toBeGreaterThan(0);
+		expect(operations.some((op) => op.startsWith("get:"))).toBe(true);
+		expect(operations.some((op) => op.startsWith("set:"))).toBe(true);
+	});
+
+	it("should handle cache hits correctly", async () => {
+		let arr = await get_array_with_cache();
+
+		// First, find out what cache keys are needed by doing a real call
+		let cache = new Map();
+		await get(arr, null, { cache });
+		let realKeys = Array.from(cache.keys());
+		let realValues = Array.from(cache.values());
+
+		// Clear cache and pre-populate ALL chunks with fake data using the correct keys
+		cache.clear();
+		for (let i = 0; i < realKeys.length; i++) {
+			let fakeChunkData = {
+				data: new Int16Array([99, 98, 97, 96]), // Use same fake data for all chunks
+				shape: realValues[i].shape, // Use original shape
+				stride: realValues[i].stride, // Use original stride
+			};
+			cache.set(realKeys[i], fakeChunkData);
+		}
+
+		// Mock getChunk to ensure it's not called
+		let originalGetChunk = arr.getChunk;
+		arr.getChunk = async () => {
+			throw new Error("getChunk should not be called when cache hits");
+		};
+
+		try {
+			let result = await get(arr, null, { cache });
+			// Should get some fake data (exact result depends on how chunks are assembled)
+			expect(result.data).toBeInstanceOf(Int16Array);
+			expect(result.shape).toStrictEqual([2, 2]);
+		} finally {
+			// Restore original method
+			arr.getChunk = originalGetChunk;
+		}
+	});
+
+	it("should handle different arrays with separate cache entries", async () => {
+		let root = path.resolve(__dirname, "../../../../fixtures/v3/data.zarr");
+		let store = zarr.root(new FileSystemStore(root));
+
+		let arr1 = await zarr.open.v3(store.resolve("/1d.contiguous.raw.i2"), {
+			kind: "array",
+		});
+		let arr2 = await zarr.open.v3(store.resolve("/2d.contiguous.i2"), {
+			kind: "array",
+		});
+
+		let cache = new Map();
+
+		await get(arr1, null, { cache });
+		await get(arr2, null, { cache });
+
+		// Should have separate cache entries for different arrays
+		let keys = Array.from(cache.keys());
+		expect(keys.some((k) => k.includes("/1d.contiguous.raw.i2:"))).toBe(true);
+		expect(keys.some((k) => k.includes("/2d.contiguous.i2:"))).toBe(true);
+	});
+
+	it("should handle multiple stores", async () => {
+		let root1 = path.resolve(__dirname, "../../../../fixtures/v3/data.zarr");
+		let store1 = zarr.root(new FileSystemStore(root1));
+		let arr1 = await zarr.open.v3(store1.resolve("/2d.chunked.i2"), {
+			kind: "array",
+		});
+
+		let root2 = path.resolve(__dirname, "../../../../fixtures/v2/data.zarr");
+		let store2 = zarr.root(new FileSystemStore(root2));
+		let arr2 = await zarr.open.v2(store2.resolve("/1d.contiguous.i4"), {
+			kind: "array",
+		});
+
+		let cache = new Map();
+
+		await get(arr1, null, { cache });
+		await get(arr2, null, { cache });
+
+		expect(cache.size).toBeGreaterThan(0);
+		const storePrefixes = new Set(
+			Array.from(cache.keys()).map((key) => {
+				return key.split(":")[0]; // Extract store ID from cache key
+			}),
+		);
+		expect(storePrefixes.size).toBe(2); // Should have entries for both stores
+	});
+
+	it("should work with sliced access using cache", async () => {
+		let arr = await get_array_with_cache();
+		let cache = new Map();
+
+		// Access a slice
+		let result = await get(arr, [zarr.slice(0, 1), null], { cache });
+
+		expect(result.shape).toStrictEqual([1, 2]);
+		expect(cache.size).toBeGreaterThan(0);
+
+		// Access another slice that might reuse same chunks
+		let result2 = await get(arr, [zarr.slice(1, 2), null], { cache });
+		expect(result2.shape).toStrictEqual([1, 2]);
+	});
+});
diff --git a/packages/zarrita/src/indexing/get.ts b/packages/zarrita/src/indexing/get.ts
index 56d19e28..1608cf16 100644
--- a/packages/zarrita/src/indexing/get.ts
+++ b/packages/zarrita/src/indexing/get.ts
@@ -4,6 +4,7 @@ import { type Array, get_context } from "../hierarchy.js";
 import type { Chunk, DataType, Scalar, TypedArray } from "../metadata.js";
 import { BasicIndexer } from "./indexer.js";
 import type {
+	ChunkCache,
 	GetOptions,
 	Prepare,
 	SetFromChunk,
@@ -12,6 +13,32 @@ import type {
 } from "./types.js";
 import { create_queue } from "./util.js";
 
+const NULL_CACHE: ChunkCache = {
+	get: () => undefined,
+	set: () => {},
+};
+
+// WeakMap to assign unique IDs to store instances
+const storeIdMap = new WeakMap();
+let storeIdCounter = 0;
+
+function getStoreId(store: Readable): string {
+	if (!storeIdMap.has(store)) {
+		storeIdMap.set(store, storeIdCounter++);
+	}
+	return `store_${storeIdMap.get(store)}`;
+}
+
+function createCacheKey(
+	arr: Array<DataType, Readable>,
+	chunk_coords: number[],
+): string {
+	let context = get_context(arr);
+	let chunkKey = context.encode_chunk_key(chunk_coords);
+	let storeId = getStoreId(arr.store);
+	return `${storeId}:${arr.path}:${chunkKey}`;
+}
+
 function unwrap<D extends DataType>(
 	arr: TypedArray<D>,
 	idx: number,
@@ -50,11 +77,29 @@ export async function get<
 	);
 	let queue = opts.create_queue?.() ?? create_queue();
+	let cache = opts.cache ?? NULL_CACHE;
 
 	for (const { chunk_coords, mapping } of indexer) {
 		queue.add(async () => {
-			let { data, shape, stride } = await arr.getChunk(chunk_coords, opts.opts);
-			let chunk = setter.prepare(data, shape, stride);
-			setter.set_from_chunk(out, chunk, mapping);
+			let cacheKey = createCacheKey(arr, chunk_coords);
+			let cachedChunk = cache.get(cacheKey);
+
+			if (cachedChunk) {
+				let chunk = setter.prepare(
+					cachedChunk.data as TypedArray<D>,
+					cachedChunk.shape,
+					cachedChunk.stride,
+				);
+				setter.set_from_chunk(out, chunk, mapping);
+			} else {
+				let chunkData = await arr.getChunk(chunk_coords, opts.opts);
+				cache.set(cacheKey, chunkData);
+				let chunk = setter.prepare(
+					chunkData.data,
+					chunkData.shape,
+					chunkData.stride,
+				);
+				setter.set_from_chunk(out, chunk, mapping);
+			}
 		});
 	}
 
diff --git a/packages/zarrita/src/indexing/types.ts b/packages/zarrita/src/indexing/types.ts
index 6bb3a1d6..24fb43fb 100644
--- a/packages/zarrita/src/indexing/types.ts
+++ b/packages/zarrita/src/indexing/types.ts
@@ -34,6 +34,11 @@ export type SetFromChunk<D extends DataType, Arr extends Chunk<D>> = (
 	proj: Projection[],
 ) => void;
 
+export interface ChunkCache {
+	get(key: string): Chunk<DataType> | undefined;
+	set(key: string, value: Chunk<DataType>): void;
+}
+
 export type Setter<D extends DataType, Arr extends Chunk<D>> = {
 	prepare: Prepare<D, Arr>;
 	set_from_chunk: SetFromChunk<D, Arr>;
@@ -42,6 +47,7 @@ export type Setter<D extends DataType, Arr extends Chunk<D>> = {
 
 export type Options = {
 	create_queue?: () => ChunkQueue;
+	cache?: ChunkCache;
 };
 
 export type GetOptions<O> = Options & { opts?: O };
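Not part of the diff above, but a usage sketch for reviewers: with this change, any object exposing string-keyed `get`/`set` (the new `ChunkCache` shape) can be passed to `get` through the `cache` option. The sketch assumes the same option is accepted by the public `zarr.get` named in the changeset; the `FetchStore` URL and the 2-D slicing are placeholders.

```ts
import * as zarr from "zarrita";

// A plain Map satisfies the cache contract (string key -> decoded chunk).
// It grows without bound; an LRU with the same get/set surface also works.
const cache = new Map();

// Placeholder store and array; substitute a real URL/path.
const store = new zarr.FetchStore("http://localhost:8080/data.zarr");
const arr = await zarr.open.v3(store, { kind: "array" });

// First read decodes chunks and fills the cache.
const full = await zarr.get(arr, null, { cache });

// Overlapping reads reuse cached chunks instead of re-fetching and decoding.
// (Assumes a 2-D array for the slice selection.)
const firstRow = await zarr.get(arr, [zarr.slice(0, 1), null], { cache });

console.log(full.shape, firstRow.shape, cache.size);
```

Because each cache key combines a per-store ID, the array path, and the encoded chunk key, a single cache instance can be shared across arrays and stores, which is what the "different arrays" and "multiple stores" tests exercise.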