diff --git a/CLAUDE.md b/CLAUDE.md index 15649c0..5d3bb9a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co ## Project Overview -**warp_cache** — a thread-safe Python caching decorator backed by a Rust extension (PyO3 + maturin). Provides LRU/MRU/FIFO/LFU eviction, TTL support, async awareness, and a cross-process shared memory backend. +**warp_cache** — a thread-safe Python caching decorator backed by a Rust extension (PyO3 + maturin). Uses SIEVE eviction, with TTL support, async awareness, and a cross-process shared memory backend. ## Build & Test Commands @@ -35,30 +35,30 @@ make test PYTHON=3.13 # Specific version ### Rust core (`src/`) - **`lib.rs`** — PyO3 module entry, exports `CachedFunction`, `SharedCachedFunction`, info types -- **`store.rs`** — In-process backend: `CachedFunction` wraps `parking_lot::RwLock`. The `__call__` method does the entire cache lookup in Rust (hash → lookup → equality check → LRU reorder → return) in a single FFI crossing +- **`store.rs`** — In-process backend: `CachedFunction` uses a sharded `hashbrown::HashMap` + `parking_lot::RwLock` per shard (read lock for cache hits, write lock for misses/eviction). 
The `__call__` method does the entire cache lookup in Rust (hash → shard select → read lock → lookup → equality check → SIEVE visited update → return) in a single FFI crossing - **`serde.rs`** — Fast-path binary serialization for common primitives (None, bool, int, float, str, bytes, flat tuples); avoids pickle overhead for the shared backend -- **`shared_store.rs`** — Cross-process backend: `SharedCachedFunction` serializes via serde.rs (with pickle fallback), stores in mmap'd shared memory -- **`entry.rs`** — `CacheEntry` { value, created_at, frequency } +- **`shared_store.rs`** — Cross-process backend: `SharedCachedFunction` holds `ShmCache` directly (no Mutex), with cached `max_key_size`/`max_value_size` fields and a pre-built `ahash::RandomState`. Serializes via serde.rs (with pickle fallback), stores in mmap'd shared memory +- **`entry.rs`** — `CacheEntry` { value, created_at, visited } - **`key.rs`** — `CacheKey` wraps `Py` + precomputed hash; uses raw `ffi::PyObject_RichCompareBool` for equality (safe because called inside `#[pymethods]` where GIL is held) -- **`strategies/`** — Enum-based static dispatch (`StrategyEnum`) over LRU/MRU/FIFO/LFU (avoids `Box` overhead). LRU uses `hashlink::LruCache` - **`shm/`** — Shared memory infrastructure: - - `mod.rs` — `ShmCache`: create/open, get/set with serialized bytes + - `mod.rs` — `ShmCache`: create/open, get/set with serialized bytes. Uses interior mutability (`&self` methods): reads are lock-free (seqlock), writes acquire seqlock internally. 
`next_unique_id` is `AtomicU64` - `layout.rs` — Header + SlotHeader structs, memory offsets - `region.rs` — `ShmRegion`: mmap file management (`$TMPDIR/warp_cache/{name}.cache`) - `lock.rs` — `ShmSeqLock`: seqlock (optimistic reads + TTAS spinlock) in shared memory - `hashtable.rs` — Open-addressing with linear probing (power-of-2 capacity, bitmask) - - `ordering.rs` — Eviction ordering state in shared memory + - `ordering.rs` — SIEVE eviction: intrusive linked list + `sieve_evict()` hand scan ### Python layer (`warp_cache/`) -- **`_decorator.py`** — `cache()` factory: dispatches to `CachedFunction` (memory) or `SharedCachedFunction` (shared). Auto-detects async functions and wraps with `AsyncCachedFunction` (cache hit in Rust, only misses `await` the coroutine). Also exports `lru_cache()` — a convenience shorthand for `cache(strategy=Strategy.LRU, ...)` -- **`_strategies.py`** — `Strategy(IntEnum)`: LRU=0, MRU=1, FIFO=2, LFU=3 +- **`_decorator.py`** — `cache()` factory: dispatches to `CachedFunction` (memory) or `SharedCachedFunction` (shared). Auto-detects async functions and wraps with `AsyncCachedFunction` (cache hit in Rust, only misses `await` the coroutine) +- **`_strategies.py`** — `Backend(IntEnum)`: MEMORY=0, SHARED=1 ### Key design decisions - **Single FFI crossing**: entire cache lookup happens in Rust `__call__`, no Python wrapper overhead - **Release profile**: fat LTO + `codegen-units=1` for cross-crate inlining of PyO3 wrappers -- **Thread safety**: `parking_lot::RwLock` (~8ns uncontended) for in-process backend; seqlock (optimistic reads + TTAS spinlock) for shared backend. Enables true parallel reads under free-threaded Python (3.13t+) +- **SIEVE eviction**: unified across both backends. On hit, sets `visited=1` (single-word store). On evict, hand scans for unvisited entry. 
Reads are lock-free on the shared backend and take only a per-shard read lock on the in-process backend +- **Thread safety**: sharded `hashbrown::HashMap` + `parking_lot::RwLock` per shard (read lock for hits, write lock for misses) for in-process backend; seqlock (optimistic reads + TTAS spinlock) for shared backend — no Mutex, `ShmCache` uses `&self` methods with interior mutability. Cache hits only acquire a cheap per-shard read lock (memory) or are fully lock-free (shared). Enables true parallel reads across shards under free-threaded Python (3.13t+) ## Critical Invariants diff --git a/Cargo.lock b/Cargo.lock index a5bc48f..9d157da 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -15,6 +15,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "bitflags" version = "2.10.0" @@ -27,6 +33,18 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "getrandom" version = "0.3.4" @@ -41,20 +59,13 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.5" +version = "0.15.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ - "ahash", -] - -[[package]] -name = "hashlink" -version = "0.9.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" -dependencies = [ - "hashbrown", + "allocator-api2", + "equivalent", + "foldhash", ] [[package]] @@ -259,7 +270,7 @@ name = "warp_cache_rs" version = "0.0.0" dependencies = [ "ahash", - "hashlink", + "hashbrown", "libc", "memmap2", "parking_lot", diff --git a/Cargo.toml b/Cargo.toml index c57efaf..a59589f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ crate-type = ["cdylib"] [dependencies] pyo3 = { version = "0.28.1", features = ["extension-module"] } parking_lot = "0.12" -hashlink = "0.9" +hashbrown = "0.15" [target.'cfg(not(target_os = "windows"))'.dependencies] memmap2 = "0.9" diff --git a/Makefile b/Makefile index 6c122f1..184da34 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help fmt lint typecheck build build-debug test test-rust test-only bench bench-quick bench-all bench-report clean publish publish-test setup all +.PHONY: help fmt lint typecheck build build-debug test test-rust test-only bench bench-quick bench-all bench-report bench-sieve clean publish publish-test setup all # Optional: specify Python version, e.g. make build PYTHON=3.14 PYTHON ?= @@ -62,6 +62,9 @@ bench-quick: build ## Quick benchmarks (skip sustained/TTL) bench-all: ## Run benchmarks across Python versions + generate report bash benchmarks/bench_all.sh +bench-sieve: build ## Run SIEVE eviction quality benchmarks + uv run $(UV_PYTHON) python benchmarks/bench_sieve.py + bench-report: ## Generate report from existing results uv run python benchmarks/_report_generator.py diff --git a/README.md b/README.md index 143115b..13f5dbf 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,21 @@ # warp_cache -A thread-safe Python caching decorator backed by a Rust extension. 
Through a -series of optimizations — eliminating serialization, moving the call wrapper -into Rust, applying link-time optimization, and using direct C API calls — we -achieve **0.55-0.66x** of `lru_cache`'s single-threaded throughput while -providing native thread safety that delivers **1.3-1.4x** higher throughput -under concurrent load — and **18-24x** faster than pure-Python `cachetools`. +A thread-safe Python caching decorator backed by a Rust extension. Uses +**SIEVE eviction** for scan-resistant, near-optimal hit rates with per-shard +read locks. The entire cache lookup happens in a single Rust `__call__` — no Python +wrapper overhead. **13-20M ops/s** single-threaded, **22x** faster than +`cachetools`, with a cross-process shared memory backend reaching **9.2M ops/s**. ## Features -- **Drop-in replacement for `functools.lru_cache`** — same decorator pattern and hashable-argument requirement, with added thread safety, TTL, eviction strategies, and async support -- **Thread-safe** out of the box (`parking_lot::RwLock` in Rust) +- **Drop-in replacement for `functools.lru_cache`** — same decorator pattern and hashable-argument requirement, with added thread safety, TTL, and async support +- **[SIEVE eviction](https://junchengyang.com/publication/nsdi24-SIEVE.pdf)** — a simple, scan-resistant algorithm with near-optimal hit rates and O(1) overhead per access +- **Thread-safe** out of the box (sharded `RwLock` + `AtomicBool` for SIEVE visited bit) - **Async support**: works with `async def` functions — zero overhead on sync path -- **Shared memory backend**: cross-process caching via mmap -- **Multiple eviction strategies**: LRU, MRU, FIFO, LFU +- **Shared memory backend**: cross-process caching via mmap with fully lock-free reads - **TTL support**: optional time-to-live expiration - **Single FFI crossing**: entire cache lookup happens in Rust, no Python wrapper overhead -- **12-18M ops/s** single-threaded, **16M ops/s** under concurrent load, **18-24x** faster 
than `cachetools` +- **13-20M ops/s** single-threaded, **17M+ ops/s** under concurrent load, **22x** faster than `cachetools` ## Installation @@ -59,20 +58,39 @@ Like `lru_cache`, all arguments must be hashable. See the [usage guide](docs/usa | Metric | warp_cache | cachetools | lru_cache | |---|---|---|---| -| Single-threaded | 12-18M ops/s | 0.6-1.2M ops/s | 21-40M ops/s | -| Multi-threaded (8T) | 16M ops/s | 770K ops/s (with Lock) | 12M ops/s (with Lock) | -| Thread-safe | Yes (RwLock) | No (manual Lock) | No | +| Single-threaded (cache=256) | 18.1M ops/s | 814K ops/s | 32.1M ops/s | +| Multi-threaded (8T) | 17.9M ops/s | 774K ops/s (with Lock) | 12.3M ops/s (with Lock) | +| Shared memory (single proc) | 9.2M ops/s (mmap) | No | No | +| Shared memory (4 procs) | 7.5M ops/s total | No | No | +| Thread-safe | Yes (sharded RwLock) | No (manual Lock) | No | | Async support | Yes | No | No | -| Cross-process (shared) | ~7.8M ops/s (mmap) | No | No | | TTL support | Yes | Yes | No | -| Eviction strategies | LRU, MRU, FIFO, LFU | LRU, LFU, FIFO, RR | LRU only | +| Eviction | SIEVE (scan-resistant) | LRU, LFU, FIFO, RR | LRU only | | Implementation | Rust (PyO3) | Pure Python | C (CPython) | -Under concurrent load, `warp_cache` delivers **1.3-1.4x** higher throughput than `lru_cache + Lock` and **18-24x** higher than `cachetools`. See [full benchmarks](docs/performance.md) for details. +`warp_cache` is the fastest *thread-safe* cache — **22x** faster than `cachetools` and **4.9x** faster than `moka_py`. Under multi-threaded load, it's **1.5x faster** than `lru_cache + Lock`. See [full benchmarks](docs/performance.md) for details. + + + + Multi-thread scaling: GIL vs no-GIL + + +## Eviction quality: SIEVE vs LRU + +Beyond throughput, SIEVE delivers **up to 21.6% miss reduction** vs LRU. 
The table below shows hit rates for key scenarios from the [NSDI'24 paper](https://junchengyang.com/publication/nsdi24-SIEVE.pdf), reproduced in `benchmarks/bench_sieve.py` (1M requests, Zipf-distributed keys): + +| Workload | SIEVE hit rate | LRU hit rate | Miss Reduction | +|---|---:|---:|---:| +| Zipf, 10% cache | 74.5% | 67.5% | +21.6% | +| Scan resistance (70% hot) | 69.9% | 63.5% | +17.6% | +| One-hit wonders (25% unique) | 53.9% | 43.7% | +18.1% | +| Working set shift | 75.5% | 69.7% | +16.6% | + +SIEVE's visited-bit design protects hot entries from sequential scans and filters out one-hit wonders that would pollute LRU. See [eviction quality benchmarks](docs/performance.md#sieve-eviction-quality) for the full breakdown. ## Documentation -- **[Usage guide](docs/usage.md)** — eviction strategies, async, TTL, shared memory, decorator parameters +- **[Usage guide](docs/usage.md)** — SIEVE eviction, async, TTL, shared memory, decorator parameters - **[Performance](docs/performance.md)** — benchmarks, architecture deep-dive, optimization journey - **[Alternatives](docs/alternatives.md)** — comparison with cachebox, moka-py, cachetools, lru_cache - **[Examples](examples/)** — runnable scripts for every feature (`uv run examples/.py`) diff --git a/benchmarks/COMPARISON.md b/benchmarks/COMPARISON.md index 68c0e6c..4054bfa 100644 --- a/benchmarks/COMPARISON.md +++ b/benchmarks/COMPARISON.md @@ -1,224 +1,242 @@ -# warp_cache vs lru_cache vs moka_py +# warp_cache vs lru_cache vs moka_py vs cachebox -A deep comparison of three Python caching libraries: **warp_cache** (Rust/PyO3), **lru_cache** (CPython builtin), and **moka_py** (Rust/PyO3, port of Java's Caffeine). +A head-to-head comparison of four Python caching libraries, all benchmarked on the same machine, same workload, same measurement methodology. 
-*Benchmarks: Apple M-series (arm64), Zipf-distributed keys (2000 unique), 100K ops per config, `time.perf_counter()`.* +*Environment: Python 3.13.2, Apple M-series (arm64), Zipf-distributed keys (2000 unique), 100K ops per config, `time.perf_counter()`.* --- ## TL;DR -| Scenario | warp_cache | lru_cache | moka_py | -|---|---:|---:|---:| -| Single-thread (3.13, cache=256) | 16.3M ops/s | 29.8M ops/s | 3.9M ops/s | -| Single-thread (3.13t, no GIL) | 14.0M ops/s | 21.5M ops/s | 3.3M ops/s | -| Multi-thread 8T (3.13, GIL) | 16.4M ops/s | 12.6M ops/s (+Lock) | 3.8M ops/s | -| Multi-thread 8T (3.13t, no GIL) | 12.7M ops/s | 9.0M ops/s (+Lock) | 3.0M ops/s | -| Shared memory (3.13, single proc) | 7.8M ops/s | N/A | N/A | -| Thread-safe (builtin) | Yes | No | Yes | -| Async support | Yes | No | No | -| Cross-process shared memory | Yes | No | No | -| Eviction strategies | LRU/MRU/FIFO/LFU | LRU | LRU/LFU/FIFO | -| TTL support | Yes | No | Yes | - -**Bottom line:** `lru_cache` is fastest single-threaded (it's C code inside CPython with zero overhead). `warp_cache` is the fastest *thread-safe* cache, 1.3x faster than `lru_cache+Lock` under the GIL and 1.4x faster without it. `moka_py` is 4-5x slower than `warp_cache` despite also being Rust. The shared memory backend reaches ~7.8M ops/s via seqlock-based optimistic reads — only ~2x slower than the in-process backend. 
+| | warp_cache | lru_cache | moka_py | cachebox | +|---|---:|---:|---:|---:| +| **Single-thread (cache=256)** | **18.1M ops/s** | **32.1M ops/s** | 3.7M ops/s | 1.5M ops/s | +| **Multi-thread 8T** | **17.9M ops/s** | 12.3M ops/s (+Lock) | 3.6M ops/s | 1.5M ops/s | +| **Sustained (10s)** | **7.9M ops/s** | **10.2M ops/s** | 2.8M ops/s | 1.3M ops/s | +| Shared memory | 9.2M ops/s | N/A | N/A | N/A | +| Implementation | Rust (PyO3) | C (CPython) | Rust (PyO3, moka) | Rust (PyO3) | +| Thread-safe (builtin) | Yes | No | Yes | Yes | +| Eviction | SIEVE | LRU | TinyLFU / LRU | LRU / LFU / FIFO / RR | +| TTL support | Yes | No | Yes (+ TTI) | Yes (TTLCache, VTTLCache) | ---- +**Bottom line:** `lru_cache` is fastest single-threaded — it's C code inside CPython with zero lock overhead. Among thread-safe caches, `warp_cache` leads at **18.1M ops/s** — 4.9x faster than `moka_py` and 12x faster than `cachebox`. Under multi-threaded load, `warp_cache` is **1.5x faster** than `lru_cache + Lock`. All three Rust libraries provide builtin thread safety, but with very different performance characteristics. Only `warp_cache` offers cross-process shared memory. -## The GIL Question +--- -### What the GIL means for caching +## The libraries -Python's Global Interpreter Lock (GIL) serializes all Python bytecode execution. This has two consequences for caches: +| Library | What it is | PyPI | +|---|---|---| +| **[warp_cache](https://github.com/toloco/warp_cache)** | Rust/PyO3 caching decorator with SIEVE eviction, shared memory backend, and single-FFI-crossing architecture | `pip install warp_cache` | +| **[lru_cache](https://docs.python.org/3/library/functools.html#functools.lru_cache)** | CPython builtin LRU cache decorator, implemented in C. Zero dependencies, zero overhead, but not thread-safe | (builtin) | +| **[moka-py](https://github.com/deliro/moka-py)** | Rust port of Java's Caffeine cache with TinyLFU admission. 
Offers both decorator and manual cache object APIs | `pip install moka-py` | +| **[cachebox](https://github.com/awolverp/cachebox)** | Rust/PyO3 with 7 cache types (LRU, LFU, FIFO, RR, TTL, VTTL, plain). Dictionary-like API with decorator support | `pip install cachebox` | -1. **`lru_cache` doesn't need a lock.** The GIL guarantees that only one thread touches the cache at a time. This is why it's so fast — zero synchronization overhead. +--- -2. **Thread-safe caches pay a tax.** Any cache that uses its own lock (like `warp_cache`'s `parking_lot::RwLock`) pays ~8ns per operation even though the GIL already serializes access. This is the price of correctness under free-threaded Python. +## Feature matrix + +| Feature | warp_cache | lru_cache | moka_py | cachebox | +|---|:---:|:---:|:---:|:---:| +| Implementation | Rust (PyO3) | C (CPython) | Rust (PyO3) | Rust (PyO3) | +| Thread-safe (builtin) | Yes (lock-free reads) | No | Yes | Yes | +| Async support | Yes (auto-detect) | No | Yes (`@cached`) | Yes (`@cached`) | +| Cross-process shared memory | Yes (mmap) | No | No | No | +| TTL support | Yes | No | Yes | Yes | +| TTI (time-to-idle) | No | No | Yes | No | +| Per-entry TTL | No | No | Yes | Yes (VTTLCache) | +| Eviction strategies | SIEVE | LRU | TinyLFU, LRU | LRU, LFU, FIFO, RR | +| Stampede prevention | No | No | Yes (`get_with`) | Yes | +| Eviction listener | No | No | Yes | No | +| Cache statistics | Yes (hits/misses) | Yes (hits/misses) | No | Yes (hits/misses + memory) | +| `cache_clear()` | Yes | Yes | Yes | Yes | +| Manual cache object | No (decorator only) | No (decorator only) | Yes (`Moka(...)`) | Yes (dict-like API) | +| Copy-on-return | No | No | No | Yes (configurable) | +| Decorator API | `@cache()` | `@lru_cache()` | `@cached()` | `@cached(Cache())` | +| Free-threaded Python ready | Yes | No (needs Lock) | Yes | Yes | +| Python versions | 3.10+ | Any | 3.9+ | 3.9+ | -### Free-threaded Python 3.13t +--- -Python 3.13 introduced an experimental 
free-threaded mode (`python3.13t`) that disables the GIL entirely. This changes the equation: +## Single-thread performance -- **`lru_cache` becomes unsafe.** Without the GIL, concurrent reads/writes corrupt internal state. You *must* wrap it in `threading.Lock()`, adding contention overhead. -- **`warp_cache` already has the lock.** Its `RwLock` enables true parallel reads across cores — multiple threads can read the cache simultaneously with no contention. -- **Everything gets ~15-20% slower.** Atomic reference counting (replacing the GIL's implicit refcount protection) adds overhead to all Python objects. This affects every library equally. +Cache hit throughput across different cache sizes, Zipf-distributed keys: -### warp_cache's read-lock architecture +| Cache Size | warp_cache | lru_cache | moka_py | cachebox | +|---:|---:|---:|---:|---:| +| 32 | 13.0M | 19.6M | 3.1M | 1.1M | +| 64 | 15.0M | 22.3M | 3.3M | 1.2M | +| 128 | 16.6M | 25.7M | 3.4M | 1.3M | +| 256 | 18.1M | 32.1M | 3.7M | 1.5M | +| 512 | 18.6M | 34.5M | 4.1M | 1.8M | +| 1024 | 19.9M | 39.5M | 4.4M | 2.4M | -The key insight: most cache operations are **reads** (cache hits). `warp_cache` uses a read-write lock where cache hits only acquire a *read lock* — multiple threads read simultaneously. Only cache misses require a write lock. + + + Single-Thread Throughput + -This means under real workloads with high hit rates (typical for caches), contention is near-zero even with many threads. +### Why is lru_cache fastest? ---- +`lru_cache` is C code inside CPython. It pays no thread-safety overhead (the GIL provides implicit safety), no PyO3 dispatch overhead, and no reference counting overhead. It simply cannot be beaten by an extension module under the GIL. -## Single-Thread Performance +### Why is warp_cache 4.9x faster than moka_py? -![Single-Thread Throughput](results/comparison_st_throughput.png) +Both are Rust + PyO3, yet `warp_cache` is significantly faster. 
The differences: -### Why lru_cache wins single-threaded +1. **Single FFI crossing.** `warp_cache` does the entire lookup — hash, find, equality check, SIEVE visited update, return — in one Rust `__call__`. `moka_py` crosses the FFI boundary multiple times. -`lru_cache` is unbeatable in single-threaded scenarios because it pays almost nothing: +2. **SIEVE eviction.** Cache hits just set a `visited` bit (a single-word store). No linked-list reordering, no frequency counter updates on the hot path. -| Operation | lru_cache (C) | warp_cache (Rust) | Delta | -|---|---:|---:|---:| -| Call dispatch (`tp_call`) | ~5ns | ~10ns | +5ns | -| Hash args (`PyObject_Hash`) | ~15ns | ~15ns | 0 | -| Table lookup + key equality | ~10ns | ~12ns | +2ns | -| LRU reorder (linked list) | ~5ns | ~8ns | +3ns | -| **Lock acquire + release** | **0ns** | **~8ns** | **+8ns** | -| Refcount management | ~2ns | ~5ns | +3ns | -| Return value | ~2ns | ~2ns | 0 | -| **Total** | **~39ns** | **~60ns** | **+21ns (~34ns measured)** | +3. **Precomputed hash + raw C equality.** `CacheKey` stores the Python hash once and uses `ffi::PyObject_RichCompareBool` directly — the same raw C call that `lru_cache` uses. -The three categories of overhead: +4. **No serialization.** The in-memory backend stores `Py` directly. No copies. -1. **Irreducible: Thread safety lock (~8ns)** — `lru_cache` pays nothing because the GIL provides implicit safety. `warp_cache` pays ~8ns for `parking_lot::RwLock`. Cannot be eliminated without removing thread safety. +### Why is cachebox slower than moka_py? -2. **Structural: PyO3 call dispatch (~5ns)** — PyO3's `tp_call` shim extracts GIL tokens, validates and converts argument pointers. `lru_cache` receives raw `PyObject*` directly. Inherent to using a safe FFI layer. +Despite both being Rust + PyO3, cachebox's `@cached` decorator adds more Python-level overhead. The LRU linked-list reordering on every hit is also more expensive than moka_py's deferred frequency tracking. 
cachebox's default `copy_level=1` (copy dict/list/set return values) adds additional overhead that the benchmarks measure. -3. **Marginal: Reference counting (~3ns)** — `lru_cache` uses the args tuple pointer as-is. `warp_cache` does `Py_INCREF` to own it in `CacheKey`, then `Py_DECREF` on drop. Cost of Rust's ownership model. +--- -### Why warp_cache beats moka_py 4-5x +## Multi-thread performance -Both are Rust + PyO3, yet `warp_cache` is **4.2x faster** (16.3M vs 3.9M ops/s on Python 3.13). The differences: +All thread-safe libraries used directly. `lru_cache` wrapped in `threading.Lock()`. -1. **Single FFI crossing.** `warp_cache` does the entire lookup — hash, find, equality check, LRU reorder, return — in one Rust `__call__`. `moka_py` crosses the FFI boundary multiple times. +| Threads | warp_cache | lru_cache + Lock | moka_py | cachebox | +|---:|---:|---:|---:|---:| +| 1 | 18.3M | 11.9M | 3.6M | 1.5M | +| 2 | 17.3M | 12.2M | 3.5M | 1.5M | +| 4 | 18.0M | 12.6M | 3.6M | 1.5M | +| 8 | 17.9M | 12.3M | 3.6M | 1.5M | +| 16 | 17.2M | 11.6M | 3.6M | 1.5M | +| 32 | 16.8M | 11.6M | 3.6M | 1.5M | +| 64 | 12.8M | 9.8M | 3.9M | 1.5M | +| 128 | 12.1M | 9.5M | 3.8M | 1.4M | -2. **Static dispatch.** `warp_cache` uses an `enum` over strategy types (`StrategyEnum`), allowing the compiler to inline and devirtualize. No `Box` indirection. + + + Multi-Thread Scaling + -3. **Precomputed hash + raw C equality.** `CacheKey` stores the Python hash once and uses `ffi::PyObject_RichCompareBool` directly — the same raw C call that `lru_cache` uses. +Under the GIL, `warp_cache` is **1.3-1.5x faster** than `lru_cache + Lock` across all thread counts. The sharded `RwLock` architecture means cache hits only acquire a cheap per-shard read lock (~8ns), while `lru_cache + Lock` must acquire a global `threading.Lock()` on every access. -4. **No serialization.** The in-memory backend stores `Py` directly. No pickle, no copies. 
+Under **free-threaded Python** (no GIL), `warp_cache`'s sharded `RwLock` enables true parallel reads across cores, while `lru_cache` must still acquire a real lock on every access. --- -## Multi-Thread Performance +## Sustained throughput -![Multi-Thread Scaling](results/comparison_mt_scaling.png) +10-second sustained benchmark (cache size = 256, Zipf-distributed keys): -![Scaling Efficiency](results/comparison_scaling_ratio.png) +| Library | ops/s | vs warp_cache | +|---|---:|---:| +| lru_cache | 10.2M | 1.3x faster | +| **warp_cache** | **7.9M** | **1.0x** | +| moka_py | 2.8M | 2.8x slower | +| cachebox | 1.3M | 6.1x slower | -### GIL mode (Python 3.13) +Sustained throughput is lower than burst throughput because it includes GC pauses, CPU frequency scaling, and cache-line effects over time. The relative ordering remains consistent. -Under the GIL, `warp_cache` maintains ~16M ops/s regardless of thread count. This is remarkable — adding threads doesn't slow it down because: +--- -- The `RwLock` is uncontended (the GIL serializes access anyway) -- Atomic hit/miss counters use `Ordering::Relaxed` — no memory barriers -- The deferred access log batches LRU updates, reducing write-lock contention +## TTL throughput -Meanwhile, `lru_cache + Lock` drops to ~12.6M ops/s. The `threading.Lock()` wrapper adds Python-level function call overhead on every access. +Cache size = 256, various TTL values (10-second sustained per configuration): -### No-GIL mode (Python 3.13t) +| TTL | warp_cache | moka_py | ratio | +|---|---:|---:|---:| +| 1ms | 6.7M | 2.5M | 2.7x | +| 10ms | 6.9M | 2.7M | 2.6x | +| 100ms | 6.9M | 2.7M | 2.6x | +| 1s | 7.0M | 2.6M | 2.7x | +| None | 6.9M | 2.7M | 2.6x | -Without the GIL, the story is similar but shifted down ~15-20% across the board due to atomic reference counting overhead: +TTL adds minimal overhead to `warp_cache` — the expiry timestamp is checked inline during the read path. 
`cachebox` is excluded from TTL benchmarks because its `TTLCache` uses FIFO eviction (not LRU-comparable). `lru_cache` does not support TTL. -- **warp_cache: ~12.7M ops/s** — stable across thread counts -- **lru_cache + Lock: ~9.0M ops/s** — degrades slightly with contention -- **moka_py: ~3.0M ops/s** — stable but slow +--- -`warp_cache` leads by **1.4x** over `lru_cache+Lock` — its RwLock architecture specifically benefits here because multiple readers can proceed in parallel without the GIL's serialization. +## Async throughput -### Why warp_cache doesn't scale *up* with threads +Cache hit throughput for `async def` cached functions (cache size = 256, Zipf-distributed keys): + +| Mode | warp_cache | moka_py | ratio | +|---|---:|---:|---:| +| Sync | 16.3M | 3.7M | 4.4x | +| Async | 5.6M | 3.2M | 1.7x | -Under the GIL, adding threads can't increase throughput because only one thread runs at a time. The GIL turns parallelism into concurrency. + + + Sync vs Async Throughput + -Under no-GIL, `warp_cache` could theoretically scale reads across cores. In practice, the benchmark workload is CPU-bound with very short operations (~70ns each), so thread scheduling overhead dominates any parallelism gains. For I/O-bound workloads with expensive cache misses, the read-lock architecture would show clear scaling benefits. +Async cache hits are slower than sync because every call creates and resolves a Python coroutine object, even though the actual cache lookup is synchronous Rust code. `warp_cache` async hits are still **1.7x faster** than `moka_py` async. The async overhead is dominated by CPython's coroutine machinery, not the cache itself — `warp_cache`'s `AsyncCachedFunction` calls the Rust `get()` synchronously and only `await`s the original function on cache miss. --- -## Why warp_cache Is Fast — Architecture Deep Dive +## Cross-process shared memory -### 1. 
Single FFI crossing +`warp_cache` is the only library in this comparison that supports cross-process caching via mmap'd shared memory. -The entire cache lookup happens in Rust's `__call__` method. Python calls `cached_fn(42)`, which enters Rust once and returns the cached value. No Python wrapper function, no intermediate objects. +| Backend | Throughput | Hit Rate | +|---|---:|---:| +| Memory (in-process) | 17.2M ops/s | 71.2% | +| Shared (mmap, single process) | 9.2M ops/s | 72.3% | +| Shared (mmap, 4 processes) | 7.5M ops/s total | — | +| Shared (mmap, 8 processes) | 6.6M ops/s total | — | + +The shared backend reaches **54% of in-process speed** with no Mutex on the read path. The gap is irreducible cross-process overhead: serialization (serde fast-path for primitives, pickle fallback), deterministic hashing, seqlock, and mmap copy. All shared reads are fully lock-free. + +This is orders of magnitude faster than network-based caches (Redis: ~100-500K ops/s over localhost) and requires no external services. + +--- + +## Architecture deep dive + +### Why warp_cache is fast ``` Python: fn(42) └─ tp_call (PyO3) ─────────────────────────────── one FFI crossing ├─ hash(args) via ffi::PyObject_Hash - ├─ HashMap lookup Rust hashbrown, precomputed hash + ├─ shard select hash % n_shards + ├─ RwLock::read() per-shard read lock (~8ns) + ├─ HashMap lookup hashbrown ├─ equality check via ffi::PyObject_RichCompareBool - ├─ RwLock (read) parking_lot, ~8ns uncontended + ├─ visited.store(true) AtomicBool, lock-free └─ return cached value ``` -### 2. Read-lock fast path + deferred access log - -Cache hits acquire only a **read lock** (`store.rs` lines 110-139). The LRU reorder is deferred — instead of immediately promoting the accessed key (which would require a write lock), the key is pushed to a bounded access log. The log is drained on the next cache miss (under a write lock). 
- -This means cache hits under high hit rates (~65%+ in these benchmarks) almost never contend for the write lock. - -### 3. Enum static dispatch - -Eviction strategies are dispatched via a Rust `enum` (`StrategyEnum`) with `#[inline(always)]` on every method. The compiler knows all variants at compile time and can inline the specific strategy code directly into the hot path. Compare with `Box`, which requires vtable indirection on every call. - -### 4. Precomputed hash + raw C API equality - -`CacheKey` computes `PyObject_Hash` once at key creation and stores the result. HashMap lookups use the precomputed hash directly. Key equality uses raw `ffi::PyObject_RichCompareBool` — the exact same C call that `lru_cache` uses — bypassing PyO3's safe-but-slower `Python::with_gil` wrapper. - -### 5. parking_lot::RwLock (~8ns) +1. **Single FFI crossing** — the entire lookup happens in Rust's `__call__` method. No Python wrapper function, no intermediate objects. +2. **SIEVE eviction** — cache hits set `visited = 1` (one store). No linked-list reordering. Eviction scans for unvisited entries, giving visited entries a second chance. +3. **Sharded RwLock** — cache hits acquire a cheap per-shard read lock (~8ns). The write lock is only acquired on cache misses (eviction). Multiple readers proceed in parallel across shards. +4. **Fat LTO + codegen-units=1** — link-time optimization inlines PyO3's FFI wrappers into the hot path. -`parking_lot` provides a significantly faster mutex than `std::sync::RwLock` (~8ns vs ~25ns uncontended on arm64). It uses adaptive spinning before parking, reducing syscall overhead. +### How moka_py works -### 6. Fat LTO + codegen-units=1 +`moka_py` wraps Rust's `moka` crate (inspired by Java's Caffeine). It uses **W-TinyLFU** — a window + main cache with frequency sketches for admission filtering. This provides excellent hit rates but requires more bookkeeping per access. 
The Python `@cached` decorator crosses the FFI boundary for both key hashing and value retrieval. -The release profile enables fat link-time optimization across all crates (including PyO3) and forces single-codegen-unit compilation. This allows the compiler to inline PyO3's FFI wrappers directly into `warp_cache`'s hot path, eliminating call overhead at the boundary. +### How cachebox works -```toml -[profile.release] -lto = "fat" -codegen-units = 1 -``` +`cachebox` implements 7 different cache types in Rust using Google's SwissTable (`hashbrown`). The `@cached` decorator wraps a cache object instance. It defaults to copying dict/list/set return values (`copy_level=1`) to prevent mutation of cached data — a safety feature that adds overhead. Its thread safety uses internal locks. -### 7. Atomic hit/miss counters +### How lru_cache works -Hit and miss counts use `AtomicU64` with `Ordering::Relaxed` — no memory barriers, no cache-line bouncing on single-socket machines. Stats collection is essentially free. +`lru_cache` is C code compiled directly into CPython. It uses the GIL for implicit thread safety (zero lock overhead). The cache is a doubly-linked list over a C hash table — the simplest possible implementation with the lowest possible overhead. Under free-threaded Python, it needs an external `threading.Lock()`. --- -## Cross-Process Shared Memory - -![Backend Comparison](results/comparison_backends.png) - -`warp_cache` is the only library in this comparison that supports cross-process caching via mmap'd shared memory. This enables multiple Python processes to share a single cache without serialization overhead of Redis/Memcached. 
- -| Backend | Throughput | Use case | -|---|---:|---| -| Memory (in-process) | ~15.0M ops/s | Single process, maximum speed | -| Shared (mmap, single process) | ~7.8M ops/s | Cross-process capable, near lock-free reads | -| Shared (mmap, 8 processes) | ~1.9M ops/s total | Multiple concurrent processes | - -The shared backend uses: -- **mmap'd files** (`$TMPDIR/warp_cache/{name}.cache`) for zero-copy access -- **Seqlock** in shared memory for cross-process synchronization — reads are optimistic and lock-free (~10-20ns), only writes acquire a spinlock -- **Open-addressing hash table** with linear probing (power-of-2 capacity for bitmask) -- **Pickle serialization** for values (required for cross-process compatibility) -- **Atomic stats** — `hits`, `misses`, and `oversize_skips` use `AtomicU64`, so `info()` and `record_oversize_skip()` never acquire a lock - -The read path splits into two phases: (1) an optimistic lock-free hash lookup + value copy under the seqlock, retried if a writer was active; (2) a brief write lock only when ordering needs updating (LRU/MRU/LFU hit). FIFO cache hits are fully lock-free. TTL-expired entries are detected during the optimistic read, then cleaned up under the write lock with re-verification. - -The ~2x throughput gap between memory and shared backend is dominated by pickle serialization (the lock itself is near-free on the read path). The shared backend is still orders of magnitude faster than network-based caches (Redis: ~100-500K ops/s over localhost). 
- ---- +## When to use each -## Feature Matrix - -| Feature | warp_cache | lru_cache | moka_py | -|---|---|---|---| -| Implementation | Rust (PyO3) | C (CPython builtin) | Rust (PyO3) | -| Thread-safe (builtin) | Yes (`RwLock`) | No (needs `Lock` wrapper) | Yes | -| Async support | Yes (auto-detect) | No | No | -| Cross-process (shared mem) | Yes (mmap) | No | No | -| TTL support | Yes | No | Yes | -| LRU eviction | Yes | Yes | Yes | -| LFU eviction | Yes | No | Yes | -| FIFO eviction | Yes | No | Yes | -| MRU eviction | Yes | No | No | -| Cache statistics | Yes (hits/misses) | Yes (hits/misses) | No | -| `cache_clear()` | Yes | Yes | No | -| Decorator API | `@cache()` | `@lru_cache()` | `Moka(maxsize)` | -| Python version | 3.9+ | Any | 3.8+ | -| Free-threaded ready | Yes | No (needs Lock) | Yes | +| Use case | Recommendation | +|---|---| +| Single-threaded, maximum speed | **lru_cache** — unbeatable C code, zero overhead | +| Thread-safe, high throughput | **warp_cache** — fastest thread-safe cache by 2.8x+ | +| Cross-process (Gunicorn, Celery) | **warp_cache** — only option with shared memory | +| Per-entry TTL with stampede prevention | **cachebox** (VTTLCache) or **moka_py** (`get_with`) | +| Time-to-idle (TTI) expiration | **moka_py** — only option with TTI | +| Manual cache object API (no decorator) | **moka_py** (`Moka(...)`) or **cachebox** (dict-like) | +| Async with concurrent dedup | **moka_py** (`wait_concurrent=True`) | +| Free-threaded Python (no GIL) | **warp_cache**, **moka_py**, or **cachebox** — all three are ready | --- @@ -226,23 +244,20 @@ The ~2x throughput gap between memory and shared backend is dominated by pickle **Machine:** Apple M-series (arm64), macOS -**Python versions tested:** -- Python 3.12.0 (GIL) -- Python 3.13.2 (GIL) -- Python 3.13.2 free-threaded (no GIL) +**Python:** 3.13.2 (CPython, GIL enabled) -**Workload:** Zipf-distributed keys (alpha=1.0) over 2000 unique values, producing ~65% cache hit rate at maxsize=256. 
This models realistic access patterns where some keys are much hotter than others. +**Workload:** Zipf-distributed keys (alpha=1.0) over 2000 unique values, producing ~72% cache hit rate at maxsize=256. This models realistic access patterns where some keys are much hotter than others. -**Thread safety wrapping:** `lru_cache` and `cachetools` are not thread-safe, so multi-threaded benchmarks wrap them in `threading.Lock()`. `warp_cache` and `moka_py` are used directly (builtin thread safety). +**Thread safety wrapping:** `lru_cache` is not thread-safe, so multi-threaded benchmarks wrap it in `threading.Lock()`. `warp_cache`, `moka_py`, and `cachebox` are used directly (builtin thread safety). -**Timing:** `time.perf_counter()` with 100K operations per configuration. Sustained benchmarks run for 10 seconds. Results are the most recent run; variance across runs is typically <5%. +**Timing:** `time.perf_counter()` with 100K operations per burst configuration. Sustained benchmarks run for 10 seconds. Results are from a single run; variance across runs is typically <5%. -**Library versions:** warp_cache 0.1.0, moka_py 0.3.0, cachetools 7.0.1 +**Library versions:** warp_cache 0.1.0, moka_py 0.3.0, cachebox 5.2.2 -**Source data:** `benchmarks/results/bench_py3.12.json`, `bench_py3.13.json`, `bench_default.json` (3.13t) +**Source data:** `benchmarks/results/bench_default.json` -**Charts generated by:** `benchmarks/_generate_comparison_charts.py` +**Benchmark runner:** `benchmarks/_bench_runner.py` --- -*Generated from benchmark data. See `benchmarks/` for full source and raw results.* +*All benchmarks run on the same machine, same workload, same measurement methodology. 
See `benchmarks/` for full source and raw results.* diff --git a/benchmarks/_bench_runner.py b/benchmarks/_bench_runner.py index 1c81b9f..3fb95ca 100644 --- a/benchmarks/_bench_runner.py +++ b/benchmarks/_bench_runner.py @@ -9,6 +9,7 @@ """ import argparse +import asyncio import functools import json import platform @@ -36,6 +37,7 @@ class Contestant: name: str make_lru: Callable[[int], Callable] | None = None make_ttl: Callable[[int, float], Callable] | None = None + make_async_lru: Callable[[int], Callable] | None = None thread_safe: bool = False available: bool = False version: str = "" @@ -46,17 +48,22 @@ def _identity(x: int) -> int: return x +async def _async_identity(x: int) -> int: + return x + + def _build_contestants() -> list[Contestant]: contestants: list[Contestant] = [] # 1. warp_cache (always available — this is the project under test) - from warp_cache import Strategy, cache + from warp_cache import cache contestants.append( Contestant( name="warp_cache", - make_lru=lambda sz: cache(strategy=Strategy.LRU, max_size=sz)(_identity), - make_ttl=lambda sz, ttl: cache(strategy=Strategy.LRU, max_size=sz, ttl=ttl)(_identity), + make_lru=lambda sz: cache(max_size=sz)(_identity), + make_ttl=lambda sz, ttl: cache(max_size=sz, ttl=ttl)(_identity), + make_async_lru=lambda sz: cache(max_size=sz)(_async_identity), thread_safe=True, available=True, version="0.1.0", @@ -137,11 +144,19 @@ def fn(x: int) -> int: return fn + def _moka_async_lru(sz): + @moka_py.cached(maxsize=sz) + async def fn(x: int) -> int: + return x + + return fn + contestants.append( Contestant( name="moka_py", make_lru=_moka_lru, make_ttl=_moka_ttl, + make_async_lru=_moka_async_lru, thread_safe=True, available=True, version=getattr(moka_py, "VERSION", ""), @@ -238,12 +253,12 @@ def _time_loop(fn, keys: list[int]) -> float: def verify_correctness(n_ops: int = 50_000) -> bool: - from warp_cache import Strategy, cache + from warp_cache import cache max_size = 256 num_keys = 500 - 
@cache(strategy=Strategy.LRU, max_size=max_size) + @cache(max_size=max_size) def fc_fn(x: int) -> int: return x * 7 + 3 @@ -432,14 +447,58 @@ def bench_ttl( # ═══════════════════════════════════════════════════════════════════════════ -# Benchmark 6 — Shared backend: single-process throughput +# Benchmark 6 — Async throughput +# ═══════════════════════════════════════════════════════════════════════════ + + +def bench_async_throughput( + contestants: list[Contestant], + cache_sizes: list[int], + n_ops: int = 100_000, +) -> dict: + """Benchmark async cached function throughput (cache hits via event loop).""" + num_keys = 2000 + keys = zipf_keys(n_ops, num_keys) + + async_contestants = [c for c in contestants if c.available and c.make_async_lru is not None] + results: dict[str, dict[str, float]] = {} + + for sz in cache_sizes: + sz_results: dict[str, float] = {} + for c in async_contestants: + fn = c.make_async_lru(sz) + + async def _run(f=fn): + for k in keys: + await f(k) + + t0 = time.perf_counter() + asyncio.run(_run()) + elapsed = time.perf_counter() - t0 + sz_results[c.name] = n_ops / elapsed + + results[str(sz)] = sz_results + + # Also measure sync for comparison (same contestants that have async) + sync_results: dict[str, float] = {} + for c in async_contestants: + fn = c.make_lru(256) + elapsed = _time_loop(fn, keys) + sync_results[c.name] = n_ops / elapsed + results["sync_256"] = sync_results + + return results + + +# ═══════════════════════════════════════════════════════════════════════════ +# Benchmark 7 — Shared backend: single-process throughput # ═══════════════════════════════════════════════════════════════════════════ def bench_shared_throughput( n_ops: int = 100_000, max_size: int = 256 ) -> dict[str, dict[str, float]]: - from warp_cache import Strategy, cache + from warp_cache import cache num_keys = 2000 keys = zipf_keys(n_ops, num_keys) @@ -447,7 +506,7 @@ def bench_shared_throughput( for backend in ("memory", "shared"): - 
@cache(strategy=Strategy.LRU, max_size=max_size, backend=backend) + @cache(max_size=max_size, backend=backend) def fn(x: int) -> int: return x @@ -477,7 +536,6 @@ def _mp_worker(args): fn = SharedCachedFunction( lambda x: x, - 0, 512, None, 512, @@ -515,7 +573,6 @@ def bench_multiprocess( _init_fn = SharedCachedFunction( lambda x: x, - 0, max_size, None, 512, @@ -564,7 +621,7 @@ def main() -> None: available = [c for c in contestants if c.available] unavailable = [c for c in contestants if not c.available] - total_steps = 5 if args.quick else 7 + total_steps = 6 if args.quick else 8 tag_suffix = " (free-threaded)" if info["gil_disabled"] else "" print(f"Python {info['version']}{tag_suffix} [{info['implementation']}]") @@ -629,15 +686,26 @@ def main() -> None: parts.append(f"{name}={fmt(d['ops_per_sec'])}") print(f" TTL={ttl_label}: {' '.join(parts)}") - # 6. Shared backend single-process + # 6. Async throughput step = 4 if args.quick else 6 + async_cache_sizes = [256] + print(f"\n[{step}/{total_steps}] Async throughput ...") + async_results = bench_async_throughput(contestants, async_cache_sizes) + for sz_label, sz_data in async_results.items(): + parts = [] + for name, ops in sz_data.items(): + parts.append(f"{name}={fmt(ops)}") + print(f" {sz_label}: {' '.join(parts)}") + + # 7. Shared backend single-process + step = 5 if args.quick else 7 print(f"\n[{step}/{total_steps}] Shared backend: memory vs shared ...") shared_tp_results = bench_shared_throughput() for backend, data in shared_tp_results.items(): print(f" {backend}: {data['ops_per_sec']:,.0f} ops/s hit_rate={data['hit_rate']:.1%}") - # 7. Multi-process scaling - step = 5 if args.quick else 7 + # 8. 
@dataclass
class Theme:
    """Colour palette for one chart variant (light or dark)."""

    # Variant name; used in output file names and to pick the stroke colour.
    name: str
    # Primary text colour (titles, axis labels, legend labels).
    text: str
    # Dimmed text colour (ticks, axis edges, reference lines).
    text_dim: str
    # Grid colour as an RGBA tuple -- the themes pass (r, g, b, a) floats,
    # not a hex string, so the annotation must not be ``str``.
    grid: tuple[float, float, float, float]
    # Per-library series colours.
    warp_cache: str
    lru_cache: str
    moka_py: str
    # Bar colours for the backend comparison chart, one per bar.
    backend_colors: list[str]
py313t: dict) -> None: - """Chart 1: Grouped bar — single-thread throughput at cache size 256.""" - datasets = [ - ("3.12", py312), - ("3.13", py313), - ("3.13t", py313t), - ] +def _save(fig: plt.Figure, name: str) -> None: + fig.savefig(RESULTS_DIR / name, format="svg", bbox_inches="tight", transparent=True) + plt.close(fig) + print(f" {name}") - fig, ax = plt.subplots(figsize=(8, 5)) +def chart_single_thread_throughput( + py312: dict, + py313: dict, + py313t: dict, + theme: Theme, +) -> None: + datasets = [("3.12", py312), ("3.13", py313), ("3.13t", py313t)] + colors = _colors(theme) + + fig, ax = plt.subplots(figsize=(8, 5)) n_groups = len(datasets) bar_width = 0.22 x_positions = range(n_groups) @@ -64,25 +148,32 @@ def chart_single_thread_throughput(py312: dict, py313: dict, py313t: dict) -> No tp = data["throughput"]["256"] values.append(_millions(tp.get(lib, 0))) offsets = [x + i * bar_width for x in x_positions] - ax.bar(offsets, values, bar_width, label=LABELS[lib], color=COLORS[lib]) + ax.bar(offsets, values, bar_width, label=LABELS[lib], color=colors[lib]) for x, v in zip(offsets, values, strict=True): - ax.text(x, v + 0.3, f"{v:.1f}M", ha="center", va="bottom", fontsize=8) + ax.text( + x, + v + 0.4, + f"{v:.1f}M", + ha="center", + va="bottom", + fontsize=9, + color=colors[lib], + ) ax.set_xlabel("Python Version") ax.set_ylabel("Throughput (M ops/s)") - ax.set_title("Single-Thread Throughput (cache size = 256)") + ax.set_title("Single-Thread Throughput (cache size = 256)", pad=10) ax.set_xticks([x + bar_width for x in x_positions]) ax.set_xticklabels([label for label, _ in datasets]) - ax.legend() - ax.set_ylim(bottom=0) + ax.legend(ncol=3, loc="upper center", bbox_to_anchor=(0.5, -0.12), frameon=False) + ax.set_ylim(bottom=0, top=38) + _style_ax(ax) fig.tight_layout() - fig.savefig(RESULTS_DIR / "comparison_st_throughput.png", dpi=DPI) - plt.close(fig) - print(" comparison_st_throughput.png") + _save(fig, f"comparison_st_throughput_{theme.name}.svg") 
-def chart_multithread_scaling(py313: dict, py313t: dict) -> None: - """Chart 2: Dual-panel line — multi-thread scaling (GIL vs no-GIL).""" +def chart_multithread_scaling(py313: dict, py313t: dict, theme: Theme) -> None: + colors = _colors(theme) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharey=True) panels = [ @@ -94,37 +185,37 @@ def chart_multithread_scaling(py313: dict, py313t: dict) -> None: th = data["threading"] thread_counts = sorted((int(k) for k in th), key=int) for lib in LIBS: - values = [] - for tc in thread_counts: - val = th[str(tc)].get(lib, 0) - values.append(_millions(val)) + values = [_millions(th[str(tc)].get(lib, 0)) for tc in thread_counts] if any(v > 0 for v in values): ax.plot( thread_counts, values, marker="o", + markersize=4, label=LABELS[lib], - color=COLORS[lib], + color=colors[lib], linewidth=2, ) ax.set_xlabel("Threads") ax.set_title(title) - ax.legend() ax.set_xscale("log", base=2) ax.set_xticks(thread_counts) ax.set_xticklabels([str(t) for t in thread_counts]) - ax.grid(axis="y", alpha=0.3) + _style_ax(ax) ax1.set_ylabel("Throughput (M ops/s)") - fig.suptitle("Multi-Thread Scaling", fontsize=14, y=1.02) + handles, labels = ax1.get_legend_handles_labels() + fig.legend( + handles, labels, ncol=len(LIBS), + loc="lower center", bbox_to_anchor=(0.5, -0.05), frameon=False, + ) + fig.suptitle("Multi-Thread Scaling", y=1.02) fig.tight_layout() - fig.savefig(RESULTS_DIR / "comparison_mt_scaling.png", dpi=DPI) - plt.close(fig) - print(" comparison_mt_scaling.png") + _save(fig, f"comparison_mt_scaling_{theme.name}.svg") -def chart_scaling_efficiency(py313: dict, py313t: dict) -> None: - """Chart 3: Dual-panel line — scaling ratio normalized to 1-thread baseline.""" +def chart_scaling_efficiency(py313: dict, py313t: dict, theme: Theme) -> None: + colors = _colors(theme) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharey=True) panels = [ @@ -139,37 +230,36 @@ def chart_scaling_efficiency(py313: dict, py313t: dict) -> None: 
baseline = th["1"].get(lib, 0) if baseline == 0: continue - ratios = [] - for tc in thread_counts: - val = th[str(tc)].get(lib, 0) - ratios.append(val / baseline) + ratios = [th[str(tc)].get(lib, 0) / baseline for tc in thread_counts] ax.plot( thread_counts, ratios, marker="o", + markersize=5, label=LABELS[lib], - color=COLORS[lib], + color=colors[lib], linewidth=2, ) - ax.axhline(y=1.0, color="gray", linestyle="--", alpha=0.5) + ax.axhline(y=1.0, color=theme.text_dim, linestyle="--", alpha=0.4, linewidth=1) ax.set_xlabel("Threads") ax.set_title(title) - ax.legend() ax.set_xscale("log", base=2) ax.set_xticks(thread_counts) ax.set_xticklabels([str(t) for t in thread_counts]) - ax.grid(axis="y", alpha=0.3) + _style_ax(ax) ax1.set_ylabel("Scaling Ratio (vs 1 thread)") - fig.suptitle("Multi-Thread Scaling Efficiency", fontsize=14, y=1.02) + handles, labels = ax1.get_legend_handles_labels() + fig.legend( + handles, labels, ncol=len(LIBS), + loc="lower center", bbox_to_anchor=(0.5, -0.05), frameon=False, + ) + fig.suptitle("Multi-Thread Scaling Efficiency", y=1.02) fig.tight_layout() - fig.savefig(RESULTS_DIR / "comparison_scaling_ratio.png", dpi=DPI) - plt.close(fig) - print(" comparison_scaling_ratio.png") + _save(fig, f"comparison_scaling_ratio_{theme.name}.svg") -def chart_backends(py313: dict) -> None: - """Chart 4: Grouped bar (log y) — memory / shared / multi-process backends.""" +def chart_backends(py313: dict, theme: Theme) -> None: categories = ["Memory\n(in-process)", "Shared\n(mmap)", "Multi-process\n(8 workers)"] values = [ _millions(py313["shared_throughput"]["memory"]["ops_per_sec"]), @@ -178,29 +268,77 @@ def chart_backends(py313: dict) -> None: ] fig, ax = plt.subplots(figsize=(7, 5)) - bars = ax.bar(categories, values, color=[COLORS["warp_cache"], "#60a5fa", "#93c5fd"]) + bars = ax.bar(categories, values, color=theme.backend_colors) for bar, val in zip(bars, values, strict=True): - label = f"{val:.2f}M" if val >= 0.1 else f"{val * 1000:.0f}K" + label = 
def chart_async_throughput(py313: dict, py313t: dict, theme: Theme) -> None:
    """Draw the sync-vs-async grouped bar chart for warp_cache and moka_py.

    Four groups are plotted (3.13 sync, 3.13 async, 3.13t sync, 3.13t async),
    each read from the ``"async_throughput"`` section of the given result
    dicts; missing sections or entries fall back to 0. Only bars with a value
    greater than zero get a text label. The figure is written as
    ``comparison_async_<theme name>.svg``.
    """
    palette = _colors(theme)
    libs = ["warp_cache", "moka_py"]
    legend_names = {"warp_cache": "warp_cache", "moka_py": "moka_py"}

    # (tick label, result dict, key inside "async_throughput")
    groups = [
        ("3.13\nsync", py313, "sync_256"),
        ("3.13\nasync", py313, "256"),
        ("3.13t\nsync", py313t, "sync_256"),
        ("3.13t\nasync", py313t, "256"),
    ]

    fig, ax = plt.subplots(figsize=(8, 5))
    width = 0.3
    slots = range(len(groups))

    for idx, lib in enumerate(libs):
        heights = [
            _millions(data.get("async_throughput", {}).get(key, {}).get(lib, 0))
            for _, data, key in groups
        ]
        lefts = [slot + idx * width for slot in slots]
        ax.bar(lefts, heights, width, label=legend_names[lib], color=palette[lib])
        for left, height in zip(lefts, heights, strict=True):
            if not height > 0:
                continue
            ax.text(
                left,
                height + 0.3,
                f"{height:.1f}M",
                ha="center",
                va="bottom",
                fontsize=9,
                color=palette[lib],
            )

    ax.set_xlabel("Python Version / Mode")
    ax.set_ylabel("Throughput (M ops/s)")
    ax.set_title("Sync vs Async Throughput (cache size = 256)", pad=10)
    # Centre each tick under its group of bars.
    center = width * (len(libs) - 1) / 2
    ax.set_xticks([slot + center for slot in slots])
    ax.set_xticklabels([tick for tick, _, _ in groups])
    ax.legend(ncol=2, loc="upper center", bbox_to_anchor=(0.5, -0.15), frameon=False)
    ax.set_ylim(bottom=0)
    _style_ax(ax)
    fig.tight_layout()
    _save(fig, f"comparison_async_{theme.name}.svg")
def _zipf_cum_weights(num_keys: int, alpha: float) -> list[float]:
    """Return cumulative Zipf weights ``1 / rank**alpha`` for ranks 1..num_keys."""
    from itertools import accumulate  # local import keeps this helper self-contained

    return list(accumulate(1.0 / ((i + 1) ** alpha) for i in range(num_keys)))


def zipf_keys(n: int, num_keys: int, alpha: float = 1.0, *, seed: int = 42) -> list[int]:
    """Generate *n* keys following a Zipf distribution with configurable skewness.

    Keys are drawn from ``range(num_keys)`` with weight ``1 / (key + 1)**alpha``
    using a ``random.Random(seed)`` generator, so equal arguments give equal
    output.
    """
    rng = random.Random(seed)
    return rng.choices(
        range(num_keys), cum_weights=_zipf_cum_weights(num_keys, alpha), k=n
    )


def scan_resistant_keys(
    n: int,
    hot_size: int,
    scan_size: int,
    hot_fraction: float,
    alpha: float = 1.0,
    *,
    seed: int = 42,
) -> list[int]:
    """Interleave Zipf-distributed hot keys with sequential scan keys.

    hot_fraction controls the mix: 1.0 = all hot, 0.0 = all scan.
    Scan keys are offset by hot_size so they don't overlap, and the scan
    sequence wraps around after scan_size keys.
    """
    rng = random.Random(seed)
    hot_population = range(hot_size)
    # Built once: passing weights= inside the loop would re-accumulate them on
    # every draw. cum_weights= yields the same draws from the same RNG state.
    hot_cum_weights = _zipf_cum_weights(hot_size, alpha)
    scan_seq = range(hot_size, hot_size + scan_size)
    scan_idx = 0

    keys: list[int] = []
    for _ in range(n):
        if rng.random() < hot_fraction:
            keys.append(rng.choices(hot_population, cum_weights=hot_cum_weights, k=1)[0])
        else:
            keys.append(scan_seq[scan_idx % len(scan_seq)])
            scan_idx += 1
    return keys


def one_hit_wonder_keys(
    n: int,
    num_reused_keys: int,
    ohw_ratio: float,
    alpha: float = 1.0,
    *,
    seed: int = 42,
) -> list[int]:
    """Mix Zipf-distributed reused keys with unique one-time keys.

    ohw_ratio: fraction of accesses that are unique (one-hit wonders).
    OHW keys start at num_reused_keys and increment, never repeating.
    """
    rng = random.Random(seed)
    reused_population = range(num_reused_keys)
    # Precomputed for the same reason as in scan_resistant_keys: avoids an
    # O(num_reused_keys) accumulation per generated key.
    reused_cum_weights = _zipf_cum_weights(num_reused_keys, alpha)
    ohw_counter = num_reused_keys

    keys: list[int] = []
    for _ in range(n):
        if rng.random() < ohw_ratio:
            keys.append(ohw_counter)
            ohw_counter += 1
        else:
            keys.append(
                rng.choices(reused_population, cum_weights=reused_cum_weights, k=1)[0]
            )
    return keys
+ """ + rng = random.Random(seed) + weights = [1.0 / ((i + 1) ** alpha) for i in range(set_size)] + phase1 = rng.choices(range(set_size), weights=weights, k=n_per_phase) + phase2 = rng.choices(range(set_size, 2 * set_size), weights=weights, k=n_per_phase) + phase3 = rng.choices(range(set_size), weights=weights, k=n_per_phase) + return phase1, phase2, phase3 + + +# ═══════════════════════════════════════════════════════════════════════════ +# Cache factories +# ═══════════════════════════════════════════════════════════════════════════ + + +def make_sieve_fn(max_size: int): + """Create a warp_cache (SIEVE) cached identity function.""" + + @cache(max_size=max_size) + def fn(x): + return x + + return fn + + +def make_lru_fn(max_size: int): + """Create a functools.lru_cache (LRU) cached identity function.""" + + @functools.lru_cache(maxsize=max_size) + def fn(x): + return x + + return fn + + +# ═══════════════════════════════════════════════════════════════════════════ +# Measurement helpers +# ═══════════════════════════════════════════════════════════════════════════ + + +@dataclass +class HitRatioResult: + name: str + hit_ratio: float + hits: int + misses: int + ops_per_sec: float + + +def _get_info(fn): + """Get (hits, misses) from either warp_cache or functools cache_info.""" + info = fn.cache_info() + return info.hits, info.misses + + +def measure_hit_ratio(fn, keys: list[int], name: str) -> HitRatioResult: + """Run keys through fn, return hit ratio stats.""" + t0 = time.perf_counter() + for k in keys: + fn(k) + elapsed = time.perf_counter() - t0 + + hits, misses = _get_info(fn) + total = hits + misses + return HitRatioResult( + name=name, + hit_ratio=hits / total if total else 0.0, + hits=hits, + misses=misses, + ops_per_sec=len(keys) / elapsed, + ) + + +def measure_phase_hit_ratio(fn, keys: list[int]) -> tuple[int, int]: + """Run keys through fn and return (hits_delta, misses_delta) for this phase.""" + h0, m0 = _get_info(fn) + for k in keys: + fn(k) + h1, m1 = 
def fmt_pct(v: float) -> str:
    """Format a 0..1 fraction as a percentage, 6 wide with 2 decimals."""
    return format(v * 100, "6.2f") + "%"


def fmt_ops(ops: float) -> str:
    """Format an ops/s figure with an M or K suffix, or as a plain integer."""
    # (lower bound and divisor, format spec, suffix), largest unit first.
    for unit, spec, suffix in ((1_000_000, ".2f", "M"), (1_000, ".0f", "K")):
        if ops >= unit:
            return format(ops / unit, spec) + suffix
    return format(ops, ".0f")


def fmt_delta(sieve_ratio: float, lru_ratio: float) -> str:
    """Format the miss ratio reduction of SIEVE vs LRU."""
    lru_miss = 1 - lru_ratio
    if lru_miss == 0:
        # LRU never missed, so a relative reduction is undefined.
        return " n/a"
    sieve_miss = 1 - sieve_ratio
    reduction = (lru_miss - sieve_miss) / lru_miss * 100
    return format(reduction, "+.1f") + "%"
{fmt_pct(lr.hit_ratio)} {delta}") + + results.append( + { + "cache_ratio": r, + "cache_size": sz, + "sieve_hit_ratio": s.hit_ratio, + "lru_hit_ratio": lr.hit_ratio, + } + ) + + return {"num_keys": num_keys, "n_ops": n_ops, "results": results} + + +# ═══════════════════════════════════════════════════════════════════════════ +# Benchmark 2 — Zipf skewness sweep +# ═══════════════════════════════════════════════════════════════════════════ + + +def bench_zipf(n_ops: int, seed: int) -> dict: + num_keys = 10_000 + cache_size = num_keys // 10 # 10% of unique keys + alphas = [0.5, 0.7, 0.8, 1.0, 1.2, 1.5] + + results = [] + print("\n Alpha SIEVE LRU MissReduction") + print(" " + "─" * 42) + + for alpha in alphas: + keys = zipf_keys(n_ops, num_keys, alpha=alpha, seed=seed) + sieve_fn = make_sieve_fn(cache_size) + lru_fn = make_lru_fn(cache_size) + + s = measure_hit_ratio(sieve_fn, keys, "sieve") + lr = measure_hit_ratio(lru_fn, keys, "lru") + + delta = fmt_delta(s.hit_ratio, lr.hit_ratio) + print(f" {alpha:5.2f} {fmt_pct(s.hit_ratio)} {fmt_pct(lr.hit_ratio)} {delta}") + + results.append( + { + "alpha": alpha, + "sieve_hit_ratio": s.hit_ratio, + "lru_hit_ratio": lr.hit_ratio, + } + ) + + return {"num_keys": num_keys, "cache_size": cache_size, "n_ops": n_ops, "results": results} + + +# ═══════════════════════════════════════════════════════════════════════════ +# Benchmark 3 — Scan resistance +# ═══════════════════════════════════════════════════════════════════════════ + + +def bench_scan(n_ops: int, seed: int) -> dict: + hot_size = 100 + scan_size = 10_000 + cache_size = 200 # can hold entire hot set + hot_fractions = [1.0, 0.9, 0.8, 0.7, 0.5, 0.3] + + results = [] + print("\n HotFrac SIEVE LRU MissReduction") + print(" " + "─" * 44) + + for hf in hot_fractions: + keys = scan_resistant_keys(n_ops, hot_size, scan_size, hf, seed=seed) + sieve_fn = make_sieve_fn(cache_size) + lru_fn = make_lru_fn(cache_size) + + s = measure_hit_ratio(sieve_fn, keys, "sieve") + lr = 
measure_hit_ratio(lru_fn, keys, "lru") + + delta = fmt_delta(s.hit_ratio, lr.hit_ratio) + print(f" {hf * 100:5.1f}% {fmt_pct(s.hit_ratio)} {fmt_pct(lr.hit_ratio)} {delta}") + + results.append( + { + "hot_fraction": hf, + "sieve_hit_ratio": s.hit_ratio, + "lru_hit_ratio": lr.hit_ratio, + } + ) + + return { + "hot_size": hot_size, + "scan_size": scan_size, + "cache_size": cache_size, + "n_ops": n_ops, + "results": results, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Benchmark 4 — One-hit-wonder filtering +# ═══════════════════════════════════════════════════════════════════════════ + + +def bench_ohw(n_ops: int, seed: int) -> dict: + num_reused_keys = 5_000 + cache_size = 500 + ohw_ratios = [0.0, 0.25, 0.50, 0.75] + + results = [] + print("\n OHW% SIEVE LRU MissReduction") + print(" " + "─" * 42) + + for ohw in ohw_ratios: + keys = one_hit_wonder_keys(n_ops, num_reused_keys, ohw, seed=seed) + sieve_fn = make_sieve_fn(cache_size) + lru_fn = make_lru_fn(cache_size) + + s = measure_hit_ratio(sieve_fn, keys, "sieve") + lr = measure_hit_ratio(lru_fn, keys, "lru") + + delta = fmt_delta(s.hit_ratio, lr.hit_ratio) + print(f" {ohw * 100:5.1f}% {fmt_pct(s.hit_ratio)} {fmt_pct(lr.hit_ratio)} {delta}") + + results.append( + { + "ohw_ratio": ohw, + "sieve_hit_ratio": s.hit_ratio, + "lru_hit_ratio": lr.hit_ratio, + } + ) + + return { + "num_reused_keys": num_reused_keys, + "cache_size": cache_size, + "n_ops": n_ops, + "results": results, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Benchmark 5 — Working set shift +# ═══════════════════════════════════════════════════════════════════════════ + + +def bench_shift(n_ops: int, seed: int) -> dict: + set_size = 1_000 + cache_size = 200 + n_per_phase = n_ops // 2 # split across 3 phases (slightly uneven is fine) + window_size = max(1_000, n_per_phase // 50) + + p1, p2, p3 = working_set_shift_keys(n_per_phase, set_size, seed=seed) + + results = {} + 
print() + + for label, impl_factory in [("sieve", make_sieve_fn), ("lru", make_lru_fn)]: + fn = impl_factory(cache_size) + + phase_results = {} + all_windowed = [] + + for phase_name, phase_keys in [("phase1", p1), ("phase2", p2), ("phase3", p3)]: + dh, dm = measure_phase_hit_ratio(fn, phase_keys) + total = dh + dm + hr = dh / total if total else 0.0 + phase_results[phase_name] = hr + + w = measure_windowed_hit_ratio(fn, phase_keys, window_size) + all_windowed.extend(w) + + results[label] = { + "phases": phase_results, + "windowed": all_windowed, + } + + print( + f" {label.upper():>5} " + f"P1={fmt_pct(phase_results['phase1'])} " + f"P2={fmt_pct(phase_results['phase2'])} " + f"P3={fmt_pct(phase_results['phase3'])}" + ) + + return { + "set_size": set_size, + "cache_size": cache_size, + "n_per_phase": n_per_phase, + "window_size": window_size, + "results": results, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Benchmark 6 — Throughput under eviction pressure +# ═══════════════════════════════════════════════════════════════════════════ + + +def bench_throughput(n_ops: int, seed: int) -> dict: + cache_size = 64 + num_keys = 10_000 + # Use 2x ops for throughput to get stable numbers + actual_ops = n_ops * 2 + keys = zipf_keys(actual_ops, num_keys, seed=seed) + + results = {} + print() + + for label, factory in [("sieve", make_sieve_fn), ("lru", make_lru_fn)]: + fn = factory(cache_size) + + t0 = time.perf_counter() + for k in keys: + fn(k) + elapsed = time.perf_counter() - t0 + + ops_sec = actual_ops / elapsed + hits, misses = _get_info(fn) + total = hits + misses + hr = hits / total if total else 0.0 + + results[label] = { + "ops_per_sec": ops_sec, + "hit_ratio": hr, + "elapsed": elapsed, + } + + print(f" {label.upper():>5} {fmt_ops(ops_sec)} ops/s hit_ratio={fmt_pct(hr)}") + + return { + "cache_size": cache_size, + "num_keys": num_keys, + "n_ops": actual_ops, + "results": results, + } + + +# 
═══════════════════════════════════════════════════════════════════════════ +# Main +# ═══════════════════════════════════════════════════════════════════════════ + + +BENCH_DISPATCH = { + "hitratio": ("Hit ratio vs cache size", bench_hitratio), + "zipf": ("Zipf skewness sweep", bench_zipf), + "scan": ("Scan resistance", bench_scan), + "ohw": ("One-hit-wonder filtering", bench_ohw), + "shift": ("Working set shift", bench_shift), + "throughput": ("Throughput under eviction pressure", bench_throughput), +} + + +def main() -> None: + parser = argparse.ArgumentParser(description="SIEVE eviction quality benchmark") + parser.add_argument("--quick", action="store_true", help="Use 100K requests instead of 1M") + parser.add_argument( + "--bench", + type=str, + default=None, + help="Comma-separated benchmarks to run (default: all). " + f"Options: {','.join(ALL_BENCHMARKS)}", + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)") + args = parser.parse_args() + + n_ops = 100_000 if args.quick else 1_000_000 + seed = args.seed + selected = args.bench.split(",") if args.bench else ALL_BENCHMARKS + + for name in selected: + if name not in BENCH_DISPATCH: + parser.error(f"Unknown benchmark: {name!r}. 
Options: {','.join(ALL_BENCHMARKS)}") + + mode = "quick" if args.quick else "full" + print(f"SIEVE eviction quality benchmark ({mode}, {n_ops:,} ops, seed={seed})") + print("=" * 60) + + all_results = {"n_ops": n_ops, "seed": seed, "mode": mode} + + for i, name in enumerate(selected, 1): + title, bench_fn = BENCH_DISPATCH[name] + print(f"\n[{i}/{len(selected)}] {title}") + all_results[name] = bench_fn(n_ops, seed) + + json_path = RESULTS_DIR / "bench_sieve.json" + json_path.write_text(json.dumps(all_results, indent=2)) + print(f"\nResults saved to {json_path}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/results/bench_default.json b/benchmarks/results/bench_default.json index f5530a4..e004473 100644 --- a/benchmarks/results/bench_default.json +++ b/benchmarks/results/bench_default.json @@ -5,7 +5,7 @@ "build": "main", "compiler": "Clang 20.1.0 ", "arch": "arm64", - "gil_disabled": true + "gil_disabled": false }, "contestants": { "warp_cache": { @@ -41,125 +41,231 @@ }, "throughput": { "32": { - "warp_cache": 10474631.790567862, - "lru_cache": 14435655.419457715, - "cachetools": 452845.48288772785, - "cachebox": 873417.4363385128, - "moka_py": 2628644.823621646 + "warp_cache": 13022740.959457973, + "lru_cache": 19572344.41764575, + "cachetools": 593888.2930106695, + "cachebox": 1095940.4456885124, + "moka_py": 3050450.610295676 }, "64": { - "warp_cache": 11967030.387983482, - "lru_cache": 16399940.807027185, - "cachetools": 492432.23981610214, - "cachebox": 929445.4171746082, - "moka_py": 2885201.438573063 + "warp_cache": 14985669.285497736, + "lru_cache": 22276056.09667104, + "cachetools": 628128.3735489835, + "cachebox": 1176271.0786720647, + "moka_py": 3265874.946650633 }, "128": { - "warp_cache": 12469928.812786162, - "lru_cache": 18471058.017465174, - "cachetools": 556544.3246412581, - "cachebox": 1020043.4248854789, - "moka_py": 3025443.984643899 + "warp_cache": 16561887.844393728, + "lru_cache": 25709974.351785704, + "cachetools": 
710946.5912075893, + "cachebox": 1265973.0191948232, + "moka_py": 3422918.6402684096 }, "256": { - "warp_cache": 14036401.719287656, - "lru_cache": 21473245.044126667, - "cachetools": 629606.7854332039, - "cachebox": 1182458.809654294, - "moka_py": 3295150.7771876557 + "warp_cache": 18110744.996017344, + "lru_cache": 32137972.21619345, + "cachetools": 813515.4728099192, + "cachebox": 1502334.2599238446, + "moka_py": 3709881.2073603617 }, "512": { - "warp_cache": 16386839.55454546, - "lru_cache": 24747881.107162368, - "cachetools": 746648.2493651968, - "cachebox": 1426324.9672906033, - "moka_py": 3440140.1630303245 + "warp_cache": 18628769.81889033, + "lru_cache": 34452559.074556306, + "cachetools": 947676.4159568828, + "cachebox": 1831219.5692332701, + "moka_py": 4059423.1351816226 }, "1024": { - "warp_cache": 16671065.109359963, - "lru_cache": 28326943.0703748, - "cachetools": 921946.1147886541, - "cachebox": 1859407.1407276061, - "moka_py": 3846813.4260749044 + "warp_cache": 19849473.5071924, + "lru_cache": 39489916.00536644, + "cachetools": 1128210.8258731489, + "cachebox": 2371040.137049482, + "moka_py": 4350221.922513227 }, "zoocache_unbounded": { - "zoocache": 1055419.642047378 + "zoocache": 1186277.150256708 } }, "threading": { "1": { - "warp_cache": 13331555.870066097, - "lru_cache": 9752449.710837664, - "cachetools": 557278.2170487391, - "cachebox": 1078148.2265835293, - "moka_py": 3004454.073072167 + "warp_cache": 18291845.274640042, + "lru_cache": 11944815.411426656, + "cachetools": 778314.8470287559, + "cachebox": 1505535.6820654592, + "moka_py": 3586232.5404906743 }, "2": { - "warp_cache": 13575120.757860987, - "lru_cache": 9647349.832044424, - "cachetools": 553300.5974632365, - "cachebox": 1064933.897503077, - "moka_py": 3031761.4894439727 + "warp_cache": 17285460.183202725, + "lru_cache": 12169891.60907636, + "cachetools": 793305.5609040853, + "cachebox": 1522891.9140570306, + "moka_py": 3529655.7211490097 }, "4": { - "warp_cache": 
12884384.644378452, - "lru_cache": 9368412.57834681, - "cachetools": 552114.1726683924, - "cachebox": 1073878.3528667397, - "moka_py": 2973233.4376518168 + "warp_cache": 18018829.677750286, + "lru_cache": 12557883.977805972, + "cachetools": 802595.3237683141, + "cachebox": 1541876.3905118278, + "moka_py": 3598260.155499012 }, "8": { - "warp_cache": 12664840.780802963, - "lru_cache": 8988831.095590046, - "cachetools": 552554.8754723261, - "cachebox": 1051872.193418076, - "moka_py": 2964965.255744473 + "warp_cache": 17909781.617426973, + "lru_cache": 12265174.071667215, + "cachetools": 774121.3822242592, + "cachebox": 1515374.412916725, + "moka_py": 3612820.1322372477 }, "16": { - "warp_cache": 11605863.929693978, - "lru_cache": 8686116.349210039, - "cachetools": 541964.9092828757, - "cachebox": 1042606.0981901505, - "moka_py": 2928457.832788165 + "warp_cache": 17234197.406193413, + "lru_cache": 11633430.117256794, + "cachetools": 784637.5809287552, + "cachebox": 1494127.154048556, + "moka_py": 3641914.841487078 }, "32": { - "warp_cache": 11524056.455829944, - "lru_cache": 8284946.4906049725, - "cachetools": 540755.5040239964, - "cachebox": 1006342.0483310086, - "moka_py": 2892556.004654579 + "warp_cache": 16778053.264181446, + "lru_cache": 11590507.40200536, + "cachetools": 778824.0300816154, + "cachebox": 1429718.614562387, + "moka_py": 3624019.9636165453 + }, + "64": { + "warp_cache": 12785744.105428724, + "lru_cache": 9775546.117623253, + "moka_py": 3851620.1577660977, + "cachebox": 1492835.8200793546 + }, + "128": { + "warp_cache": 12093329.91651306, + "lru_cache": 9517739.801874982, + "moka_py": 3808867.023961398, + "cachebox": 1391972.7087392122 } }, "shared_throughput": { "memory": { - "ops_per_sec": 13759969.413229268, - "hit_rate": 0.6492 + "ops_per_sec": 17201093.096739274, + "hit_rate": 0.71241 }, "shared": { - "ops_per_sec": 6962778.2860887805, - "hit_rate": 0.649485 + "ops_per_sec": 9255867.110941615, + "hit_rate": 0.72338 } }, "multiprocess": { "1": { 
- "total_ops_per_sec": 3767355.746687779, - "per_process_avg_ops_per_sec": 7643355.878803674, - "wall_time": 0.13271908299066126 + "total_ops_per_sec": 4831050.1097373795, + "per_process_avg_ops_per_sec": 10182988.308657322, + "wall_time": 0.10349716700147837 }, "2": { - "total_ops_per_sec": 4872801.60626508, - "per_process_avg_ops_per_sec": 3863508.773518648, - "wall_time": 0.10261037497548386 + "total_ops_per_sec": 7204152.437990341, + "per_process_avg_ops_per_sec": 6109360.706454412, + "wall_time": 0.0694044170086272 }, "4": { - "total_ops_per_sec": 2826267.8992125136, - "per_process_avg_ops_per_sec": 820917.7951542896, - "wall_time": 0.17691174999345094 + "total_ops_per_sec": 7540331.388641312, + "per_process_avg_ops_per_sec": 2624801.0950859366, + "wall_time": 0.0663100829697214 }, "8": { - "total_ops_per_sec": 1756495.0408106316, - "per_process_avg_ops_per_sec": 247112.85596625824, - "wall_time": 0.2846577920136042 + "total_ops_per_sec": 6622352.097206755, + "per_process_avg_ops_per_sec": 1026249.0877601855, + "wall_time": 0.07550187496235594 + } + }, + "sustained": { + "warp_cache": { + "ops": 79017448, + "elapsed": 10.000001000007614, + "ops_per_sec": 7901744.009819582 + }, + "lru_cache": { + "ops": 102249072, + "elapsed": 10.00000437500421, + "ops_per_sec": 10224902.726600753 + }, + "cachetools": { + "ops": 7514944, + "elapsed": 9.999999959021807, + "ops_per_sec": 751494.4030794883 + }, + "cachebox": { + "ops": 12601527, + "elapsed": 9.999999958032276, + "ops_per_sec": 1260152.705288574 + }, + "moka_py": { + "ops": 27687264, + "elapsed": 9.999999875028152, + "ops_per_sec": 2768726.434601286 + } + }, + "ttl": { + "0.001": { + "warp_cache": { + "ops_per_sec": 6680155.723776542 + }, + "cachetools": { + "ops_per_sec": 584303.6658768142 + }, + "moka_py": { + "ops_per_sec": 2526872.4471823745 + } + }, + "0.01": { + "warp_cache": { + "ops_per_sec": 6922505.973063967 + }, + "cachetools": { + "ops_per_sec": 528319.6109877785 + }, + "moka_py": { + "ops_per_sec": 
2702144.032445885 + } + }, + "0.1": { + "warp_cache": { + "ops_per_sec": 6922264.728406483 + }, + "cachetools": { + "ops_per_sec": 528720.7053065702 + }, + "moka_py": { + "ops_per_sec": 2664072.0333089074 + } + }, + "1.0": { + "warp_cache": { + "ops_per_sec": 7001105.05880484 + }, + "cachetools": { + "ops_per_sec": 525996.7049602838 + }, + "moka_py": { + "ops_per_sec": 2633113.210805349 + } + }, + "None": { + "warp_cache": { + "ops_per_sec": 6912340.158059271 + }, + "cachetools": { + "ops_per_sec": 532211.4133053867 + }, + "moka_py": { + "ops_per_sec": 2661041.5332555277 + } + } + }, + "async_throughput": { + "256": { + "warp_cache": 5616939.033286044, + "moka_py": 3218720.074609825 + }, + "sync_256": { + "warp_cache": 16278691.254512586, + "moka_py": 3689509.3404215863 } } } \ No newline at end of file diff --git a/benchmarks/results/bench_perf_test.json b/benchmarks/results/bench_perf_test.json index f4db3e8..86151b0 100644 --- a/benchmarks/results/bench_perf_test.json +++ b/benchmarks/results/bench_perf_test.json @@ -1,94 +1,165 @@ { "python": { - "version": "3.12.9", + "version": "3.13.2", "implementation": "CPython", "build": "main", - "compiler": "Clang 16.0.0 (clang-1600.0.26.6)", + "compiler": "Clang 20.1.0 ", "arch": "arm64", "gil_disabled": false }, + "contestants": { + "warp_cache": { + "version": "0.1.0", + "available": true, + "thread_safe": true + }, + "lru_cache": { + "version": "3.13.2", + "available": true, + "thread_safe": false + }, + "cachetools": { + "version": "7.0.1", + "available": true, + "thread_safe": false + }, + "cachebox": { + "version": "5.2.2", + "available": true, + "thread_safe": true + }, + "moka_py": { + "version": "0.3.0", + "available": true, + "thread_safe": true + }, + "zoocache": { + "version": "", + "available": true, + "thread_safe": true + } + }, "throughput": { "32": { - "warp_cache": 12822156.712211879, - "lru_cache": 19066838.388641227 + "warp_cache": 14308112.647545936, + "lru_cache": 20731653.35349717, + 
"cachetools": 590043.0263509648, + "cachebox": 1063028.2749105138, + "moka_py": 2931423.0224506822 }, "64": { - "warp_cache": 13402280.021351445, - "lru_cache": 21207972.191766735 + "warp_cache": 14885289.459090889, + "lru_cache": 22307939.300232273, + "cachetools": 631269.1257652182, + "cachebox": 1139179.766485533, + "moka_py": 3256193.2469199277 }, "128": { - "warp_cache": 14090131.182621641, - "lru_cache": 24097593.224030606 + "warp_cache": 16611295.759392038, + "lru_cache": 25202140.06232663, + "cachetools": 696514.8444739614, + "cachebox": 1224619.7554149416, + "moka_py": 3390199.1776521862 }, "256": { - "warp_cache": 14622732.81993702, - "lru_cache": 27449587.456561457 + "warp_cache": 17321641.27490993, + "lru_cache": 28052458.405218985, + "cachetools": 801508.1691904534, + "cachebox": 1483106.483002092, + "moka_py": 3715003.4996983386 }, "512": { - "warp_cache": 17238653.85558022, - "lru_cache": 32661507.840447053 + "warp_cache": 18385732.768209964, + "lru_cache": 35370584.395680524, + "cachetools": 930603.3537709542, + "cachebox": 1770685.8420114717, + "moka_py": 4033308.3526122873 }, "1024": { - "warp_cache": 17417178.767218716, - "lru_cache": 36530787.93597052 + "warp_cache": 19667942.14357153, + "lru_cache": 40264392.31897666, + "cachetools": 1139012.6952039886, + "cachebox": 2352242.422646535, + "moka_py": 4385996.840230291 + }, + "zoocache_unbounded": { + "zoocache": 1170969.3324545408 } }, "threading": { "1": { - "warp_cache": 15348605.154510036, - "lru_cache": 9897682.694595613 + "warp_cache": 17748720.28376184, + "lru_cache": 11837589.205750767, + "cachetools": 768186.5788631366, + "cachebox": 1506347.3708151765, + "moka_py": 3558634.3914518896 }, "2": { - "warp_cache": 14461665.21879856, - "lru_cache": 10031893.421193613 + "warp_cache": 17407070.62151973, + "lru_cache": 11974674.043475201, + "cachetools": 771673.906565188, + "cachebox": 1497026.537937484, + "moka_py": 3527404.1741586165 }, "4": { - "warp_cache": 15218383.795245877, - "lru_cache": 
9442388.257813832 + "warp_cache": 17555024.086381514, + "lru_cache": 12015380.66040378, + "cachetools": 786849.9001890247, + "cachebox": 1378581.426968127, + "moka_py": 3511518.554108022 }, "8": { - "warp_cache": 15527448.796966495, - "lru_cache": 10113864.933091491 + "warp_cache": 17221583.30307671, + "lru_cache": 11728713.89071519, + "cachetools": 782671.1412021917, + "cachebox": 1510555.9467995588, + "moka_py": 3586725.442256531 }, "16": { - "warp_cache": 15591099.080845982, - "lru_cache": 9403578.054758623 + "warp_cache": 16575273.326457018, + "lru_cache": 11496730.66693745, + "cachetools": 780772.4359686638, + "cachebox": 1454955.4879910185, + "moka_py": 3559109.3465307606 }, "32": { - "warp_cache": 14465411.925485857, - "lru_cache": 9379030.353846923 + "warp_cache": 16080508.045336932, + "lru_cache": 11541013.840239659, + "cachetools": 765631.8108660043, + "cachebox": 1432799.0458399756, + "moka_py": 3619969.6467658766 } }, "shared_throughput": { "memory": { - "ops_per_sec": 14439042.425839456, - "hit_rate": 0.6492 + "ops_per_sec": 17226404.24790721, + "hit_rate": 0.71241 }, "shared": { - "ops_per_sec": 2756162.268104078, - "hit_rate": 0.6492 + "ops_per_sec": 9184809.588599356, + "hit_rate": 0.72338 } }, "multiprocess": { "1": { - "total_ops_per_sec": 2289040.588441179, - "per_process_avg_ops_per_sec": 3060469.522058054, - "wall_time": 0.21843212502426468 + "total_ops_per_sec": 4847480.0988956, + "per_process_avg_ops_per_sec": 9873877.600975463, + "wall_time": 0.10314637498231605 }, "2": { - "total_ops_per_sec": 167250.60123514832, - "per_process_avg_ops_per_sec": 84539.2184667449, - "wall_time": 2.98952587498934 + "total_ops_per_sec": 7493396.408961757, + "per_process_avg_ops_per_sec": 6437570.497444544, + "wall_time": 0.06672541698208079 }, "4": { - "total_ops_per_sec": 122304.62893978224, - "per_process_avg_ops_per_sec": 30728.17824523134, - "wall_time": 4.088152707990957 + "total_ops_per_sec": 7449816.1705269, + "per_process_avg_ops_per_sec": 
2558620.88379943, + "wall_time": 0.06711575004737824 }, "8": { - "total_ops_per_sec": 116384.68773514514, - "per_process_avg_ops_per_sec": 14602.230670399847, - "wall_time": 4.29609779198654 + "total_ops_per_sec": 6494188.215989986, + "per_process_avg_ops_per_sec": 999356.3007887901, + "wall_time": 0.07699191698338836 } } } \ No newline at end of file diff --git a/benchmarks/results/bench_py3.13.json b/benchmarks/results/bench_py3.13.json index 38fa16e..f68c2da 100644 --- a/benchmarks/results/bench_py3.13.json +++ b/benchmarks/results/bench_py3.13.json @@ -128,6 +128,18 @@ "cachetools": 796023.8607494867, "cachebox": 1470335.963362468, "moka_py": 3798724.576692935 + }, + "64": { + "warp_cache": 12785744.105428724, + "lru_cache": 9775546.117623253, + "moka_py": 3851620.1577660977, + "cachebox": 1492835.8200793546 + }, + "128": { + "warp_cache": 12093329.91651306, + "lru_cache": 9517739.801874982, + "moka_py": 3808867.023961398, + "cachebox": 1391972.7087392122 } }, "shared_throughput": { @@ -245,5 +257,15 @@ "ops_per_sec": 2721474.5888478886 } } + }, + "async_throughput": { + "256": { + "warp_cache": 5616939.033286044, + "moka_py": 3218720.074609825 + }, + "sync_256": { + "warp_cache": 16278691.254512586, + "moka_py": 3689509.3404215863 + } } } \ No newline at end of file diff --git a/benchmarks/results/bench_sieve.json b/benchmarks/results/bench_sieve.json new file mode 100644 index 0000000..e987a96 --- /dev/null +++ b/benchmarks/results/bench_sieve.json @@ -0,0 +1,498 @@ +{ + "n_ops": 1000000, + "seed": 42, + "mode": "full", + "hitratio": { + "num_keys": 10000, + "n_ops": 1000000, + "results": [ + { + "cache_ratio": 0.001, + "cache_size": 10, + "sieve_hit_ratio": 0.261837, + "lru_hit_ratio": 0.130764 + }, + { + "cache_ratio": 0.005, + "cache_size": 50, + "sieve_hit_ratio": 0.409279, + "lru_hit_ratio": 0.308636 + }, + { + "cache_ratio": 0.01, + "cache_size": 100, + "sieve_hit_ratio": 0.49615, + "lru_hit_ratio": 0.390127 + }, + { + "cache_ratio": 0.05, + 
"cache_size": 500, + "sieve_hit_ratio": 0.671978, + "lru_hit_ratio": 0.586187 + }, + { + "cache_ratio": 0.1, + "cache_size": 1000, + "sieve_hit_ratio": 0.7452, + "lru_hit_ratio": 0.675096 + }, + { + "cache_ratio": 0.25, + "cache_size": 2500, + "sieve_hit_ratio": 0.840057, + "lru_hit_ratio": 0.798591 + }, + { + "cache_ratio": 0.5, + "cache_size": 5000, + "sieve_hit_ratio": 0.912964, + "lru_hit_ratio": 0.896617 + } + ] + }, + "zipf": { + "num_keys": 10000, + "cache_size": 1000, + "n_ops": 1000000, + "results": [ + { + "alpha": 0.5, + "sieve_hit_ratio": 0.266559, + "lru_hit_ratio": 0.180414 + }, + { + "alpha": 0.7, + "sieve_hit_ratio": 0.434511, + "lru_hit_ratio": 0.328612 + }, + { + "alpha": 0.8, + "sieve_hit_ratio": 0.536932, + "lru_hit_ratio": 0.435617 + }, + { + "alpha": 1.0, + "sieve_hit_ratio": 0.7452, + "lru_hit_ratio": 0.675096 + }, + { + "alpha": 1.2, + "sieve_hit_ratio": 0.89339, + "lru_hit_ratio": 0.86152 + }, + { + "alpha": 1.5, + "sieve_hit_ratio": 0.980378, + "lru_hit_ratio": 0.974598 + } + ] + }, + "scan": { + "hot_size": 100, + "scan_size": 10000, + "cache_size": 200, + "n_ops": 1000000, + "results": [ + { + "hot_fraction": 1.0, + "sieve_hit_ratio": 0.9999, + "lru_hit_ratio": 0.9999 + }, + { + "hot_fraction": 0.9, + "sieve_hit_ratio": 0.899884, + "lru_hit_ratio": 0.888736 + }, + { + "hot_fraction": 0.8, + "sieve_hit_ratio": 0.799987, + "lru_hit_ratio": 0.76028 + }, + { + "hot_fraction": 0.7, + "sieve_hit_ratio": 0.699375, + "lru_hit_ratio": 0.635343 + }, + { + "hot_fraction": 0.5, + "sieve_hit_ratio": 0.499898, + "lru_hit_ratio": 0.409364 + }, + { + "hot_fraction": 0.3, + "sieve_hit_ratio": 0.29949, + "lru_hit_ratio": 0.210645 + } + ] + }, + "ohw": { + "num_reused_keys": 5000, + "cache_size": 500, + "n_ops": 1000000, + "results": [ + { + "ohw_ratio": 0.0, + "sieve_hit_ratio": 0.724034, + "lru_hit_ratio": 0.649796 + }, + { + "ohw_ratio": 0.25, + "sieve_hit_ratio": 0.538846, + "lru_hit_ratio": 0.437105 + }, + { + "ohw_ratio": 0.5, + "sieve_hit_ratio": 
0.355657, + "lru_hit_ratio": 0.257666 + }, + { + "ohw_ratio": 0.75, + "sieve_hit_ratio": 0.17212, + "lru_hit_ratio": 0.10564 + } + ] + }, + "shift": { + "set_size": 1000, + "cache_size": 200, + "n_per_phase": 500000, + "window_size": 10000, + "results": { + "sieve": { + "phases": { + "phase1": 0.75465, + "phase2": 0.755886, + "phase3": 0.755082 + }, + "windowed": [ + 0.7579, + 0.7454, + 0.7634, + 0.7533, + 0.7582, + 0.7544, + 0.7473, + 0.7506, + 0.7548, + 0.7545, + 0.7541, + 0.7527, + 0.7559, + 0.7555, + 0.7582, + 0.7547, + 0.7448, + 0.7537, + 0.7525, + 0.7577, + 0.7641, + 0.7595, + 0.7586, + 0.755, + 0.7532, + 0.7523, + 0.7549, + 0.7621, + 0.7562, + 0.7564, + 0.7568, + 0.7539, + 0.7519, + 0.7555, + 0.7491, + 0.7565, + 0.7567, + 0.7503, + 0.7612, + 0.7635, + 0.7666, + 0.7579, + 0.7568, + 0.7445, + 0.7494, + 0.7485, + 0.755, + 0.7558, + 0.7586, + 0.7555, + 0.7499, + 0.7668, + 0.7554, + 0.75, + 0.7515, + 0.755, + 0.7559, + 0.7568, + 0.7655, + 0.7436, + 0.7564, + 0.7578, + 0.7554, + 0.7518, + 0.7532, + 0.7583, + 0.7541, + 0.7654, + 0.7643, + 0.7586, + 0.7643, + 0.7631, + 0.761, + 0.754, + 0.7513, + 0.7522, + 0.7556, + 0.75, + 0.7599, + 0.7587, + 0.7559, + 0.757, + 0.7629, + 0.7554, + 0.7629, + 0.7545, + 0.76, + 0.7575, + 0.7555, + 0.7577, + 0.7549, + 0.7598, + 0.7411, + 0.757, + 0.762, + 0.76, + 0.7526, + 0.7494, + 0.7607, + 0.759, + 0.7557, + 0.7627, + 0.7542, + 0.7555, + 0.756, + 0.7588, + 0.7557, + 0.7604, + 0.749, + 0.7586, + 0.7622, + 0.7517, + 0.7474, + 0.7557, + 0.7544, + 0.7552, + 0.7485, + 0.7486, + 0.752, + 0.7589, + 0.7555, + 0.7619, + 0.7496, + 0.7517, + 0.7555, + 0.7599, + 0.7509, + 0.7592, + 0.7606, + 0.7656, + 0.7571, + 0.7604, + 0.7558, + 0.7591, + 0.7587, + 0.7637, + 0.7555, + 0.7601, + 0.749, + 0.7559, + 0.7534, + 0.758, + 0.7554, + 0.7632, + 0.7501, + 0.7585, + 0.7519, + 0.7474, + 0.7533, + 0.751 + ] + }, + "lru": { + "phases": { + "phase1": 0.696454, + "phase2": 0.698666, + "phase3": 0.696174 + }, + "windowed": [ + 0.6972, + 0.6879, + 0.705, + 
0.6919, + 0.6982, + 0.6986, + 0.695, + 0.6975, + 0.6985, + 0.6994, + 0.6992, + 0.6983, + 0.7022, + 0.691, + 0.6963, + 0.6988, + 0.6887, + 0.6965, + 0.6961, + 0.6942, + 0.7045, + 0.6981, + 0.6907, + 0.6955, + 0.6928, + 0.6937, + 0.7004, + 0.704, + 0.6997, + 0.6994, + 0.6981, + 0.6935, + 0.6978, + 0.6996, + 0.6921, + 0.7002, + 0.6994, + 0.685, + 0.7018, + 0.6992, + 0.7056, + 0.6964, + 0.7, + 0.6867, + 0.6891, + 0.6882, + 0.6977, + 0.6928, + 0.6971, + 0.6993, + 0.6912, + 0.71, + 0.6947, + 0.6962, + 0.697, + 0.6954, + 0.6968, + 0.6945, + 0.705, + 0.6876, + 0.7028, + 0.6979, + 0.7062, + 0.6945, + 0.6942, + 0.7041, + 0.7007, + 0.6995, + 0.7057, + 0.6982, + 0.7105, + 0.7029, + 0.7037, + 0.6951, + 0.6978, + 0.6901, + 0.6974, + 0.6953, + 0.7021, + 0.6975, + 0.6999, + 0.7042, + 0.704, + 0.695, + 0.7046, + 0.694, + 0.7049, + 0.7027, + 0.6964, + 0.6995, + 0.702, + 0.6981, + 0.6786, + 0.6938, + 0.7091, + 0.7027, + 0.696, + 0.6898, + 0.7005, + 0.6991, + 0.696, + 0.6928, + 0.6981, + 0.6886, + 0.6976, + 0.6997, + 0.693, + 0.7066, + 0.6896, + 0.696, + 0.704, + 0.6946, + 0.6831, + 0.6967, + 0.6941, + 0.6939, + 0.6906, + 0.6876, + 0.6933, + 0.7021, + 0.6922, + 0.7006, + 0.6971, + 0.6985, + 0.698, + 0.7003, + 0.692, + 0.6973, + 0.7064, + 0.6999, + 0.6922, + 0.7032, + 0.6964, + 0.7024, + 0.6993, + 0.702, + 0.699, + 0.6975, + 0.6908, + 0.7, + 0.6892, + 0.7026, + 0.6972, + 0.6997, + 0.6933, + 0.7028, + 0.6944, + 0.6884, + 0.6923, + 0.6916 + ] + } + } + }, + "throughput": { + "cache_size": 64, + "num_keys": 10000, + "n_ops": 2000000, + "results": { + "sieve": { + "ops_per_sec": 14610653.444947425, + "hit_ratio": 0.4424175, + "elapsed": 0.13688641699263826 + }, + "lru": { + "ops_per_sec": 19857949.53114094, + "hit_ratio": 0.3377435, + "elapsed": 0.10071533301379532 + } + } + } +} \ No newline at end of file diff --git a/benchmarks/results/comparison_async_dark.svg b/benchmarks/results/comparison_async_dark.svg new file mode 100644 index 0000000..4678ede --- /dev/null +++ 
b/benchmarks/results/comparison_async_dark.svg @@ -0,0 +1,32902 @@ + + + + + + + + 2026-03-05T20:16:41.364028 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_async_light.svg b/benchmarks/results/comparison_async_light.svg new file mode 100644 index 0000000..5d242fb --- /dev/null +++ b/benchmarks/results/comparison_async_light.svg @@ -0,0 +1,32902 @@ + + + + + + + + 2026-03-05T20:16:41.080526 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_backends.png b/benchmarks/results/comparison_backends.png deleted file mode 100644 index ac95efc..0000000 Binary files a/benchmarks/results/comparison_backends.png and /dev/null differ diff --git a/benchmarks/results/comparison_backends_dark.svg b/benchmarks/results/comparison_backends_dark.svg new file mode 100644 index 0000000..c1c5ddb --- /dev/null +++ 
b/benchmarks/results/comparison_backends_dark.svg @@ -0,0 +1,26058 @@ + + + + + + + + 2026-03-05T20:16:41.320134 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_backends_light.svg b/benchmarks/results/comparison_backends_light.svg new file mode 100644 index 0000000..3b70ffa --- /dev/null +++ b/benchmarks/results/comparison_backends_light.svg @@ -0,0 +1,26058 @@ + + + + + + + + 2026-03-05T20:16:41.037252 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_mt_scaling.png b/benchmarks/results/comparison_mt_scaling.png deleted file mode 100644 index 3e70cac..0000000 Binary files a/benchmarks/results/comparison_mt_scaling.png and /dev/null differ diff --git a/benchmarks/results/comparison_mt_scaling_dark.svg b/benchmarks/results/comparison_mt_scaling_dark.svg new file mode 100644 index 0000000..fcae707 --- /dev/null +++ b/benchmarks/results/comparison_mt_scaling_dark.svg @@ -0,0 +1,41544 @@ + + + + + + + + 2026-03-05T20:16:41.198773 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_mt_scaling_light.svg b/benchmarks/results/comparison_mt_scaling_light.svg new file mode 100644 index 0000000..831ab37 --- /dev/null +++ b/benchmarks/results/comparison_mt_scaling_light.svg @@ -0,0 +1,41544 @@ + + + + + + + + 2026-03-05T20:16:40.919868 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_scaling_ratio.png b/benchmarks/results/comparison_scaling_ratio.png deleted file mode 100644 index 2a07dbf..0000000 Binary files a/benchmarks/results/comparison_scaling_ratio.png and /dev/null differ diff --git a/benchmarks/results/comparison_scaling_ratio_dark.svg b/benchmarks/results/comparison_scaling_ratio_dark.svg new file mode 100644 index 0000000..8f4e181 --- /dev/null +++ b/benchmarks/results/comparison_scaling_ratio_dark.svg @@ -0,0 +1,38468 @@ + + + + + + + + 2026-03-05T20:16:41.269132 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_scaling_ratio_light.svg b/benchmarks/results/comparison_scaling_ratio_light.svg new file mode 100644 index 0000000..8e055bc --- /dev/null +++ b/benchmarks/results/comparison_scaling_ratio_light.svg @@ -0,0 +1,38468 @@ + + + + + + + + 2026-03-05T20:16:40.988675 + image/svg+xml + + + Matplotlib 
v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/benchmarks/results/comparison_st_throughput.png b/benchmarks/results/comparison_st_throughput.png deleted file mode 100644 index 543b199..0000000 Binary files a/benchmarks/results/comparison_st_throughput.png and /dev/null differ diff --git a/benchmarks/results/comparison_st_throughput_dark.svg b/benchmarks/results/comparison_st_throughput_dark.svg new file mode 100644 index 0000000..338dc26 --- /dev/null +++ b/benchmarks/results/comparison_st_throughput_dark.svg @@ -0,0 +1,32488 @@ + + + + + + + + 2026-03-05T20:16:41.132199 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + diff --git a/benchmarks/results/comparison_st_throughput_light.svg b/benchmarks/results/comparison_st_throughput_light.svg new file mode 100644 index 0000000..8722fd5 --- /dev/null +++ b/benchmarks/results/comparison_st_throughput_light.svg @@ -0,0 +1,32488 @@ + + + + + + + + 2026-03-05T20:16:40.851529 + image/svg+xml + + + Matplotlib v3.10.8, https://matplotlib.org/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/alternatives.md b/docs/alternatives.md index a3089ab..d8be972 100644 --- a/docs/alternatives.md +++ b/docs/alternatives.md @@ -9,20 +9,22 @@ The Python caching ecosystem includes several notable libraries. 
Here's how they | Async support | Yes | Yes | Yes | No | No | | Cross-process (shared mem) | Yes (mmap) | No | No | No | No | | TTL support | Yes | Yes | Yes (+ TTI) | Yes | No | -| LRU | Yes | Yes | Yes | Yes | Yes | -| LFU | Yes | Yes | TinyLFU | Yes | No | -| FIFO | Yes | Yes | No | Yes | No | -| MRU | Yes | No | No | No | No | -| Custom key function | No | No | No | Yes | No | -| Stampede prevention | No | Yes | Yes | No | No | +| Eviction | SIEVE | LRU/LFU/FIFO/RR | TinyLFU/LRU | LRU/LFU/FIFO/RR | LRU | +| Stampede prevention | No | Yes | Yes (`get_with`) | No | No | | Per-entry TTL | No | Yes (VTTLCache) | Yes | No | No | +| Manual cache object | No | Yes (dict-like) | Yes (`Moka(...)`) | Yes | No | +| Cache statistics | Yes | Yes (+ memory) | No | Yes | Yes | -**Performance ballpark** (not directly comparable — different benchmarking setups): +**Performance** (same machine, same workload, single-threaded, cache=256): -- [cachebox](https://github.com/awolverp/cachebox): ~3.7M ops/s LRU insert (from cachebox-benchmark, Python 3.13) -- [moka-py](https://github.com/deliro/moka-py): ~8.9M ops/s get (from moka-py README) -- warp_cache: 14-20M ops/s get (our benchmarks, see [performance](performance.md)) +| Library | ops/s | vs warp_cache | +|---|---:|---:| +| lru_cache | 32.1M | 1.8x faster | +| warp_cache | 18.1M | 1.0x | +| moka_py | 3.7M | 4.9x slower | +| cachebox | 1.5M | 12.1x slower | +| cachetools | 814K | 22.2x slower | -These numbers come from different machines, different workloads, and different measurement methodologies — treat them as order-of-magnitude indicators, not head-to-head results. +See [full benchmarks](../benchmarks/COMPARISON.md) for multi-thread, TTL, shared memory, and sustained throughput results. -**warp_cache's niche**: the only Rust-backed cache combining shared memory (cross-process mmap), all four eviction strategies (LRU/MRU/FIFO/LFU), and builtin thread safety in a single decorator. 
If you need stampede prevention or per-entry TTL, look at cachebox or moka-py. If you need a custom key function, cachetools is the way to go. +**warp_cache's niche**: the only Rust-backed cache combining shared memory (cross-process mmap), SIEVE eviction (scan-resistant, near-optimal hit rates), and builtin thread safety in a single decorator. If you need stampede prevention or per-entry TTL, look at cachebox or moka-py. If you need a manual cache object API, look at moka-py or cachebox. If you need maximum single-threaded speed, use `lru_cache`. diff --git a/docs/performance.md b/docs/performance.md index 2c2d235..d44e1d5 100644 --- a/docs/performance.md +++ b/docs/performance.md @@ -1,6 +1,6 @@ # Performance -*Environment: Python 3.13.2, CPython, Apple M-series (arm64), Clang 20.0.0* +*Environment: Python 3.13.2, CPython, Apple M-series (arm64), Clang 20.1.0* ## Architecture @@ -10,100 +10,197 @@ The entire cache lookup happens in a single Rust `__call__`: Python: fn(42) └─ tp_call (PyO3) ─────────────────────────────── one FFI crossing ├─ hash(args) via ffi::PyObject_Hash - ├─ HashMap lookup Rust hashbrown, precomputed hash + ├─ shard select hash % n_shards + ├─ RwLock::read() per-shard read lock (~8ns) + ├─ HashMap lookup hashbrown ├─ equality check via ffi::PyObject_RichCompareBool - ├─ RwLock (read) parking_lot, ~8ns uncontended + ├─ SIEVE visited=1 AtomicBool store, lock-free └─ return cached value ``` No Python wrapper function. No serialization. No intermediate key object. 
-## Single-threaded throughput vs cache size +## SIEVE eviction -| Cache Size | warp_cache (ops/s) | cachetools (ops/s) | lru_cache (ops/s) | wc/ct | wc/lru | -|---:|---:|---:|---:|---:|---:| -| 32 | 12,350,000 | 610,000 | 22,230,000 | 20.2x | 0.56x | -| 64 | 13,480,000 | 654,000 | 23,290,000 | 20.6x | 0.58x | -| 128 | 14,210,000 | 717,000 | 26,760,000 | 19.8x | 0.53x | -| 256 | 16,350,000 | 833,000 | 29,770,000 | 19.6x | 0.55x | -| 512 | 17,660,000 | 960,000 | 35,710,000 | 18.4x | 0.49x | -| 1024 | 17,710,000 | 1,184,000 | 39,780,000 | 15.0x | 0.45x | +Both the in-process and shared memory backends use **SIEVE** — a scan-resistant eviction algorithm that achieves near-optimal hit rates with O(1) overhead per operation. -## Strategy comparison (cache size = 256) +How it works: +- **On hit**: set `visited = 1` — a single idempotent store. No linked-list reordering, no lock promotion. +- **On evict**: a "hand" scans the entry list. Visited entries get a second chance (bit cleared to 0); unvisited entries are evicted. -| Strategy | warp_cache (ops/s) | cachetools (ops/s) | Ratio | -|---|---:|---:|---:| -| LRU | 16,350,000 | 833,000 | 19.6x | -| LFU | 6,270,000 | 770,000 | 8.1x | -| FIFO | 15,710,000 | 921,000 | 17.1x | +This is simpler than LRU (no list reordering on every hit) and achieves higher hit rates than LRU/FIFO on skewed workloads. The visited-bit design enables **lock-free reads** on both backends — cache hits never need a write lock. 
+ +## Single-threaded throughput vs cache size + +| Cache Size | warp_cache | lru_cache | cachetools | cachebox | moka_py | wc/ct | wc/lru | +|---:|---:|---:|---:|---:|---:|---:|---:| +| 32 | 13.0M | 19.6M | 594K | 1.1M | 3.1M | 21.9x | 0.66x | +| 64 | 15.0M | 22.3M | 628K | 1.2M | 3.3M | 23.9x | 0.67x | +| 128 | 16.6M | 25.7M | 711K | 1.3M | 3.4M | 23.3x | 0.65x | +| 256 | 18.1M | 32.1M | 814K | 1.5M | 3.7M | 22.2x | 0.56x | +| 512 | 18.6M | 34.5M | 948K | 1.8M | 4.1M | 19.6x | 0.54x | +| 1024 | 19.9M | 39.5M | 1.1M | 2.4M | 4.4M | 17.6x | 0.50x | -## TTL throughput (cache size = 256, ttl = 60s) +## TTL throughput (cache size = 256) -| Implementation | ops/s | -|---|---:| -| warp_cache | 14,190,000 | -| cachetools | 580,000 | -| **Ratio** | **24.5x** | +| TTL | warp_cache | cachetools | moka_py | wc/ct | +|---|---:|---:|---:|---:| +| 1ms | 6.7M | 584K | 2.5M | 11.5x | +| 10ms | 6.9M | 528K | 2.7M | 13.1x | +| 100ms | 6.9M | 529K | 2.7M | 13.0x | +| 1s | 7.0M | 526K | 2.6M | 13.3x | +| None | 6.9M | 532K | 2.7M | 13.0x | + +TTL adds minimal overhead — the expiry timestamp is checked during the normal read path, with no background eviction thread. 
## Multi-threaded throughput (cache size = 256) -| Threads | warp_cache (ops/s) | cachetools + Lock (ops/s) | lru_cache + Lock (ops/s) | wc/ct | wc/lru | -|---:|---:|---:|---:|---:|---:| -| 1 | 15,920,000 | 809,000 | 12,930,000 | 19.7x | 1.23x | -| 2 | 15,630,000 | 810,000 | 12,670,000 | 19.3x | 1.23x | -| 4 | 15,650,000 | 821,000 | 12,650,000 | 19.1x | 1.24x | -| 8 | 16,410,000 | 810,000 | 12,620,000 | 20.3x | 1.30x | -| 16 | 16,120,000 | 801,000 | 12,140,000 | 20.1x | 1.33x | +| Threads | warp_cache | lru_cache + Lock | cachetools + Lock | cachebox | moka_py | wc/lru | +|---:|---:|---:|---:|---:|---:|---:| +| 1 | 18.3M | 11.9M | 778K | 1.5M | 3.6M | 1.54x | +| 2 | 17.3M | 12.2M | 793K | 1.5M | 3.5M | 1.42x | +| 4 | 18.0M | 12.6M | 803K | 1.5M | 3.6M | 1.43x | +| 8 | 17.9M | 12.3M | 774K | 1.5M | 3.6M | 1.46x | +| 16 | 17.2M | 11.6M | 785K | 1.5M | 3.6M | 1.48x | +| 32 | 16.8M | 11.6M | 779K | 1.4M | 3.6M | 1.45x | + +`warp_cache` maintains ~17-18M ops/s regardless of thread count — stable scaling with no contention, and **1.4-1.5x faster** than `lru_cache + Lock` even under the GIL. The sharded `RwLock` architecture means cache hits only acquire a cheap per-shard read lock (~8ns), while `lru_cache + Lock` must acquire a global `threading.Lock()` on every access. Under free-threaded Python (no GIL), `warp_cache`'s per-shard locking enables true parallel reads across shards while `lru_cache` must acquire a real lock. + +## Shared memory backend + +The shared memory backend uses SIEVE with **fully lock-free reads** — no Mutex, no RwLock. `ShmCache` uses interior mutability: reads go through the seqlock's optimistic path (lock-free), and the `visited` bit is set via a direct idempotent store. Writes acquire the seqlock's TTAS spinlock internally. Size limit checks use cached struct fields, avoiding shared memory header reads on the hot path. 
+ +### Memory vs shared throughput + +| Backend | Throughput | Hit Rate | Notes | +|---|---:|---:|---| +| Memory (in-process) | 17.2M ops/s | 71.2% | Sharded hashbrown HashMap + RwLock + SIEVE | +| Shared (mmap, single process) | 9.2M ops/s | 72.3% | Seqlock + lock-free reads, no Mutex | + +The shared backend reaches **54% of in-process speed**. The gap is dominated by serialization (serde fast-path for primitives, pickle fallback), ahash of key bytes, seqlock overhead, and mmap copy — all irreducible cross-process costs. + +### Multi-process scaling + +| Processes | Total Throughput | Per-Process Avg | +|---:|---:|---:| +| 1 | 4.8M ops/s | 4.8M ops/s | +| 2 | 7.2M ops/s | 3.6M ops/s | +| 4 | 7.5M ops/s | 1.9M ops/s | +| 8 | 6.6M ops/s | 0.8M ops/s | + +Lock-free reads enable excellent multi-process scaling. Total throughput peaks at 4 processes (7.5M ops/s) — 1.6x the single-process rate. Even with 8 processes contending on the same mmap'd file, throughput stays at 6.6M ops/s. + +For comparison, the previous LRU-based shared backend achieved only 3.1M ops/s at 4 processes and 1.9M ops/s at 8 processes — SIEVE delivers **2.5x and 3.4x improvements** respectively, thanks to eliminating write locks on the read path. + +## SIEVE eviction quality + +Beyond throughput, SIEVE delivers **measurably better hit rates** than LRU across all workload patterns. These benchmarks compare `warp_cache` (SIEVE) against `functools.lru_cache` (LRU) using 1M requests with Zipf-distributed keys. -`warp_cache` maintains ~16M ops/s regardless of thread count. `cachetools` -requires a manual `threading.Lock()` and tops out at ~770K ops/s. -`lru_cache + Lock` degrades as contention increases. +Run them yourself: `make bench-sieve` or `python benchmarks/bench_sieve.py --quick`. 
+ +### Hit ratio vs cache size + +With Zipf alpha=1.0 over 10K unique keys, SIEVE consistently outperforms LRU at every cache size — the advantage peaks at **21.6% miss reduction** at 10% cache ratio: + +| Cache % | SIEVE | LRU | Miss Reduction | +|---:|---:|---:|---:| +| 0.1% | 26.2% | 13.1% | +15.1% | +| 1% | 49.6% | 39.0% | +17.4% | +| 5% | 67.2% | 58.6% | +20.7% | +| 10% | 74.5% | 67.5% | **+21.6%** | +| 25% | 84.0% | 79.9% | +20.6% | +| 50% | 91.3% | 89.7% | +15.8% | + +### Scan resistance + +This is SIEVE's key advantage. Hot working set (100 keys, Zipf) mixed with sequential scans (10K unique keys, each accessed once). Cache size = 200 (fits the entire hot set). SIEVE protects hot items via the visited bit; LRU pushes them out during scans: + +| Hot Fraction | SIEVE | LRU | Miss Reduction | +|---:|---:|---:|---:| +| 100% | 99.99% | 99.99% | +0.0% | +| 90% | 90.0% | 88.9% | +10.0% | +| 80% | 80.0% | 76.0% | +16.6% | +| 70% | 69.9% | 63.5% | **+17.6%** | +| 50% | 50.0% | 40.9% | +15.3% | +| 30% | 30.0% | 21.1% | +11.3% | + +At 70% hot fraction, SIEVE retains almost all hot items (69.9% hit rate ≈ hot fraction) while LRU drops to 63.5% as scans push hot entries out of the cache. + +### One-hit-wonder filtering + +Mix of Zipf-distributed reused keys with unique one-time keys (one-hit wonders). SIEVE inserts new entries with `visited=0`, so an entry that is never re-accessed is evicted the first time the hand reaches it; LRU gives every entry a full tenure through the cache: + +| OHW Ratio | SIEVE | LRU | Miss Reduction | +|---:|---:|---:|---:| +| 0% | 72.4% | 65.0% | +21.2% | +| 25% | 53.9% | 43.7% | +18.1% | +| 50% | 35.6% | 25.8% | +13.2% | +| 75% | 17.2% | 10.6% | +7.4% | + +### Working set shift + +Three phases: Zipf over keys 0–999, then keys 1000–1999 (completely new set), then back to 0–999. Cache size = 200. 
Both algorithms adapt, but SIEVE maintains a consistent advantage: + +| Phase | SIEVE | LRU | +|---|---:|---:| +| Phase 1 (keys 0–999) | 75.5% | 69.7% | +| Phase 2 (keys 1000–1999) | 75.6% | 69.9% | +| Phase 3 (return to 0–999) | 75.5% | 69.6% | + +*Benchmarks: `benchmarks/bench_sieve.py`, 1M ops, Zipf-distributed keys, seed=42.* ## Where the remaining gap lives -At cache size 128, a cache hit takes ~64ns vs `lru_cache`'s ~39ns: +### Memory backend vs lru_cache + +At cache size 256, a cache hit takes ~55ns vs `lru_cache`'s ~31ns: | Operation | lru_cache (C) | warp_cache (Rust) | Delta | |---|---:|---:|---:| | Call dispatch (`tp_call`) | ~5ns | ~10ns | +5ns | | Hash args (`PyObject_Hash`) | ~15ns | ~15ns | 0 | -| Table lookup + key equality | ~10ns | ~12ns | +2ns | -| LRU reorder (linked list) | ~5ns | ~8ns | +3ns | -| **Lock acquire + release** | **0ns** | **~8ns** | **+8ns** | +| Shard select + RwLock::read | ~0ns | ~8ns | +8ns | +| Table lookup + key equality | ~5ns | ~5ns | 0 | +| SIEVE visited store | ~0ns | ~1ns | +1ns | | Refcount management | ~2ns | ~5ns | +3ns | | Return value | ~2ns | ~2ns | 0 | -| **Total** | **~39ns** | **~60ns** | **+21ns (~34ns measured)** | - -Three categories: +| **Total** | **~31ns** | **~55ns** | **+24ns** | -1. **Irreducible: Thread safety lock (~8ns)** — `lru_cache` pays nothing because - the GIL provides implicit thread safety. We pay ~8ns for an uncontended - `parking_lot` write lock. This cannot be eliminated without removing thread - safety. +Two categories: -2. **Structural: PyO3 call dispatch (~5ns)** — PyO3's `tp_call` shim extracts +1. **Structural: PyO3 call dispatch (~5ns)** — PyO3's `tp_call` shim extracts GIL tokens, validates and converts argument pointers. `lru_cache` receives raw `PyObject*` directly. Inherent to using a safe FFI layer. -3. **Marginal: Reference counting (~3ns)** — `lru_cache` uses the args tuple - pointer as-is. We `Py_INCREF` to own it in `CacheKey`, then `Py_DECREF` on - drop. 
Cost of Rust's ownership model. +2. **Structural: per-shard RwLock (~8ns)** — `parking_lot::RwLock::read()` is + cheap (~8ns uncontended) and enables true parallel reads across shards. + `lru_cache` uses a simple C hash table with no concurrency support. + +Note: cache hits acquire only a **per-shard read lock** — the SIEVE visited bit +uses `AtomicBool::store(Relaxed)` which requires no lock upgrade. The write lock +is only acquired on cache misses for SIEVE eviction. + +### Shared backend vs memory backend -## Could a C extension do better? +The shared backend hit path takes ~109ns vs the memory backend's ~55ns. The resulting delta of roughly 54–55ns is irreducible cross-process overhead: -Yes, by ~15ns/hit — closing the gap to ~0.85x. A C extension would bypass -PyO3's shim and use `PyObject*` directly. But: ~800 lines of manual C with no -borrow checker, no memory safety, and manual `Py_DECREF` tracking. The Rust -implementation is ~400 lines with 6 lines of `unsafe`. +| Operation | Cost | Notes | +|---|---:|---| +| Key serialization (serde fast-path) | ~10ns | Unavoidable for cross-process | +| ahash of key bytes | ~4ns | Deterministic hash (Python's is randomized per-process) | +| Seqlock (read_begin + validate) | ~15ns | Optimistic lock-free read | +| HT lookup in mmap | ~10ns | Slightly slower than hashbrown in heap | +| Value `.to_vec()` copy | ~8ns | Must copy from mmap before seqlock validate | +| Value deserialization (serde) | ~8ns | Unavoidable for cross-process | +| **Total delta** | **~55ns** | | -Even a perfect C implementation cannot reach 1.0x — the lock is the irreducible -cost of thread safety. +No Mutex or RwLock on the shared backend's read path — `ShmCache` uses interior mutability with the seqlock providing all necessary synchronization. 
## Python 3.13+/3.14 free-threading Under free-threaded Python (no GIL), `warp_cache`'s architecture pays off: -- **warp_cache improves**: `RwLock` enables true parallel reads across cores +- **warp_cache improves**: sharded `RwLock` enables true parallel reads across cores - **lru_cache gets worse**: needs a real lock without the GIL's implicit protection - **Trade-off**: atomic refcounting adds ~2-5ns to single-threaded cost @@ -118,9 +215,11 @@ runs all benchmarks. | 1. Serialization + Python wrapper | pickle.dumps, functools.wraps, 2 FFI crossings | ~500K ops/s | 0.02x | | 2. PyObject keys + Rust `__call__` | Precomputed hash, single FFI crossing | ~13-18M ops/s | 0.56-0.68x | | 3. Compiler: fat LTO + codegen-units=1 | Cross-crate inlining of PyO3 wrappers | +10-15% | 0.66-0.74x | -| 4. Static dispatch via enum | Replace `Box` with enum, enables inlining | +5% | — | -| 5. Raw FFI for key equality | `ffi::PyObject_RichCompareBool` instead of `Python::with_gil` | +multi-thread | — | +| 4. Raw FFI for key equality | `ffi::PyObject_RichCompareBool` instead of `Python::with_gil` | +multi-thread | — | +| 5. SIEVE eviction | Unified eviction for both backends, lock-free reads | +12% hit rate | — | +| 6. Sharded RwLock (hashbrown) | Per-shard read locks, true parallel reads across shards | ~18M ops/s | 0.56x | +| 7. Shared backend: remove Mutex | `ShmCache` interior mutability, cached hash state + size limits | cleaner arch | — | --- -*Benchmarks: 100K ops per config, Zipf-distributed keys (2000 unique), `time.perf_counter()`. cachetools 7.0.1. Source: `benchmarks/`* +*Benchmarks: 100K ops per config, Zipf-distributed keys (2000 unique), `time.perf_counter()`. Python 3.13.2, cachetools 7.0.1, moka_py 0.3.0, cachebox 5.2.2. 
Source: `benchmarks/`* diff --git a/docs/usage.md b/docs/usage.md index 21c481e..b74a6db 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -32,30 +32,16 @@ If you need to cache a function that takes unhashable arguments, convert them to hashable equivalents before passing (e.g. `tuple(my_list)`, `tuple(sorted(my_dict.items()))`). -## Eviction strategies +## Eviction -The `Strategy` enum controls how entries are evicted when the cache is full: +warp_cache uses **SIEVE** eviction — a simple, scan-resistant algorithm that provides near-optimal hit rates with O(1) overhead per access. There is no strategy parameter; SIEVE is used automatically for both the memory and shared backends. -```python -from warp_cache import cache, Strategy - -@cache(strategy=Strategy.LRU, max_size=256) -def fetch(url): - ... - -@cache(strategy=Strategy.LFU, max_size=1000) -def lookup(key): - ... -``` +SIEVE works by maintaining a `visited` bit on each cache entry: -| Strategy | Value | Evicts | Best for | -|----------|-------|--------|----------| -| `Strategy.LRU` | `0` | Least recently used (default) | General-purpose caching | -| `Strategy.MRU` | `1` | Most recently used | Scans where old items are re-accessed | -| `Strategy.FIFO` | `2` | Oldest insertion | Simple age-based rotation | -| `Strategy.LFU` | `3` | Least frequently used | Skewed access patterns with hot keys | +- **On cache hit**: the entry's `visited` bit is set to 1 (protecting it from eviction) +- **On eviction**: a rotating "hand" scans the cache. Entries with `visited=1` get a second chance (bit cleared to 0, hand advances). The first entry found with `visited=0` is evicted. -`Strategy` is an `IntEnum`, so you can also pass the integer value directly (e.g. `strategy=0` for LRU). +This means frequently-accessed entries are protected, while entries that were cached but never re-accessed are evicted first — similar to LRU but with better scan resistance and lower overhead. 
## Async functions @@ -107,7 +93,7 @@ from warp_cache import cache, Backend The memory backend keeps all cached data in the process's own heap. Keys are stored as live Python objects (no serialization), and lookups go through a single Rust `__call__` — hash, lookup, equality check, and return all happen in one FFI crossing with no copying. -Thread safety is provided by a `parking_lot::RwLock` (~8ns uncontended). This is the fastest backend, reaching **14-20M ops/s** single-threaded. +Thread safety is provided by a sharded `hashbrown::HashMap` with `parking_lot::RwLock` per shard — cache hits acquire only a cheap per-shard read lock (~8ns). The write lock is acquired only on cache misses for SIEVE eviction. ```python @cache(max_size=256) # backend="memory" is the default @@ -135,11 +121,11 @@ def get_embedding(text: str) -> list[float]: - **Lock file** — holds a seqlock (sequence counter + spinlock) for cross-process synchronization. Reads are optimistic and lock-free; only writes acquire the spinlock - File location: `/dev/shm/` on Linux, `$TMPDIR/warp_cache/` on macOS - The file name is derived deterministically from the function's `__module__` and `__qualname__`, so the same function in different processes maps to the same cache automatically -- If an existing cache file has different parameters (capacity, strategy, key/value sizes), it is recreated +- If an existing cache file has different parameters (capacity, key/value sizes, version), it is recreated **Serialization overhead:** -Both keys and values are serialized with `pickle.dumps` on write and `pickle.loads` on read. This adds significant per-operation cost compared to the memory backend, which stores live Python objects directly. Expect roughly **2x** lower throughput depending on the size and complexity of your keys and values — the seqlock made reads near-free; the gap is now dominated by pickle serialization. 
The shared backend is designed for cases where the cached computation is expensive enough (network I/O, ML inference, heavy math) that the serialization cost is negligible in comparison. +Keys and values are serialized using a fast-path binary format for common primitives (None, bool, int, float, str, bytes, flat tuples) with pickle fallback for complex types. This adds per-operation cost compared to the memory backend, which stores live Python objects directly. Expect roughly **2x** lower throughput — the gap is irreducible cross-process overhead: serialization, deterministic hashing, seqlock, and mmap copy. No Mutex is used; all reads are fully lock-free. The shared backend is designed for cases where the cached computation is expensive enough (network I/O, ML inference, heavy math) that the serialization cost is negligible in comparison. **Size limits:** @@ -203,7 +189,6 @@ with ThreadPoolExecutor(max_workers=8) as pool: | Parameter | Type | Default | Description | |-----------|------|---------|-------------| -| `strategy` | `Strategy` | `Strategy.LRU` | Eviction strategy | | `max_size` | `int` | `128` | Maximum number of cached entries | | `ttl` | `float \| None` | `None` | Time-to-live in seconds (`None` = no expiry) | | `backend` | `str \| int \| Backend` | `Backend.MEMORY` | `"memory"` for in-process, `"shared"` for cross-process | diff --git a/examples/gunicorn_app.py b/examples/gunicorn_app.py index 1393b8b..7a5b7d0 100644 --- a/examples/gunicorn_app.py +++ b/examples/gunicorn_app.py @@ -20,10 +20,10 @@ import json import os -from warp_cache import Strategy, cache +from warp_cache import cache -@cache(strategy=Strategy.LRU, max_size=1024, ttl=60.0, backend="shared") +@cache(max_size=1024, ttl=60.0, backend="shared") def expensive_compute(n): """Simulate a CPU-heavy computation shared across workers.""" data = str(n).encode() @@ -51,13 +51,15 @@ def app(environ, start_response): if path == "/stats": info = expensive_compute.cache_info() - body = 
json.dumps({ - "pid": os.getpid(), - "hits": info.hits, - "misses": info.misses, - "max_size": info.max_size, - "current_size": info.current_size, - }) + body = json.dumps( + { + "pid": os.getpid(), + "hits": info.hits, + "misses": info.misses, + "max_size": info.max_size, + "current_size": info.current_size, + } + ) start_response("200 OK", [("Content-Type", "application/json")]) return [body.encode()] diff --git a/examples/strategies.py b/examples/strategies.py index d395e8f..e98ed75 100644 --- a/examples/strategies.py +++ b/examples/strategies.py @@ -2,40 +2,45 @@ # requires-python = ">=3.10" # dependencies = ["warp_cache"] # /// -"""Eviction strategies — LRU, MRU, FIFO, LFU.""" +"""SIEVE eviction — scan-resistant caching with second chances.""" import logging -from warp_cache import Strategy, cache +from warp_cache import cache logging.basicConfig(level=logging.INFO, format="%(message)s") log = logging.getLogger(__name__) -def demo_strategy(name, strategy): - @cache(strategy=strategy, max_size=3) +if __name__ == "__main__": + log.info("SIEVE eviction demo: visited entries get a second chance\n") + + call_count = 0 + + @cache(max_size=3) def fn(x): + global call_count + call_count += 1 return x * 10 - # Fill the cache: [1, 2, 3] + # Fill the cache: [1, 2, 3] — all unvisited fn(1) fn(2) fn(3) + log.info("After inserting 1, 2, 3: %s", fn.cache_info()) - # Access 1 and 2 again (affects LRU/LFU ordering) + # Access 1 and 2 — marks them as visited (protected) fn(1) fn(2) + log.info("After accessing 1 and 2 (now visited): hits=%d", fn.cache_info().hits) - # Insert 4 — triggers eviction + # Insert 4 — triggers eviction. Entry 3 is unvisited, so it's evicted. 
fn(4) - - info = fn.cache_info() - log.info("%4s: hits=%d, misses=%d, size=%d", name, info.hits, info.misses, info.current_size) - - -if __name__ == "__main__": - log.info("Each strategy evicts a different entry when the cache is full:\n") - demo_strategy("LRU", Strategy.LRU) # Evicts least recently used (3) - demo_strategy("MRU", Strategy.MRU) # Evicts most recently used (2) - demo_strategy("FIFO", Strategy.FIFO) # Evicts first inserted (1) - demo_strategy("LFU", Strategy.LFU) # Evicts least frequently used (3) + log.info("After inserting 4: %s", fn.cache_info()) + + # Verify: 3 was evicted (miss), 1 and 2 survived (hit) + call_count = 0 + fn(1) # hit + fn(2) # hit + fn(3) # miss — was evicted + log.info("Accessing 1 (hit), 2 (hit), 3 (miss — evicted): recomputed=%d", call_count) diff --git a/llms-full.txt b/llms-full.txt index a637d98..70940b4 100644 --- a/llms-full.txt +++ b/llms-full.txt @@ -1,6 +1,6 @@ # warp_cache -> warp_cache is a thread-safe Python caching decorator backed by a Rust extension (PyO3). It provides LRU/MRU/FIFO/LFU eviction, TTL support, async awareness, and a cross-process shared memory backend. It is a drop-in replacement for `functools.lru_cache` with added thread safety and features. +> warp_cache is a thread-safe Python caching decorator backed by a Rust extension (PyO3). It uses SIEVE eviction (scan-resistant, near-optimal hit rates), with TTL support, async awareness, and a cross-process shared memory backend. It is a drop-in replacement for `functools.lru_cache` with added thread safety and features. ## Installation @@ -15,18 +15,17 @@ Prebuilt wheels are available for Linux (x86_64, aarch64), macOS (x86_64, arm64) All public names are importable from `warp_cache`: ```python -from warp_cache import cache, lru_cache, Strategy, Backend, CacheInfo, SharedCacheInfo +from warp_cache import cache, Backend, CacheInfo, SharedCacheInfo ``` ### `cache()` decorator -The main decorator. Wraps a function with a Rust-backed cache. 
+The main decorator. Wraps a function with a Rust-backed cache using SIEVE eviction. ```python from warp_cache import cache @cache( - strategy=Strategy.LRU, # Eviction strategy (LRU, MRU, FIFO, LFU) max_size=128, # Maximum number of cached entries ttl=None, # Time-to-live in seconds (None = no expiry) backend="memory", # "memory" (in-process) or "shared" (cross-process mmap) @@ -39,29 +38,6 @@ def my_function(x, y): All arguments to the decorated function must be hashable. -### `lru_cache()` decorator - -Convenience shorthand for `cache(strategy=Strategy.LRU, ...)`. Accepts all the same parameters except `strategy`. - -```python -from warp_cache import lru_cache - -@lru_cache(max_size=256, ttl=60.0) -def my_function(x): - return x * x -``` - -### `Strategy` enum - -Controls eviction when the cache is full. `Strategy` is an `IntEnum`. - -| Strategy | Value | Evicts | Best for | -|-----------------|-------|--------------------------|--------------------------------------| -| `Strategy.LRU` | `0` | Least recently used | General-purpose caching (default) | -| `Strategy.MRU` | `1` | Most recently used | Scans where old items are re-accessed| -| `Strategy.FIFO` | `2` | Oldest insertion | Simple age-based rotation | -| `Strategy.LFU` | `3` | Least frequently used | Skewed access patterns with hot keys | - ### `Backend` enum Selects where cached data is stored. `Backend` is an `IntEnum`. The decorator also accepts the strings `"memory"` and `"shared"`. 
@@ -132,16 +108,6 @@ def compute(x, y): return x + y ``` -Or use the `lru_cache` shorthand: - -```python -from warp_cache import lru_cache - -@lru_cache(max_size=128) -def compute(x, y): - return x + y -``` - ### TTL (time-to-live) ```python @@ -203,21 +169,6 @@ Shared backend details: - Not available on Windows (`backend="memory"` works everywhere) - Monitor oversize skips: `fn.cache_info().oversize_skips` -### Eviction strategies - -```python -from warp_cache import cache, Strategy - -@cache(strategy=Strategy.LRU, max_size=256) -def fetch(url): ... - -@cache(strategy=Strategy.LFU, max_size=1000) -def lookup(key): ... - -@cache(strategy=Strategy.FIFO, max_size=100) -def rotate(item): ... -``` - ## Platform support | Platform | `backend="memory"` | `backend="shared"` | diff --git a/llms.txt b/llms.txt index edf8d72..aaccb93 100644 --- a/llms.txt +++ b/llms.txt @@ -1,10 +1,10 @@ # warp_cache -> warp_cache is a thread-safe Python caching decorator backed by a Rust extension (PyO3). It provides LRU/MRU/FIFO/LFU eviction, TTL support, async awareness, and a cross-process shared memory backend. It is a drop-in replacement for `functools.lru_cache` with added thread safety and features. +> warp_cache is a thread-safe Python caching decorator backed by a Rust extension (PyO3). It uses SIEVE eviction (scan-resistant, near-optimal hit rates), with TTL support, async awareness, and a cross-process shared memory backend. It is a drop-in replacement for `functools.lru_cache` with added thread safety and features. 
## Docs -- [Usage guide](https://github.com/tolo/warp_cache/blob/main/docs/usage.md): Eviction strategies, async, TTL, shared memory, decorator parameters +- [Usage guide](https://github.com/tolo/warp_cache/blob/main/docs/usage.md): SIEVE eviction, async, TTL, shared memory, decorator parameters - [Performance](https://github.com/tolo/warp_cache/blob/main/docs/performance.md): Benchmarks and architecture deep-dive - [Alternatives](https://github.com/tolo/warp_cache/blob/main/docs/alternatives.md): Comparison with cachebox, moka-py, cachetools, lru_cache - [Development](https://github.com/tolo/warp_cache/blob/main/docs/development.md): Building from source, running tests diff --git a/src/entry.rs b/src/entry.rs index b10edf2..bbf97cc 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -1,8 +1,20 @@ -use pyo3::prelude::*; +use std::sync::atomic::{AtomicBool, Ordering}; use std::time::Instant; -pub struct CacheEntry { +use pyo3::prelude::*; + +pub struct SieveEntry { pub value: Py, pub created_at: Instant, - pub frequency: u64, + pub visited: AtomicBool, +} + +impl Clone for SieveEntry { + fn clone(&self) -> Self { + Python::attach(|py| SieveEntry { + value: self.value.clone_ref(py), + created_at: self.created_at, + visited: AtomicBool::new(self.visited.load(Ordering::Relaxed)), + }) + } } diff --git a/src/key.rs b/src/key.rs index f3661db..8bc30e9 100644 --- a/src/key.rs +++ b/src/key.rs @@ -50,3 +50,10 @@ impl PartialEq for CacheKey { } impl Eq for CacheKey {} + +impl CacheKey { + #[inline(always)] + pub fn shard_index(&self, n_shards: usize) -> usize { + self.hash as usize % n_shards + } +} diff --git a/src/lib.rs b/src/lib.rs index 56e6dd8..6296fb1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,6 @@ mod entry; mod key; mod store; -mod strategies; #[cfg(not(target_os = "windows"))] mod serde; diff --git a/src/shared_store.rs b/src/shared_store.rs index 50a38c4..764e95f 100644 --- a/src/shared_store.rs +++ b/src/shared_store.rs @@ -42,18 +42,20 @@ pub struct 
SharedCachedFunction { fn_obj: Py, pickle_dumps: Py, pickle_loads: Py, - cache: parking_lot::Mutex, + cache: ShmCache, + max_key_size: usize, + max_value_size: usize, + hash_state: RandomState, } #[pymethods] impl SharedCachedFunction { #[new] - #[pyo3(signature = (fn_obj, strategy, max_size, ttl=None, max_key_size=512, max_value_size=4096, shm_name=None))] + #[pyo3(signature = (fn_obj, max_size, ttl=None, max_key_size=512, max_value_size=4096, shm_name=None))] #[allow(clippy::too_many_arguments)] fn new( py: Python<'_>, fn_obj: Py, - strategy: u8, max_size: usize, ttl: Option, max_key_size: usize, @@ -72,7 +74,6 @@ impl SharedCachedFunction { let cache = ShmCache::create_or_open( &name, - strategy as u32, max_size as u32, max_key_size as u32, max_value_size as u32, @@ -82,11 +83,21 @@ impl SharedCachedFunction { pyo3::exceptions::PyOSError::new_err(format!("Failed to create shared cache: {e}")) })?; + let hash_state = RandomState::with_seeds( + 0x517cc1b727220a95, + 0x6c62272e07bb0142, + 0x0f1e2d3c4b5a6978, + 0xa1b2c3d4e5f60718, + ); + Ok(SharedCachedFunction { fn_obj, pickle_dumps, pickle_loads, - cache: parking_lot::Mutex::new(cache), + cache, + max_key_size, + max_value_size, + hash_state, }) } @@ -99,32 +110,22 @@ impl SharedCachedFunction { ) -> PyResult> { let (key_hash, key_bytes) = self.make_key(py, &args, &kwargs)?; - // Check size limits - { - let cache = self.cache.lock(); - if cache.is_oversize(&key_bytes, &[]) { - cache.record_oversize_skip(); - drop(cache); - return self - .fn_obj - .bind(py) - .call(args, kwargs.as_ref()) - .map(|r| r.unbind()); - } + // Check key size limit (no lock needed — uses cached struct field) + if key_bytes.len() > self.max_key_size { + self.cache.record_oversize_skip(); + return self + .fn_obj + .bind(py) + .call(args, kwargs.as_ref()) + .map(|r| r.unbind()); } - // Lookup in shared cache - let value_bytes: Option> = { - let cache = self.cache.lock(); - match cache.get(key_hash, &key_bytes) { - ShmGetResult::Hit(v) => 
Some(v), - ShmGetResult::Miss => None, + // Lookup in shared cache (lock-free via seqlock) + match self.cache.get(key_hash, &key_bytes) { + ShmGetResult::Hit(vb) => { + return self.deserialize_value(py, &vb); } - }; - - // On hit: deserialize and return - if let Some(vb) = value_bytes { - return self.deserialize_value(py, &vb); + ShmGetResult::Miss => {} } // Cache miss: call the wrapped function @@ -145,8 +146,7 @@ impl SharedCachedFunction { ) -> PyResult>> { let (key_hash, key_bytes) = self.make_key(py, &args, &kwargs)?; - let cache = self.cache.lock(); - match cache.get(key_hash, &key_bytes) { + match self.cache.get(key_hash, &key_bytes) { ShmGetResult::Hit(vb) => { let value = self.deserialize_value(py, &vb)?; Ok(Some(value)) @@ -171,8 +171,7 @@ impl SharedCachedFunction { } fn cache_info(&self) -> SharedCacheInfo { - let cache = self.cache.lock(); - let info = cache.info(); + let info = self.cache.info(); SharedCacheInfo { hits: info.hits, misses: info.misses, @@ -183,8 +182,7 @@ impl SharedCachedFunction { } fn cache_clear(&self) { - let mut cache = self.cache.lock(); - cache.clear(); + self.cache.clear(); } } @@ -222,13 +220,7 @@ impl SharedCachedFunction { // Hash the serialized bytes — deterministic across processes // (Python's hash() is randomized per-process for str/bytes) let key_hash = { - let state = RandomState::with_seeds( - 0x517cc1b727220a95, - 0x6c62272e07bb0142, - 0x0f1e2d3c4b5a6978, - 0xa1b2c3d4e5f60718, - ); - let mut hasher = state.build_hasher(); + let mut hasher = self.hash_state.build_hasher(); hasher.write(&key_bytes); hasher.finish() }; @@ -253,18 +245,14 @@ impl SharedCachedFunction { serde::wrap_pickle(pickle_bytes) }; - { - let cache = self.cache.lock(); - if cache.is_oversize(key_bytes, &value_bytes) { - cache.record_oversize_skip(); - return Ok(()); - } + // Check size limits (no lock — uses cached struct fields) + if key_bytes.len() > self.max_key_size || value_bytes.len() > self.max_value_size { + 
self.cache.record_oversize_skip(); + return Ok(()); } - { - let mut cache = self.cache.lock(); - cache.insert(key_hash, key_bytes, &value_bytes); - } + // Insert acquires seqlock write lock internally + self.cache.insert(key_hash, key_bytes, &value_bytes); Ok(()) } diff --git a/src/shm/layout.rs b/src/shm/layout.rs index af1f8c9..9d04ee2 100644 --- a/src/shm/layout.rs +++ b/src/shm/layout.rs @@ -6,9 +6,9 @@ /// Magic bytes at the start of the header to validate the mapping. pub const MAGIC: [u8; 8] = *b"FCACHE01"; -/// Layout version — bumped when the lock format changes. -/// v1 = pthread_rwlock, v2 = seqlock. -pub const VERSION: u32 = 2; +/// Layout version — bumped when the format changes. +/// v1 = pthread_rwlock, v2 = seqlock, v3 = SIEVE eviction. +pub const VERSION: u32 = 3; /// Size of the fixed header at the start of the region. pub const HEADER_SIZE: usize = 256; @@ -45,7 +45,7 @@ pub struct Header { pub list_head: i32, // 72..76 (eviction list, SLOT_NONE = empty) pub list_tail: i32, // 76..80 pub free_head: i32, // 80..84 - pub _reserved: i32, // 84..88 (alignment padding) + pub sieve_hand: i32, // 84..88 (SIEVE eviction hand position) // Explicit padding to 256 bytes: 256 - 88 = 168 pub _pad: [u8; 168], @@ -79,8 +79,8 @@ pub struct SlotHeader { // 8-byte aligned group pub key_hash: u64, // 0..8 pub created_at_nanos: u64, // 8..16 (monotonic nanos) - pub frequency: u64, // 16..24 - pub unique_id: u64, // 24..32 (monotonic ID for LFU) + pub visited: u64, // 16..24 (SIEVE: 0=unvisited, 1=visited) + pub unique_id: u64, // 24..32 (unused, kept for layout stability) // 4-byte aligned group pub occupied: u32, // 32..36 (1 = occupied, 0 = free) diff --git a/src/shm/mod.rs b/src/shm/mod.rs index c2f93e5..9ce6ba9 100644 --- a/src/shm/mod.rs +++ b/src/shm/mod.rs @@ -1,12 +1,12 @@ /// Shared-memory cache backend. /// -/// Provides `ShmCache` — a cross-process LRU/MRU/FIFO/LFU cache backed -/// by mmap. 
All data (header, hash table, slab arena) lives in a single +/// Provides `ShmCache` — a cross-process SIEVE cache backed by mmap. +/// All data (header, hash table, slab arena) lives in a single /// memory-mapped file. A separate mmap file holds the seqlock. /// -/// Read path uses an optimistic seqlock: lock-free hash lookup + value copy, -/// then a brief write lock only when ordering updates are needed (LRU/MRU/LFU). -/// FIFO reads are fully lock-free. Stats are updated via atomics (no lock). +/// Read path uses an optimistic seqlock: lock-free hash lookup + value copy. +/// On hit, `visited` is set to 1 without a write lock (idempotent store). +/// All reads are fully lock-free. Stats are updated via atomics (no lock). pub mod hashtable; pub mod layout; pub mod lock; @@ -42,14 +42,13 @@ enum OptimisticResult { /// at the same mmap. pub struct ShmCache { region: ShmRegion, - next_unique_id: u64, + next_unique_id: AtomicU64, } impl ShmCache { /// Create or open a shared cache. pub fn create_or_open( name: &str, - strategy: u32, capacity: u32, max_key_size: u32, max_value_size: u32, @@ -63,7 +62,6 @@ impl ShmCache { let region = ShmRegion::create_or_open( name, - strategy, capacity, slot_size, max_key_size, @@ -73,7 +71,7 @@ impl ShmCache { Ok(ShmCache { region, - next_unique_id: 0, + next_unique_id: AtomicU64::new(0), }) } @@ -136,12 +134,6 @@ impl ShmCache { unsafe { &*(self.base_ptr().add(32) as *const AtomicU64) } } - /// Check if key/value sizes exceed limits. Returns true if oversize. - pub fn is_oversize(&self, key_bytes: &[u8], value_bytes: &[u8]) -> bool { - let h = self.header(); - key_bytes.len() > h.max_key_size as usize || value_bytes.len() > h.max_value_size as usize - } - /// Bounds-checked hash table lookup for the optimistic read path. /// /// Mirrors `hashtable::ht_lookup` but adds bounds checks to guard against @@ -258,8 +250,9 @@ impl ShmCache { /// Look up a key (by hash + serialized bytes). Returns a copy of the value bytes on hit. 
/// - /// Uses optimistic seqlock reads. Only acquires the write lock when ordering - /// needs updating (LRU/MRU/LFU hit) or when removing an expired entry. + /// Uses optimistic seqlock reads. Fully lock-free: on hit, sets `visited=1` + /// via a direct store (idempotent, safe even if the slot is concurrently evicted + /// because the mmap memory remains valid). pub fn get(&self, key_hash: u64, key_bytes: &[u8]) -> ShmGetResult { let lock = self.lock(); @@ -267,31 +260,14 @@ impl ShmCache { match result { OptimisticResult::Hit { value, slot_index } => { - let strategy = self.header().strategy; - - // FIFO: no ordering update needed — fully lock-free - if strategy != 2 { - // LRU/MRU/LFU: brief write lock for ordering update - lock.write_lock(); - unsafe { - // Re-verify the slot is still valid (another writer may have evicted it) - let slot_size = self.header().slot_size; - let slot_ptr = self - .slab_base() - .add(slot_index as usize * slot_size as usize); - let slot = &*(slot_ptr as *const SlotHeader); - if slot.occupied != 0 && slot.key_hash == key_hash { - let header = self.header_mut(); - ordering::on_access( - header, - self.slab_base_mut(), - slot_size, - slot_index, - strategy, - ); - } - } - lock.write_unlock(); + // SIEVE: mark as visited — lock-free, idempotent store + unsafe { + let slot_size = self.header().slot_size; + let slot_ptr = self + .slab_base_mut() + .add(slot_index as usize * slot_size as usize); + let slot = &mut *(slot_ptr as *mut SlotHeader); + slot.visited = 1; } // Stats: atomic, no lock needed @@ -332,18 +308,17 @@ impl ShmCache { } /// Insert a key-value pair. Evicts if necessary. 
- pub fn insert(&mut self, key_hash: u64, key_bytes: &[u8], value_bytes: &[u8]) { + pub fn insert(&self, key_hash: u64, key_bytes: &[u8], value_bytes: &[u8]) { let lock = self.lock(); lock.write_lock(); unsafe { self.insert_inner(key_hash, key_bytes, value_bytes) }; lock.write_unlock(); } - unsafe fn insert_inner(&mut self, key_hash: u64, key_bytes: &[u8], value_bytes: &[u8]) { + unsafe fn insert_inner(&self, key_hash: u64, key_bytes: &[u8], value_bytes: &[u8]) { let h = self.header(); let ht_cap = h.ht_capacity; let slot_size = h.slot_size; - let strategy = h.strategy; let capacity = h.capacity; // Check if key already exists — update value in place @@ -362,12 +337,10 @@ impl ShmCache { let slot = &mut *(slot_ptr as *mut SlotHeader); slot.value_len = value_bytes.len() as u32; slot.created_at_nanos = current_time_nanos(); + slot.visited = 1; let value_dest = slot_ptr.add(SLOT_HEADER_SIZE + slot.key_len as usize); std::ptr::copy_nonoverlapping(value_bytes.as_ptr(), value_dest, value_bytes.len()); - - let header = self.header_mut(); - ordering::on_access(header, self.slab_base_mut(), slot_size, idx, strategy); return; } @@ -381,8 +354,8 @@ impl ShmCache { header.free_head = free_slot.next; idx } else if header.current_size >= capacity { - // Need to evict - let evict_idx = ordering::evict_candidate(header, strategy); + // Need to evict — SIEVE picks the victim + let evict_idx = ordering::sieve_evict(header, self.slab_base_mut(), slot_size); if evict_idx == SLOT_NONE { return; // shouldn't happen } @@ -425,11 +398,10 @@ impl ShmCache { slot.key_len = key_bytes.len() as u32; slot.value_len = value_bytes.len() as u32; slot.created_at_nanos = current_time_nanos(); - slot.frequency = 0; + slot.visited = 0; slot.prev = SLOT_NONE; slot.next = SLOT_NONE; - slot.unique_id = self.next_unique_id; - self.next_unique_id += 1; + slot.unique_id = self.next_unique_id.fetch_add(1, AtomicOrdering::Relaxed); // Copy key bytes let key_dest = slot_ptr.add(SLOT_HEADER_SIZE); @@ -442,9 
+414,9 @@ impl ShmCache { // Insert into hash table hashtable::ht_insert(self.ht_base_mut(), ht_cap, key_hash, slot_idx); - // Add to eviction list + // Add to eviction list (SIEVE: new entries go to tail, unvisited) let header = self.header_mut(); - ordering::on_insert(header, self.slab_base_mut(), slot_size, slot_idx, strategy); + ordering::list_push_tail(header, self.slab_base_mut(), slot_size, slot_idx); header.current_size += 1; } @@ -485,14 +457,14 @@ impl ShmCache { } /// Clear the entire cache. - pub fn clear(&mut self) { + pub fn clear(&self) { let lock = self.lock(); lock.write_lock(); unsafe { self.clear_inner() }; lock.write_unlock(); } - unsafe fn clear_inner(&mut self) { + unsafe fn clear_inner(&self) { let h = self.header(); let ht_cap = h.ht_capacity; let slot_size = h.slot_size; @@ -522,6 +494,7 @@ impl ShmCache { header.list_head = SLOT_NONE; header.list_tail = SLOT_NONE; header.free_head = 0; + header.sieve_hand = SLOT_NONE; } /// Increment oversize skip counter. Lock-free via atomic. diff --git a/src/shm/ordering.rs b/src/shm/ordering.rs index b73b878..58e814a 100644 --- a/src/shm/ordering.rs +++ b/src/shm/ordering.rs @@ -1,7 +1,6 @@ -/// Intrusive doubly-linked list for eviction ordering. +/// Intrusive doubly-linked list and SIEVE eviction for shared memory. /// /// Uses prev/next indices stored in each slot header. -/// Supports LRU, MRU, FIFO, and LFU eviction strategies. use super::layout::{Header, SlotHeader, SLOT_NONE}; /// Get a reference to a slot header. @@ -43,7 +42,7 @@ pub unsafe fn list_remove(header: &mut Header, slab_base: *mut u8, slot_size: u3 s.next = SLOT_NONE; } -/// Push a slot to the tail of the eviction list (most recently used position). +/// Push a slot to the tail of the eviction list. /// /// # Safety /// Caller must hold write lock. 
@@ -61,142 +60,53 @@ pub unsafe fn list_push_tail(header: &mut Header, slab_base: *mut u8, slot_size: header.list_tail = index; } -/// Move a slot to the tail of the list (touch for LRU/MRU). +/// SIEVE eviction: find a victim slot to evict. /// -/// # Safety -/// Caller must hold write lock. -pub unsafe fn list_move_to_tail( - header: &mut Header, - slab_base: *mut u8, - slot_size: u32, - index: i32, -) { - list_remove(header, slab_base, slot_size, index); - list_push_tail(header, slab_base, slot_size, index); -} - -/// For LFU: insert a slot in sorted position by (frequency ASC, unique_id ASC). +/// Scans from `header.sieve_hand` (or `list_head` if SLOT_NONE). +/// - If `visited == 1`: clear to 0, advance hand. +/// - If `visited == 0`: set hand to next, return this index as victim. /// -/// Scans from the tail (highest frequency) toward head. +/// Returns the slot index to evict, or `SLOT_NONE` if the list is empty. /// /// # Safety /// Caller must hold write lock. -pub unsafe fn list_insert_lfu(header: &mut Header, slab_base: *mut u8, slot_size: u32, index: i32) { - let new_slot = slot(slab_base, slot_size, index); - let new_freq = new_slot.frequency; - let new_uid = new_slot.unique_id; - - // Find insertion point: scan from tail backward - let mut cursor = header.list_tail; - while cursor != SLOT_NONE { - let cs = slot(slab_base, slot_size, cursor); - // Insert after cursor if cursor's freq < new_freq, - // or (same freq and cursor's uid < new_uid) - if cs.frequency < new_freq || (cs.frequency == new_freq && cs.unique_id <= new_uid) { - // Insert after cursor - let s = slot_mut(slab_base, slot_size, index); - s.prev = cursor; - s.next = slot(slab_base, slot_size, cursor).next; - - if s.next != SLOT_NONE { - slot_mut(slab_base, slot_size, s.next).prev = index; - } else { - header.list_tail = index; - } - - slot_mut(slab_base, slot_size, cursor).next = index; - return; - } - cursor = cs.prev; +pub unsafe fn sieve_evict(header: &mut Header, slab_base: *mut u8, 
slot_size: u32) -> i32 { + if header.list_head == SLOT_NONE { + return SLOT_NONE; } - // Insert at head - let s = slot_mut(slab_base, slot_size, index); - s.prev = SLOT_NONE; - s.next = header.list_head; - - if header.list_head != SLOT_NONE { - slot_mut(slab_base, slot_size, header.list_head).prev = index; - } else { - header.list_tail = index; - } - - header.list_head = index; -} - -/// Pick the slot to evict based on the strategy. -/// -/// Returns the slot index to evict, or SLOT_NONE if the list is empty. -/// -/// - LRU (0): evict head (least recently used) -/// - MRU (1): evict tail (most recently used) -/// - FIFO (2): evict head (oldest insertion) -/// - LFU (3): evict head (lowest frequency — list is sorted) -pub fn evict_candidate(header: &Header, strategy: u32) -> i32 { - match strategy { - 0 | 2 | 3 => header.list_head, // LRU, FIFO, LFU: evict from head - 1 => header.list_tail, // MRU: evict from tail - _ => header.list_head, + let mut hand = header.sieve_hand; + if hand == SLOT_NONE { + hand = header.list_head; } -} -/// Called on cache hit to update ordering. -/// -/// - LRU: move to tail -/// - MRU: move to tail -/// - FIFO: no-op (insertion order preserved) -/// - LFU: increment frequency, reposition in sorted list -/// -/// # Safety -/// Caller must hold write lock. -pub unsafe fn on_access( - header: &mut Header, - slab_base: *mut u8, - slot_size: u32, - index: i32, - strategy: u32, -) { - match strategy { - 0 | 1 => { - // LRU/MRU: move to tail - list_move_to_tail(header, slab_base, slot_size, index); - } - 2 => { - // FIFO: no reordering on access - } - 3 => { - // LFU: increment frequency and reposition - let s = slot_mut(slab_base, slot_size, index); - s.frequency += 1; - list_remove(header, slab_base, slot_size, index); - list_insert_lfu(header, slab_base, slot_size, index); + // We may need to scan up to 2× the list length (one pass to clear visited bits, + // one pass to find an unvisited entry). 
+ let capacity = header.capacity; + for _ in 0..capacity * 2 { + let s = slot_mut(slab_base, slot_size, hand); + if s.visited != 0 { + // Second chance: clear visited bit, advance hand + s.visited = 0; + let next = s.next; + hand = if next != SLOT_NONE { + next + } else { + header.list_head + }; + } else { + // Found an unvisited entry — evict it + let next = s.next; + header.sieve_hand = if next != SLOT_NONE { + next + } else { + header.list_head + }; + return hand; } - _ => {} } -} -/// Called on insert to add the new slot to the eviction list. -/// -/// # Safety -/// Caller must hold write lock. -pub unsafe fn on_insert( - header: &mut Header, - slab_base: *mut u8, - slot_size: u32, - index: i32, - strategy: u32, -) { - match strategy { - 0..=2 => { - // LRU/MRU/FIFO: append to tail - list_push_tail(header, slab_base, slot_size, index); - } - 3 => { - // LFU: insert in sorted position (frequency = 0 → near head) - list_insert_lfu(header, slab_base, slot_size, index); - } - _ => { - list_push_tail(header, slab_base, slot_size, index); - } - } + // Fallback: evict whatever the hand is pointing at + header.sieve_hand = SLOT_NONE; + hand } diff --git a/src/shm/region.rs b/src/shm/region.rs index 01f9235..2a49c7d 100644 --- a/src/shm/region.rs +++ b/src/shm/region.rs @@ -35,7 +35,6 @@ impl ShmRegion { /// Create a new shared memory region, initializing all structures. 
pub fn create( name: &str, - strategy: u32, capacity: u32, slot_size: u32, max_key_size: u32, @@ -84,7 +83,7 @@ impl ShmRegion { let header = unsafe { &mut *(mmap.as_mut_ptr() as *mut Header) }; header.magic = MAGIC; header.version = VERSION; - header.strategy = strategy; + header.strategy = 0; header.capacity = capacity; header.ht_capacity = ht_capacity; header.slot_size = slot_size; @@ -98,6 +97,7 @@ impl ShmRegion { header.list_head = SLOT_NONE; header.list_tail = SLOT_NONE; header.free_head = 0; // first slot is start of free list + header.sieve_hand = SLOT_NONE; // Initialize hash table buckets to empty let ht_base = layout::ht_offset(); @@ -182,7 +182,6 @@ impl ShmRegion { /// Create if doesn't exist, otherwise open. pub fn create_or_open( name: &str, - strategy: u32, capacity: u32, slot_size: u32, max_key_size: u32, @@ -200,7 +199,6 @@ impl ShmRegion { let header = region.header(); if header.version == VERSION && header.capacity == capacity - && header.strategy == strategy && header.max_key_size == max_key_size && header.max_value_size == max_value_size { @@ -217,7 +215,6 @@ impl ShmRegion { Self::create( name, - strategy, capacity, slot_size, max_key_size, diff --git a/src/store.rs b/src/store.rs index 91a25e0..1027dfc 100644 --- a/src/store.rs +++ b/src/store.rs @@ -1,39 +1,23 @@ -use std::sync::atomic::{AtomicU64, Ordering}; +use std::collections::VecDeque; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::time::{Duration, Instant}; -use parking_lot::{Mutex, RwLock}; +use hashbrown::HashMap; +use parking_lot::RwLock; use pyo3::prelude::*; use pyo3::types::{PyDict, PyTuple}; -use crate::entry::CacheEntry; +use crate::entry::SieveEntry; use crate::key::CacheKey; -use crate::strategies::fifo::FifoStrategy; -use crate::strategies::lfu::LfuStrategy; -use crate::strategies::lru::LruStrategy; -use crate::strategies::mru::MruStrategy; -use crate::strategies::StrategyEnum; - -/// Maximum number of deferred ordering updates buffered before a write 
lock -/// is acquired. Cache hits under the read lock append to this log; when a -/// write lock is taken (on miss or when the log is full), the log is drained -/// and ordering is replayed. If the log fills up between write-lock -/// acquisitions, additional hit-ordering updates are silently dropped — this -/// makes eviction ordering *approximate* under sustained hit-only workloads. -const ACCESS_LOG_CAPACITY: usize = 64; - -struct CacheStoreInner { - strategy: StrategyEnum, - ttl: Option, -} -impl CacheStoreInner { - /// Drain the access log and replay deferred ordering updates. - #[inline(always)] - fn drain_access_log(&mut self, log: &mut Vec) { - for key in log.drain(..) { - self.strategy.record_access(&key); - } - } +const MAX_SHARDS: usize = 16; +const MIN_SHARD_SIZE: usize = 8; + +struct Shard { + map: HashMap, + order: VecDeque, + hand: usize, + capacity: usize, } #[pyclass(frozen)] @@ -61,32 +45,114 @@ impl CacheInfo { #[pyclass(frozen)] pub struct CachedFunction { fn_obj: Py, - inner: RwLock, - access_log: Mutex>, + shards: Box<[RwLock]>, + n_shards: usize, + ttl: Option, + max_size: usize, hits: AtomicU64, misses: AtomicU64, } +impl CachedFunction { + /// Evict one entry using the SIEVE algorithm. + /// Called with the shard write-locked (caller passes `&mut Shard`). 
+ fn evict_one(shard: &mut Shard) { + let initial_len = shard.order.len(); + if initial_len == 0 { + return; + } + + let mut scanned = 0; + while scanned <= initial_len { + if shard.order.is_empty() { + break; + } + if shard.hand >= shard.order.len() { + shard.hand = 0; + } + + let key = shard.order[shard.hand].clone(); + + // Read visited status (ends immutable borrow before mutations) + let status = shard + .map + .get(&key) + .map(|e| e.visited.load(Ordering::Relaxed)); + + match status { + Some(true) => { + // Second chance: clear visited bit, advance hand + if let Some(entry) = shard.map.get(&key) { + entry.visited.store(false, Ordering::Relaxed); + } + shard.hand += 1; + scanned += 1; + } + Some(false) => { + // Evict this entry + shard.map.remove(&key); + shard.order.remove(shard.hand); + if shard.hand >= shard.order.len() && !shard.order.is_empty() { + shard.hand = 0; + } + return; + } + None => { + // Stale entry (TTL-removed or otherwise gone from map) + shard.order.remove(shard.hand); + if shard.hand >= shard.order.len() && !shard.order.is_empty() { + shard.hand = 0; + } + // Don't increment scanned — order shifted, retry at same position + } + } + } + } + + #[inline(always)] + fn make_key<'py>( + py: Python<'py>, + args: &Bound<'py, PyTuple>, + kwargs: &Option>, + ) -> PyResult { + let key_obj: Py = match kwargs { + Some(ref kw) if !kw.is_empty() => { + let builtins = py.import("builtins")?; + let items = kw.call_method0("items")?; + let sorted_items = builtins.call_method1("sorted", (items,))?; + let kw_tup = builtins.getattr("tuple")?.call1((sorted_items,))?; + let combined = PyTuple::new(py, [args.as_any().clone(), kw_tup])?; + combined.unbind().into() + } + _ => args.clone().unbind().into(), + }; + CacheKey::new(py, key_obj) + } +} + #[pymethods] impl CachedFunction { #[new] - #[pyo3(signature = (fn_obj, strategy, max_size, ttl=None))] - fn new(fn_obj: Py, strategy: u8, max_size: usize, ttl: Option) -> Self { - let strat = match strategy { - 0 => 
StrategyEnum::Lru(LruStrategy::new(max_size)), - 1 => StrategyEnum::Mru(MruStrategy::new(max_size)), - 2 => StrategyEnum::Fifo(FifoStrategy::new(max_size)), - 3 => StrategyEnum::Lfu(LfuStrategy::new(max_size)), - _ => StrategyEnum::Lru(LruStrategy::new(max_size)), - }; - let ttl_dur = ttl.map(Duration::from_secs_f64); + #[pyo3(signature = (fn_obj, max_size, ttl=None))] + fn new(fn_obj: Py, max_size: usize, ttl: Option) -> Self { + let n_shards = (max_size / MIN_SHARD_SIZE).clamp(1, MAX_SHARDS); + let per_shard = max_size.div_ceil(n_shards); + let shards: Vec> = (0..n_shards) + .map(|_| { + RwLock::new(Shard { + map: HashMap::with_capacity(per_shard), + order: VecDeque::with_capacity(per_shard), + hand: 0, + capacity: per_shard, + }) + }) + .collect(); CachedFunction { fn_obj, - inner: RwLock::new(CacheStoreInner { - strategy: strat, - ttl: ttl_dur, - }), - access_log: Mutex::new(Vec::with_capacity(ACCESS_LOG_CAPACITY)), + shards: shards.into_boxed_slice(), + n_shards, + ttl: ttl.map(Duration::from_secs_f64), + max_size, hits: AtomicU64::new(0), misses: AtomicU64::new(0), } @@ -99,7 +165,7 @@ impl CachedFunction { args: Bound<'py, PyTuple>, kwargs: Option>, ) -> PyResult> { - // Build the cache key (inlined for performance on the hot path) + // Build the cache key let key_obj: Py = match kwargs { Some(ref kw) if !kw.is_empty() => { let builtins = py.import("builtins")?; @@ -112,54 +178,42 @@ impl CachedFunction { _ => args.clone().unbind().into(), }; let cache_key = CacheKey::new(py, key_obj)?; + let shard_idx = cache_key.shard_index(self.n_shards); - // FAST PATH: read lock — cache hit + // FAST PATH: read lock on one shard { - let inner = self.inner.read(); - if let Some(entry) = inner.strategy.peek(&cache_key) { - if let Some(ttl) = inner.ttl { - if entry.created_at.elapsed() > ttl { - // Expired — fall through to slow path (can't remove under read lock) - drop(inner); - } else { + let shard = self.shards[shard_idx].read(); + if let Some(entry) = 
shard.map.get(&cache_key) { + if let Some(ttl) = self.ttl { + if entry.created_at.elapsed() <= ttl { + entry.visited.store(true, Ordering::Relaxed); let val = entry.value.clone_ref(py); - drop(inner); + drop(shard); self.hits.fetch_add(1, Ordering::Relaxed); - let mut log = self.access_log.lock(); - if log.len() < ACCESS_LOG_CAPACITY { - log.push(cache_key); - } return Ok(val); } + // Expired — fall through to miss path } else { + entry.visited.store(true, Ordering::Relaxed); let val = entry.value.clone_ref(py); - drop(inner); + drop(shard); self.hits.fetch_add(1, Ordering::Relaxed); - let mut log = self.access_log.lock(); - if log.len() < ACCESS_LOG_CAPACITY { - log.push(cache_key); - } return Ok(val); } } } - // Cache miss: call the wrapped function (outside any lock) + // Cache miss: call the wrapped function (no lock held) let result = self.fn_obj.bind(py).call(args, kwargs.as_ref())?.unbind(); - // SLOW PATH: write lock — drain access log + insert + // SLOW PATH: write lock, double-check, evict if needed, insert { - let mut inner = self.inner.write(); - - // Drain deferred access log - let mut log = self.access_log.lock(); - inner.drain_access_log(&mut log); - drop(log); + let mut shard = self.shards[shard_idx].write(); // Double-check: another thread may have inserted while we were computing - let needs_insert = match inner.strategy.peek(&cache_key) { + let needs_insert = match shard.map.get(&cache_key) { Some(entry) => { - if let Some(ttl) = inner.ttl { + if let Some(ttl) = self.ttl { entry.created_at.elapsed() > ttl } else { false @@ -169,14 +223,24 @@ impl CachedFunction { }; if needs_insert { - // Remove expired entry if present - inner.strategy.remove(&cache_key); - let entry = CacheEntry { + // Remove expired entry from map if present (order cleaned lazily) + shard.map.remove(&cache_key); + + // Evict if at capacity + while shard.map.len() >= shard.capacity { + Self::evict_one(&mut shard); + if shard.order.is_empty() { + break; + } + } + + let entry = 
SieveEntry { value: result.clone_ref(py), created_at: Instant::now(), - frequency: 0, + visited: AtomicBool::new(false), }; - inner.strategy.insert(cache_key, entry); + shard.map.insert(cache_key.clone(), entry); + shard.order.push_back(cache_key); } } @@ -193,62 +257,25 @@ impl CachedFunction { kwargs: Option>, ) -> PyResult>> { let cache_key = Self::make_key(py, &args, &kwargs)?; + let shard_idx = cache_key.shard_index(self.n_shards); - // FAST PATH: read lock - { - let inner = self.inner.read(); - if let Some(entry) = inner.strategy.peek(&cache_key) { - if let Some(ttl) = inner.ttl { - if entry.created_at.elapsed() > ttl { - // Expired — need write lock to remove - drop(inner); - } else { - let val = entry.value.clone_ref(py); - drop(inner); - self.hits.fetch_add(1, Ordering::Relaxed); - let mut log = self.access_log.lock(); - if log.len() < ACCESS_LOG_CAPACITY { - log.push(cache_key); - } - return Ok(Some(val)); - } - } else { - let val = entry.value.clone_ref(py); - drop(inner); - self.hits.fetch_add(1, Ordering::Relaxed); - let mut log = self.access_log.lock(); - if log.len() < ACCESS_LOG_CAPACITY { - log.push(cache_key); - } - return Ok(Some(val)); + let shard = self.shards[shard_idx].read(); + if let Some(entry) = shard.map.get(&cache_key) { + if let Some(ttl) = self.ttl { + if entry.created_at.elapsed() > ttl { + drop(shard); + self.misses.fetch_add(1, Ordering::Relaxed); + return Ok(None); } } + entry.visited.store(true, Ordering::Relaxed); + let val = entry.value.clone_ref(py); + drop(shard); + self.hits.fetch_add(1, Ordering::Relaxed); + return Ok(Some(val)); } - // SLOW PATH: write lock for expired removal - { - let mut inner = self.inner.write(); - let mut log = self.access_log.lock(); - inner.drain_access_log(&mut log); - drop(log); - - // Check again under write lock - if let Some(entry) = inner.strategy.peek(&cache_key) { - if let Some(ttl) = inner.ttl { - if entry.created_at.elapsed() > ttl { - inner.strategy.remove(&cache_key); - 
self.misses.fetch_add(1, Ordering::Relaxed); - return Ok(None); - } - } - // Hit (possibly inserted by another thread) - let val = entry.value.clone_ref(py); - inner.strategy.record_access(&cache_key); - self.hits.fetch_add(1, Ordering::Relaxed); - return Ok(Some(val)); - } - } - + drop(shard); self.misses.fetch_add(1, Ordering::Relaxed); Ok(None) } @@ -263,59 +290,59 @@ impl CachedFunction { kwargs: Option>, ) -> PyResult<()> { let cache_key = Self::make_key(py, &args, &kwargs)?; - let mut inner = self.inner.write(); + let shard_idx = cache_key.shard_index(self.n_shards); - // Drain deferred access log - let mut log = self.access_log.lock(); - inner.drain_access_log(&mut log); - drop(log); + let mut shard = self.shards[shard_idx].write(); + + if shard.map.get(&cache_key).is_none() { + // New key: evict if needed, then insert + while shard.map.len() >= shard.capacity { + Self::evict_one(&mut shard); + if shard.order.is_empty() { + break; + } + } + let entry = SieveEntry { + value: value.clone_ref(py), + created_at: Instant::now(), + visited: AtomicBool::new(false), + }; + shard.map.insert(cache_key.clone(), entry); + shard.order.push_back(cache_key); + } else { + // Existing key: update value in place + let entry = SieveEntry { + value: value.clone_ref(py), + created_at: Instant::now(), + visited: AtomicBool::new(false), + }; + shard.map.insert(cache_key, entry); + } - let entry = CacheEntry { - value: value.clone_ref(py), - created_at: Instant::now(), - frequency: 0, - }; - inner.strategy.insert(cache_key, entry); Ok(()) } fn cache_info(&self) -> CacheInfo { - let inner = self.inner.read(); + let mut current_size = 0; + for shard in self.shards.iter() { + current_size += shard.read().map.len(); + } CacheInfo { hits: self.hits.load(Ordering::Relaxed), misses: self.misses.load(Ordering::Relaxed), - max_size: inner.strategy.capacity(), - current_size: inner.strategy.len(), + max_size: self.max_size, + current_size, } } fn cache_clear(&self) { - let mut inner = 
self.inner.write(); - inner.strategy.clear(); - self.access_log.lock().clear(); + for shard in self.shards.iter() { + let mut s = shard.write(); + s.map.clear(); + s.order.clear(); + s.hand = 0; + } self.hits.store(0, Ordering::Relaxed); self.misses.store(0, Ordering::Relaxed); } } - -impl CachedFunction { - #[inline(always)] - fn make_key<'py>( - py: Python<'py>, - args: &Bound<'py, PyTuple>, - kwargs: &Option>, - ) -> PyResult { - let key_obj: Py = match kwargs { - Some(ref kw) if !kw.is_empty() => { - let builtins = py.import("builtins")?; - let items = kw.call_method0("items")?; - let sorted_items = builtins.call_method1("sorted", (items,))?; - let kw_tup = builtins.getattr("tuple")?.call1((sorted_items,))?; - let combined = PyTuple::new(py, [args.as_any().clone(), kw_tup])?; - combined.unbind().into() - } - _ => args.clone().unbind().into(), - }; - CacheKey::new(py, key_obj) - } -} diff --git a/src/strategies/fifo.rs b/src/strategies/fifo.rs deleted file mode 100644 index 9eaf3af..0000000 --- a/src/strategies/fifo.rs +++ /dev/null @@ -1,58 +0,0 @@ -use hashlink::LinkedHashMap; - -use crate::entry::CacheEntry; -use crate::key::CacheKey; -use crate::strategies::EvictionStrategy; - -pub struct FifoStrategy { - map: LinkedHashMap, - capacity: usize, -} - -impl FifoStrategy { - pub fn new(capacity: usize) -> Self { - Self { - map: LinkedHashMap::new(), - capacity, - } - } -} - -impl EvictionStrategy for FifoStrategy { - fn insert(&mut self, key: CacheKey, entry: CacheEntry) { - if self.map.contains_key(&key) { - // Replace existing without changing order - self.map.replace(key, entry); - return; - } - if self.map.len() >= self.capacity { - // Evict oldest (front) - self.map.pop_front(); - } - self.map.insert(key, entry); - } - - fn peek(&self, key: &CacheKey) -> Option<&CacheEntry> { - self.map.get(key) - } - - fn record_access(&mut self, _key: &CacheKey) { - // FIFO: no reordering on access - } - - fn remove(&mut self, key: &CacheKey) -> Option { - 
self.map.remove(key) - } - - fn len(&self) -> usize { - self.map.len() - } - - fn clear(&mut self) { - self.map.clear(); - } - - fn capacity(&self) -> usize { - self.capacity - } -} diff --git a/src/strategies/lfu.rs b/src/strategies/lfu.rs deleted file mode 100644 index 8e85884..0000000 --- a/src/strategies/lfu.rs +++ /dev/null @@ -1,141 +0,0 @@ -use std::collections::{BTreeSet, HashMap}; -use std::time::Instant; - -use crate::entry::CacheEntry; -use crate::key::CacheKey; -use crate::strategies::EvictionStrategy; - -/// Ordering key for the frequency index. -/// Lower frequency evicted first; ties broken by oldest creation time, then unique id. -#[derive(Clone)] -struct FreqKey { - frequency: u64, - created_at_nanos: u128, - unique_id: u64, - cache_key: CacheKey, -} - -impl PartialEq for FreqKey { - fn eq(&self, other: &Self) -> bool { - self.frequency == other.frequency - && self.created_at_nanos == other.created_at_nanos - && self.unique_id == other.unique_id - } -} - -impl Eq for FreqKey {} - -impl PartialOrd for FreqKey { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for FreqKey { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.frequency - .cmp(&other.frequency) - .then_with(|| self.created_at_nanos.cmp(&other.created_at_nanos)) - .then_with(|| self.unique_id.cmp(&other.unique_id)) - } -} - -pub struct LfuStrategy { - map: HashMap, - index: BTreeSet, - epoch: Instant, - capacity: usize, - next_id: u64, -} - -impl LfuStrategy { - pub fn new(capacity: usize) -> Self { - Self { - map: HashMap::new(), - index: BTreeSet::new(), - epoch: Instant::now(), - capacity, - next_id: 0, - } - } - - fn alloc_id(&mut self) -> u64 { - let id = self.next_id; - self.next_id += 1; - id - } -} - -impl EvictionStrategy for LfuStrategy { - fn insert(&mut self, key: CacheKey, entry: CacheEntry) { - if let Some((_, old_fk)) = self.map.remove(&key) { - self.index.remove(&old_fk); - } else if self.map.len() >= self.capacity { - 
if let Some(victim_fk) = self.index.iter().next().cloned() { - self.index.remove(&victim_fk); - self.map.remove(&victim_fk.cache_key); - } - } - let id = self.alloc_id(); - let fk = FreqKey { - frequency: entry.frequency, - created_at_nanos: entry.created_at.duration_since(self.epoch).as_nanos(), - unique_id: id, - cache_key: key.clone(), - }; - self.index.insert(fk.clone()); - self.map.insert(key, (entry, fk)); - } - - fn peek(&self, key: &CacheKey) -> Option<&CacheEntry> { - self.map.get(key).map(|(entry, _)| entry) - } - - fn record_access(&mut self, key: &CacheKey) { - if !self.map.contains_key(key) { - return; - } - - // Remove old index entry - let (_, old_fk) = &self.map[key]; - let old_fk = old_fk.clone(); - self.index.remove(&old_fk); - - let id = self.alloc_id(); - - // Bump frequency, build new FreqKey - let (entry, stored_fk) = self.map.get_mut(key).unwrap(); - entry.frequency += 1; - - let new_fk = FreqKey { - frequency: entry.frequency, - created_at_nanos: entry.created_at.duration_since(self.epoch).as_nanos(), - unique_id: id, - cache_key: key.clone(), - }; - self.index.insert(new_fk.clone()); - *stored_fk = new_fk; - } - - fn remove(&mut self, key: &CacheKey) -> Option { - if let Some((entry, fk)) = self.map.remove(key) { - self.index.remove(&fk); - Some(entry) - } else { - None - } - } - - fn len(&self) -> usize { - self.map.len() - } - - fn clear(&mut self) { - self.map.clear(); - self.index.clear(); - } - - fn capacity(&self) -> usize { - self.capacity - } -} diff --git a/src/strategies/lru.rs b/src/strategies/lru.rs deleted file mode 100644 index 6d6dce1..0000000 --- a/src/strategies/lru.rs +++ /dev/null @@ -1,51 +0,0 @@ -use hashlink::LruCache; - -use crate::entry::CacheEntry; -use crate::key::CacheKey; -use crate::strategies::EvictionStrategy; - -pub struct LruStrategy { - cache: LruCache, - cap: usize, -} - -impl LruStrategy { - pub fn new(capacity: usize) -> Self { - Self { - cache: LruCache::new(capacity), - cap: capacity, - } - } -} - 
-impl EvictionStrategy for LruStrategy { - fn insert(&mut self, key: CacheKey, entry: CacheEntry) { - // LruCache handles eviction automatically when at capacity - self.cache.insert(key, entry); - } - - fn peek(&self, key: &CacheKey) -> Option<&CacheEntry> { - self.cache.peek(key) - } - - fn record_access(&mut self, key: &CacheKey) { - // Touches the entry, moving it to the back of the LRU list - self.cache.get(key); - } - - fn remove(&mut self, key: &CacheKey) -> Option { - self.cache.remove(key) - } - - fn len(&self) -> usize { - self.cache.len() - } - - fn clear(&mut self) { - self.cache.clear(); - } - - fn capacity(&self) -> usize { - self.cap - } -} diff --git a/src/strategies/mod.rs b/src/strategies/mod.rs deleted file mode 100644 index 7e6bb89..0000000 --- a/src/strategies/mod.rs +++ /dev/null @@ -1,97 +0,0 @@ -pub mod fifo; -pub mod lfu; -pub mod lru; -pub mod mru; - -use crate::entry::CacheEntry; -use crate::key::CacheKey; - -pub trait EvictionStrategy: Send + Sync { - fn insert(&mut self, key: CacheKey, entry: CacheEntry); - fn peek(&self, key: &CacheKey) -> Option<&CacheEntry>; - fn record_access(&mut self, key: &CacheKey); - fn remove(&mut self, key: &CacheKey) -> Option; - fn len(&self) -> usize; - fn clear(&mut self); - fn capacity(&self) -> usize; -} - -/// Concrete enum wrapping all strategies — enables devirtualization + inlining. 
-pub enum StrategyEnum { - Lru(lru::LruStrategy), - Mru(mru::MruStrategy), - Fifo(fifo::FifoStrategy), - Lfu(lfu::LfuStrategy), -} - -impl StrategyEnum { - #[inline(always)] - pub fn insert(&mut self, key: CacheKey, entry: CacheEntry) { - match self { - Self::Lru(s) => s.insert(key, entry), - Self::Mru(s) => s.insert(key, entry), - Self::Fifo(s) => s.insert(key, entry), - Self::Lfu(s) => s.insert(key, entry), - } - } - - #[inline(always)] - pub fn peek(&self, key: &CacheKey) -> Option<&CacheEntry> { - match self { - Self::Lru(s) => s.peek(key), - Self::Mru(s) => s.peek(key), - Self::Fifo(s) => s.peek(key), - Self::Lfu(s) => s.peek(key), - } - } - - #[inline(always)] - pub fn record_access(&mut self, key: &CacheKey) { - match self { - Self::Lru(s) => s.record_access(key), - Self::Mru(s) => s.record_access(key), - Self::Fifo(s) => s.record_access(key), - Self::Lfu(s) => s.record_access(key), - } - } - - #[inline(always)] - pub fn remove(&mut self, key: &CacheKey) -> Option { - match self { - Self::Lru(s) => s.remove(key), - Self::Mru(s) => s.remove(key), - Self::Fifo(s) => s.remove(key), - Self::Lfu(s) => s.remove(key), - } - } - - #[inline(always)] - pub fn len(&self) -> usize { - match self { - Self::Lru(s) => s.len(), - Self::Mru(s) => s.len(), - Self::Fifo(s) => s.len(), - Self::Lfu(s) => s.len(), - } - } - - #[inline(always)] - pub fn clear(&mut self) { - match self { - Self::Lru(s) => s.clear(), - Self::Mru(s) => s.clear(), - Self::Fifo(s) => s.clear(), - Self::Lfu(s) => s.clear(), - } - } - - #[inline(always)] - pub fn capacity(&self) -> usize { - match self { - Self::Lru(s) => s.capacity(), - Self::Mru(s) => s.capacity(), - Self::Fifo(s) => s.capacity(), - Self::Lfu(s) => s.capacity(), - } - } -} diff --git a/src/strategies/mru.rs b/src/strategies/mru.rs deleted file mode 100644 index d062f93..0000000 --- a/src/strategies/mru.rs +++ /dev/null @@ -1,62 +0,0 @@ -use hashlink::LinkedHashMap; - -use crate::entry::CacheEntry; -use crate::key::CacheKey; -use 
crate::strategies::EvictionStrategy; - -/// MRU: evicts the most recently used entry. -/// On access, the entry moves to the back. On eviction, remove the back. -pub struct MruStrategy { - map: LinkedHashMap, - capacity: usize, -} - -impl MruStrategy { - pub fn new(capacity: usize) -> Self { - Self { - map: LinkedHashMap::new(), - capacity, - } - } -} - -impl EvictionStrategy for MruStrategy { - fn insert(&mut self, key: CacheKey, entry: CacheEntry) { - if self.map.contains_key(&key) { - self.map.remove(&key); - } else if self.map.len() >= self.capacity { - // Evict most recently used (back) - self.map.pop_back(); - } - // Insert at back (most recent position) - self.map.insert(key, entry); - } - - fn peek(&self, key: &CacheKey) -> Option<&CacheEntry> { - self.map.get(key) - } - - fn record_access(&mut self, key: &CacheKey) { - // Move to back (most recent) by removing and re-inserting - if let Some(entry) = self.map.remove(key) { - let key_clone = key.clone(); - self.map.insert(key_clone, entry); - } - } - - fn remove(&mut self, key: &CacheKey) -> Option { - self.map.remove(key) - } - - fn len(&self) -> usize { - self.map.len() - } - - fn clear(&mut self) { - self.map.clear(); - } - - fn capacity(&self) -> usize { - self.capacity - } -} diff --git a/tests/test_async.py b/tests/test_async.py index 52cf780..2c2ddfd 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -3,7 +3,7 @@ import pytest -from warp_cache import Strategy, cache +from warp_cache import cache from warp_cache._decorator import AsyncCachedFunction # ── Basic hit/miss ──────────────────────────────────────────────────────── @@ -13,7 +13,7 @@ async def test_async_basic_hit_miss(): call_count = 0 - @cache(strategy=Strategy.LRU, max_size=128) + @cache(max_size=128) async def add(a, b): nonlocal call_count call_count += 1 @@ -123,22 +123,20 @@ async def slow_fn(x): assert call_count >= 3 -# ── Strategies ──────────────────────────────────────────────────────────── +# ── Eviction 
────────────────────────────────────────────────────────────── @pytest.mark.asyncio -async def test_async_strategies(): - for strat in [Strategy.LRU, Strategy.MRU, Strategy.FIFO, Strategy.LFU]: - - @cache(strategy=strat, max_size=2) - async def fn(x): - return x - - assert await fn(1) == 1 - assert await fn(2) == 2 - assert await fn(3) == 3 # triggers eviction - info = fn.cache_info() - assert info.current_size == 2 +async def test_async_eviction(): + @cache(max_size=2) + async def fn(x): + return x + + assert await fn(1) == 1 + assert await fn(2) == 2 + assert await fn(3) == 3 # triggers eviction + info = fn.cache_info() + assert info.current_size == 2 # ── Shared backend ─────────────────────────────────────────────────────── diff --git a/tests/test_basic.py b/tests/test_basic.py index 4e92302..c1591eb 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,10 +1,10 @@ -from warp_cache import Strategy, cache +from warp_cache import cache def test_basic_hit_miss(): call_count = 0 - @cache(strategy=Strategy.LRU, max_size=128) + @cache(max_size=128) def add(a, b): nonlocal call_count call_count += 1 @@ -81,7 +81,7 @@ def greet(name, greeting="hello"): def test_eviction_at_capacity(): - @cache(strategy=Strategy.LRU, max_size=3) + @cache(max_size=3) def identity(x): return x @@ -91,11 +91,7 @@ def identity(x): info = identity.cache_info() assert info.current_size == 3 - # Adding a 4th should evict the oldest (1) + # Adding a 4th should evict one entry (SIEVE picks an unvisited one) identity(4) info = identity.cache_info() assert info.current_size == 3 - - # 1 should be a miss now - identity(1) - assert identity.cache_info().misses == 5 # 1,2,3,4 were misses, then 1 again diff --git a/tests/test_shared_basic.py b/tests/test_shared_basic.py index d01c507..0496c3f 100644 --- a/tests/test_shared_basic.py +++ b/tests/test_shared_basic.py @@ -5,7 +5,7 @@ import os import tempfile -from warp_cache import SharedCacheInfo, Strategy, cache +from warp_cache import 
SharedCacheInfo, cache def _cleanup_shm(): @@ -28,7 +28,7 @@ def teardown_method(self): def test_basic_hit_miss(self): call_count = 0 - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(x): nonlocal call_count call_count += 1 @@ -51,7 +51,7 @@ def fn(x): def test_cache_clear(self): call_count = 0 - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(x): nonlocal call_count call_count += 1 @@ -71,7 +71,7 @@ def fn(x): assert call_count == 3 # re-computed def test_none_return_value(self): - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(x): return None @@ -80,7 +80,7 @@ def fn(x): assert fn.cache_info().hits == 1 def test_kwargs(self): - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(a, b): return a + b @@ -89,7 +89,7 @@ def fn(a, b): assert fn.cache_info().hits == 1 def test_eviction_at_capacity(self): - @cache(strategy=Strategy.LRU, max_size=4, backend="shared") + @cache(max_size=4, backend="shared") def fn(x): return x @@ -97,18 +97,12 @@ def fn(x): fn(i) assert fn.cache_info().current_size == 4 - # This should evict key 0 (LRU) + # This should evict an unvisited entry (SIEVE) fn(99) assert fn.cache_info().current_size == 4 - # key 0 was evicted, so calling it again is a miss - fn(0) - info = fn.cache_info() - assert info.misses == 6 # 4 initial + 99 + 0 re-miss - def test_oversize_skip(self): @cache( - strategy=Strategy.LRU, max_size=128, backend="shared", max_key_size=16, @@ -131,7 +125,7 @@ def fn(x): def test_fast_path_types(self): """All fast-path primitive types should cache correctly.""" - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(x): return x @@ -144,7 +138,7 @@ def fn(x): def test_fast_path_tuple_keys(self): """Tuples of primitives should use 
fast-path serialization.""" - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(a, b): return a + b @@ -153,7 +147,7 @@ def fn(a, b): assert fn.cache_info().hits == 1 def test_shared_cache_info_repr(self): - @cache(strategy=Strategy.LRU, max_size=64, backend="shared") + @cache(max_size=64, backend="shared") def fn(x): return x @@ -165,79 +159,113 @@ def fn(x): assert "misses=1" in r -class TestSharedStrategies: +class TestSharedSieve: + """Test SIEVE eviction behavior in the shared memory backend.""" + def setup_method(self): _cleanup_shm() def teardown_method(self): _cleanup_shm() - def test_lru_eviction(self): - @cache(strategy=Strategy.LRU, max_size=3, backend="shared") + def test_unvisited_evicted_first(self): + """SIEVE: unvisited entries are evicted before visited ones.""" + call_count = 0 + + @cache(max_size=3, backend="shared") def fn(x): + nonlocal call_count + call_count += 1 return x - fn(1) - fn(2) - fn(3) - fn(1) # touch 1, making 2 the LRU - fn(4) # evict 2 - assert fn.cache_info().current_size == 3 + fn(1) # miss, inserted (unvisited) + fn(2) # miss, inserted (unvisited) + fn(3) # miss, inserted (unvisited) + assert call_count == 3 - # 2 was evicted - fn(2) - assert fn.cache_info().misses == 5 # 1,2,3,4 + re-miss on 2 + # Access 2 and 3 — marks them as visited + fn(2) # hit → visited=true + fn(3) # hit → visited=true + assert call_count == 3 - def test_fifo_eviction(self): - @cache(strategy=Strategy.FIFO, max_size=3, backend="shared") + # Insert 4 — must evict. 
1 is unvisited, should be evicted + fn(4) # miss, evicts 1 + assert call_count == 4 + + # Verify: 1 was evicted (miss), 2 and 3 survive (hit) + call_count = 0 + fn(2) # hit + assert call_count == 0 + fn(3) # hit + assert call_count == 0 + fn(1) # miss — was evicted + assert call_count == 1 + + def test_second_chance(self): + """SIEVE: visited entries get their visited bit cleared (second chance) + and are only evicted on a subsequent pass if still unvisited.""" + call_count = 0 + + @cache(max_size=2, backend="shared") def fn(x): + nonlocal call_count + call_count += 1 return x - fn(1) - fn(2) - fn(3) - fn(1) # touch 1 — FIFO doesn't reorder - fn(4) # evict 1 (first inserted) - assert fn.cache_info().current_size == 3 + fn(1) # miss + fn(2) # miss + assert call_count == 2 - # 1 was evicted - fn(1) - assert fn.cache_info().misses == 5 + # Visit both entries + fn(1) # hit → visited=true + fn(2) # hit → visited=true + + # Insert 3 — all entries visited, so the hand scans and clears visited bits, + # then evicts the first entry it finds unvisited on the second pass + fn(3) # miss, evicts one of {1, 2} + assert call_count == 3 + + info = fn.cache_info() + assert info.current_size == 2 + + def test_eviction_respects_capacity(self): + """Cache never exceeds max_size.""" - def test_mru_eviction(self): - @cache(strategy=Strategy.MRU, max_size=3, backend="shared") + @cache(max_size=5, backend="shared") def fn(x): return x - fn(1) - fn(2) - fn(3) - fn(2) # touch 2, making it most recently used - fn(4) # evict 2 (MRU) - assert fn.cache_info().current_size == 3 + for i in range(100): + fn(i) + info = fn.cache_info() + assert info.current_size <= 5 - # 2 was evicted - fn(2) - assert fn.cache_info().misses == 5 + def test_hit_sets_visited(self): + """A cache hit marks the entry as visited, protecting it from eviction.""" + call_count = 0 - def test_lfu_eviction(self): - @cache(strategy=Strategy.LFU, max_size=3, backend="shared") + @cache(max_size=3, backend="shared") def fn(x): + 
nonlocal call_count + call_count += 1 return x - fn(1) - fn(2) - fn(3) - fn(1) - fn(1) # freq(1) = 2 - fn(2) # freq(2) = 1 - # freq(3) = 0 — least frequent - fn(4) # evict 3 (lowest frequency) - assert fn.cache_info().current_size == 3 + fn(1) # miss + fn(2) # miss + fn(3) # miss + # All entries are unvisited + + # Visit entry 1 + fn(1) # hit → visited=true - # 3 was evicted - fn(3) - assert fn.cache_info().misses == 5 + # Insert 4 — evicts an unvisited entry (2 or 3), not 1 + fn(4) # miss + assert call_count == 4 + + # Entry 1 should still be cached + call_count = 0 + fn(1) # hit + assert call_count == 0 class TestSharedTTL: @@ -250,7 +278,7 @@ def teardown_method(self): def test_ttl_expiry(self): import time - @cache(strategy=Strategy.LRU, max_size=128, ttl=0.1, backend="shared") + @cache(max_size=128, ttl=0.1, backend="shared") def fn(x): return x * 2 @@ -264,7 +292,7 @@ def fn(x): assert fn.cache_info().misses == 2 def test_ttl_not_expired(self): - @cache(strategy=Strategy.LRU, max_size=128, ttl=10.0, backend="shared") + @cache(max_size=128, ttl=10.0, backend="shared") def fn(x): return x * 2 @@ -285,7 +313,7 @@ def teardown_method(self): def test_default_is_memory(self): from warp_cache._warp_cache_rs import CacheInfo - @cache(strategy=Strategy.LRU, max_size=128) + @cache(max_size=128) def fn(x): return x @@ -294,7 +322,7 @@ def fn(x): assert isinstance(info, CacheInfo) def test_shared_returns_shared_info(self): - @cache(strategy=Strategy.LRU, max_size=128, backend="shared") + @cache(max_size=128, backend="shared") def fn(x): return x diff --git a/tests/test_shared_multiprocess.py b/tests/test_shared_multiprocess.py index e376b3a..2966560 100644 --- a/tests/test_shared_multiprocess.py +++ b/tests/test_shared_multiprocess.py @@ -30,7 +30,6 @@ def _cleanup_shm(): # Use a fixed shm_name so all processes (even with spawn) share the same cache _shared_fn = SharedCachedFunction( lambda x: x * x, - 0, 16, ttl=None, max_key_size=512, @@ -121,7 +120,6 @@ def 
test_cross_process_str_key_different_hashseed(self): # Parent writes a string-keyed entry parent_fn = SharedCachedFunction( lambda x: f"hello-{x}", - 0, 16, ttl=None, max_key_size=512, @@ -139,7 +137,7 @@ def test_cross_process_str_key_different_hashseed(self): fn = SharedCachedFunction( lambda x: f"hello-{{x}}", - 0, 16, ttl=None, + 16, ttl=None, max_key_size=512, max_value_size=4096, shm_name="{shm_name}", ) diff --git a/tests/test_shared_realistic.py b/tests/test_shared_realistic.py index 74e48c7..08d5e7b 100644 --- a/tests/test_shared_realistic.py +++ b/tests/test_shared_realistic.py @@ -10,7 +10,7 @@ import pytest -from warp_cache import Strategy, cache +from warp_cache import cache from warp_cache._warp_cache_rs import SharedCachedFunction @@ -43,7 +43,7 @@ def teardown_method(self): _cleanup_shm() def test_hit_miss_ratio(self): - @cache(strategy=Strategy.LRU, max_size=MAX_SIZE, backend="shared") + @cache(max_size=MAX_SIZE, backend="shared") def fn(n): return _expensive_compute(n) @@ -84,7 +84,6 @@ def fn(n): _shared_realistic_fn = SharedCachedFunction( _expensive_compute, - 0, MAX_SIZE, ttl=None, max_key_size=512, diff --git a/tests/test_strategies.py b/tests/test_strategies.py index 2d71366..b2b1f9b 100644 --- a/tests/test_strategies.py +++ b/tests/test_strategies.py @@ -1,120 +1,104 @@ -from warp_cache import Strategy, cache +from warp_cache import cache -def test_lru_eviction_order(): - """LRU: least recently used evicted first.""" +def test_sieve_unvisited_evicted_first(): + """SIEVE: unvisited entries are evicted before visited ones.""" call_count = 0 - @cache(strategy=Strategy.LRU, max_size=3) + @cache(max_size=3) def fn(x): nonlocal call_count call_count += 1 return x - fn(1) # miss. Cache order (LRU→MRU): [1] - fn(2) # miss. [1, 2] - fn(3) # miss. [1, 2, 3] - fn(1) # hit, promotes 1. 
[2, 3, 1] + fn(1) # miss, inserted (unvisited) + fn(2) # miss, inserted (unvisited) + fn(3) # miss, inserted (unvisited) assert call_count == 3 - fn(4) # miss, evicts 2 (LRU). [3, 1, 4] + # Access 2 and 3 — marks them as visited + fn(2) # hit → visited=true + fn(3) # hit → visited=true + assert call_count == 3 + + # Insert 4 — must evict. 1 is unvisited, should be evicted + fn(4) # miss, evicts 1 assert call_count == 4 - # Verify: 2 was evicted (miss), 1 and 3 are still present (hit) + # Verify: 1 was evicted (miss), 2 and 3 survive (hit) call_count = 0 - fn(1) # hit + fn(2) # hit assert call_count == 0 fn(3) # hit assert call_count == 0 - fn(2) # miss — was evicted + fn(1) # miss — was evicted assert call_count == 1 -def test_fifo_eviction_order(): - """FIFO: first inserted evicted first, access doesn't change order.""" +def test_sieve_second_chance(): + """SIEVE: visited entries get their visited bit cleared (second chance) + and are only evicted on a subsequent pass if still unvisited.""" call_count = 0 - @cache(strategy=Strategy.FIFO, max_size=3) + @cache(max_size=2) def fn(x): nonlocal call_count call_count += 1 return x - fn(1) # miss. Insertion order: [1] - fn(2) # miss. [1, 2] - fn(3) # miss. [1, 2, 3] - fn(1) # hit (FIFO doesn't reorder). Still [1, 2, 3] - assert call_count == 3 + fn(1) # miss + fn(2) # miss + assert call_count == 2 - fn(4) # miss, evicts 1 (oldest). 
[2, 3, 4] - assert call_count == 4 + # Visit both entries + fn(1) # hit → visited=true + fn(2) # hit → visited=true - # Verify: 1 was evicted (miss), 2 and 3 are still present (hit) - call_count = 0 - fn(2) # hit - assert call_count == 0 - fn(3) # hit - assert call_count == 0 - fn(1) # miss — was evicted - assert call_count == 1 + # Insert 3 — all entries visited, so the hand scans and clears visited bits, + # then evicts the first entry it finds unvisited on the second pass + fn(3) # miss, evicts one of {1, 2} + assert call_count == 3 + info = fn.cache_info() + assert info.current_size == 2 -def test_mru_eviction_order(): - """MRU: most recently used evicted first.""" - call_count = 0 - @cache(strategy=Strategy.MRU, max_size=3) +def test_sieve_eviction_respects_capacity(): + """Cache never exceeds max_size.""" + + @cache(max_size=5) def fn(x): - nonlocal call_count - call_count += 1 return x - fn(1) # miss. [1] - fn(2) # miss. [1, 2] - fn(3) # miss. [1, 2, 3] - fn(2) # hit, 2 becomes most recent. [1, 3, 2] - assert call_count == 3 + for i in range(100): + fn(i) + info = fn.cache_info() + assert info.current_size <= 5 - fn(4) # miss, evicts 2 (MRU). 
[1, 3, 4] - assert call_count == 4 - # Verify: 2 was evicted (miss), 1 and 3 are still present (hit) +def test_sieve_hit_sets_visited(): + """A cache hit marks the entry as visited, protecting it from eviction.""" call_count = 0 - fn(1) # hit - assert call_count == 0 - fn(3) # hit - assert call_count == 0 - fn(2) # miss — was evicted - assert call_count == 1 - -def test_lfu_eviction_order(): - """LFU: least frequently used evicted first.""" - call_count = 0 - - @cache(strategy=Strategy.LFU, max_size=3) + @cache(max_size=3) def fn(x): nonlocal call_count call_count += 1 return x - fn(1) # miss, freq(1)=0 - fn(2) # miss, freq(2)=0 - fn(3) # miss, freq(3)=0 - fn(1) # hit, freq(1)=1 - fn(1) # hit, freq(1)=2 - fn(2) # hit, freq(2)=1 - # freqs: 1→2, 2→1, 3→0 - assert call_count == 3 + fn(1) # miss + fn(2) # miss + fn(3) # miss + # All entries are unvisited - fn(4) # miss, evicts 3 (lowest freq=0) + # Visit entry 1 + fn(1) # hit → visited=true + + # Insert 4 — evicts an unvisited entry (2 or 3), not 1 + fn(4) # miss assert call_count == 4 - # Verify: 3 was evicted (miss), 1 and 2 are still present (hit) + # Entry 1 should still be cached call_count = 0 fn(1) # hit assert call_count == 0 - fn(2) # hit - assert call_count == 0 - fn(3) # miss — was evicted - assert call_count == 1 diff --git a/tests/test_stress.py b/tests/test_stress.py index 1507b4e..a486c1c 100644 --- a/tests/test_stress.py +++ b/tests/test_stress.py @@ -5,27 +5,25 @@ import time from concurrent.futures import ThreadPoolExecutor -from warp_cache import Strategy, cache +from warp_cache import cache # --------------------------------------------------------------------------- -# 1. High-volume insert/get — 100k ops per strategy, verify correctness +# 1. 
High-volume insert/get — 100k ops, verify correctness # --------------------------------------------------------------------------- def test_high_volume(): - for strategy in Strategy: - - @cache(strategy=strategy, max_size=1024) - def fn(x): - return x * 3 + 1 + @cache(max_size=1024) + def fn(x): + return x * 3 + 1 - for i in range(100_000): - key = i % 2000 # 2000 unique keys, many repeats - assert fn(key) == key * 3 + 1 + for i in range(100_000): + key = i % 2000 # 2000 unique keys, many repeats + assert fn(key) == key * 3 + 1 - info = fn.cache_info() - assert info.hits + info.misses == 100_000 - assert info.current_size <= 1024 + info = fn.cache_info() + assert info.hits + info.misses == 100_000 + assert info.current_size <= 1024 # --------------------------------------------------------------------------- @@ -34,16 +32,14 @@ def fn(x): def test_eviction_churn(): - for strategy in Strategy: - - @cache(strategy=strategy, max_size=10) - def fn(x): - return x + @cache(max_size=10) + def fn(x): + return x - for i in range(10_000): - assert fn(i) == i - info = fn.cache_info() - assert info.current_size <= 10 + for i in range(10_000): + assert fn(i) == i + info = fn.cache_info() + assert info.current_size <= 10 # --------------------------------------------------------------------------- @@ -55,7 +51,7 @@ def test_heavy_contention(): call_count = 0 lock = threading.Lock() - @cache(strategy=Strategy.LRU, max_size=64) + @cache(max_size=64) def fn(x): nonlocal call_count with lock: @@ -90,7 +86,7 @@ def worker(): def test_ttl_under_load(): call_count = 0 - @cache(strategy=Strategy.LRU, max_size=256, ttl=0.05) + @cache(max_size=256, ttl=0.05) def fn(x): nonlocal call_count call_count += 1 @@ -121,7 +117,7 @@ def fn(x): def test_mixed_workload(): - @cache(strategy=Strategy.LFU, max_size=128) + @cache(max_size=128) def fn(x): return x + 1 diff --git a/tests/test_threading.py b/tests/test_threading.py index 47620f2..d5bb692 100644 --- a/tests/test_threading.py +++ 
b/tests/test_threading.py @@ -1,7 +1,7 @@ import threading from concurrent.futures import ThreadPoolExecutor -from warp_cache import Strategy, cache +from warp_cache import cache def test_concurrent_access(): @@ -9,7 +9,7 @@ def test_concurrent_access(): call_count = 0 lock = threading.Lock() - @cache(strategy=Strategy.LRU, max_size=128) + @cache(max_size=128) def slow_add(a, b): nonlocal call_count with lock: @@ -34,37 +34,10 @@ def worker(i): assert call_count < 800 -def test_concurrent_different_strategies(): - """Verify thread safety across all strategies.""" - for strategy in Strategy: - call_count = 0 - lock = threading.Lock() - - @cache(strategy=strategy, max_size=64) - def fn(x): - nonlocal call_count - with lock: - call_count += 1 - return x * x - - def worker(): - for i in range(100): - assert fn(i % 20) == (i % 20) ** 2 - - threads = [threading.Thread(target=worker) for _ in range(8)] - for t in threads: - t.start() - for t in threads: - t.join() - - info = fn.cache_info() - assert info.hits > 0, f"Expected hits for {strategy.name}" - - def test_concurrent_cache_clear(): """Test that cache_clear during concurrent access doesn't crash.""" - @cache(strategy=Strategy.LRU, max_size=128) + @cache(max_size=128) def fn(x): return x diff --git a/uv.lock b/uv.lock index d72ad5d..68101b4 100644 --- a/uv.lock +++ b/uv.lock @@ -416,7 +416,7 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371 } wheels = [ diff --git a/warp_cache/__init__.py b/warp_cache/__init__.py index 7389b79..8cfaac8 100644 --- 
a/warp_cache/__init__.py +++ b/warp_cache/__init__.py @@ -1,5 +1,5 @@ -from warp_cache._decorator import cache, lru_cache -from warp_cache._strategies import Backend, Strategy +from warp_cache._decorator import cache +from warp_cache._strategies import Backend from warp_cache._warp_cache_rs import CacheInfo, SharedCacheInfo -__all__ = ["Backend", "cache", "CacheInfo", "lru_cache", "SharedCacheInfo", "Strategy"] +__all__ = ["Backend", "cache", "CacheInfo", "SharedCacheInfo"] diff --git a/warp_cache/_decorator.py b/warp_cache/_decorator.py index c89592f..4e8db33 100644 --- a/warp_cache/_decorator.py +++ b/warp_cache/_decorator.py @@ -5,7 +5,7 @@ from collections.abc import Callable from typing import Any -from warp_cache._strategies import Backend, Strategy +from warp_cache._strategies import Backend from warp_cache._warp_cache_rs import ( CachedFunction, CacheInfo, @@ -68,7 +68,6 @@ def _resolve_backend(backend: str | int | Backend) -> Backend: def cache( - strategy: Strategy = Strategy.LRU, max_size: int = 128, ttl: float | None = None, backend: str | int | Backend = Backend.MEMORY, @@ -80,8 +79,10 @@ def cache( Supports both sync and async functions. The async detection happens once at decoration time — zero overhead on the sync path. + Uses SIEVE eviction — a simple, scan-resistant algorithm that provides + near-optimal hit rates with O(1) overhead per access. + Args: - strategy: Eviction strategy (LRU, MRU, FIFO, LFU). max_size: Maximum number of cached entries. ttl: Time-to-live in seconds (None = no expiry). backend: Backend.MEMORY (default) for in-process cache, @@ -89,15 +90,6 @@ def cache( Also accepts the strings "memory" and "shared". max_key_size: Max serialized key size in bytes (shared backend only). max_value_size: Max serialized value size in bytes (shared backend only). - - Note: - Eviction ordering (LRU, MRU, LFU) is approximate under sustained - hit-only workloads. 
The in-process backend batches ordering updates - (up to 64) under a read lock and replays them on the next write lock - acquisition. If the batch fills before a cache miss triggers a write - lock, additional ordering updates are dropped. In practice this - rarely affects eviction quality, but under pathological access - patterns the evicted entry may not be the theoretically optimal one. """ resolved_backend = _resolve_backend(backend) @@ -105,7 +97,6 @@ def decorator(fn): if resolved_backend == Backend.SHARED: inner = SharedCachedFunction( fn, - int(strategy), max_size, ttl=ttl, max_key_size=max_key_size if max_key_size is not None else 512, @@ -122,7 +113,7 @@ def decorator(fn): "max_value_size has no effect with the memory backend", stacklevel=2, ) - inner = CachedFunction(fn, int(strategy), max_size, ttl=ttl) + inner = CachedFunction(fn, max_size, ttl=ttl) if asyncio.iscoroutinefunction(fn): return AsyncCachedFunction(fn, inner) @@ -130,21 +121,3 @@ def decorator(fn): return inner return decorator - - -def lru_cache( - max_size: int = 128, - ttl: float | None = None, - backend: str | int | Backend = Backend.MEMORY, - max_key_size: int | None = None, - max_value_size: int | None = None, -) -> Callable[[Callable[..., Any]], CachedFunction | SharedCachedFunction | AsyncCachedFunction]: - """Shorthand for ``cache(strategy=Strategy.LRU, ...)``.""" - return cache( - strategy=Strategy.LRU, - max_size=max_size, - ttl=ttl, - backend=backend, - max_key_size=max_key_size, - max_value_size=max_value_size, - ) diff --git a/warp_cache/_strategies.py b/warp_cache/_strategies.py index de4a3fd..14d6b22 100644 --- a/warp_cache/_strategies.py +++ b/warp_cache/_strategies.py @@ -1,13 +1,6 @@ from enum import IntEnum -class Strategy(IntEnum): - LRU = 0 - MRU = 1 - FIFO = 2 - LFU = 3 - - class Backend(IntEnum): MEMORY = 0 SHARED = 1 diff --git a/warp_cache/_warp_cache_rs.pyi b/warp_cache/_warp_cache_rs.pyi index df5d675..6d4bdd1 100644 --- a/warp_cache/_warp_cache_rs.pyi +++ 
b/warp_cache/_warp_cache_rs.pyi @@ -29,7 +29,6 @@ class CachedFunction: def __init__( self, fn_obj: Callable[..., Any], - strategy: int, max_size: int, ttl: float | None = None, ) -> None: ... @@ -43,7 +42,6 @@ class SharedCachedFunction: def __init__( self, fn_obj: Callable[..., Any], - strategy: int, max_size: int, ttl: float | None = None, max_key_size: int = 512,