diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..3086180
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,81 @@
+# Keep the build context lean — anything not required to run the web
+# builder in production should be excluded so layer caching stays fast
+# and secrets/dev artifacts never end up in the image.
+
+# VCS / CI
+.git/
+.gitignore
+.gitattributes
+.github/
+.gitleaks.toml
+
+# Claude Code worktrees and local config
+.claude/
+
+# Virtualenvs
+.venv/
+venv/
+env/
+ENV/
+
+# Python caches
+__pycache__/
+**/__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+dist/
+*.egg-info/
+*.egg
+.eggs/
+
+# Tests, coverage, type/lint caches
+tests/
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.coverage
+.coverage.*
+htmlcov/
+.tox/
+coverage.xml
+
+# Local secrets — never bake into an image
+.env
+.env.*
+!.env.example
+
+# Editor
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# macOS / Windows cruft
+.DS_Store
+.AppleDouble
+.LSOverride
+Thumbs.db
+
+# Tool outputs not needed at runtime
+output/
+*.log
+
+# Docs and dev tooling not used by the runtime
+docs/
+examples/
+CHANGELOG.md
+CONTRIBUTING.md
+SECURITY.md
+.pre-commit-config.yaml
+.pip-audit-allowlist.txt
+.secrets.baseline
+scripts/
+Makefile
+
+# Container artefacts that should not nest into the image itself
+Dockerfile
+.dockerignore
+fly.toml
diff --git a/.github/workflows/fly-deploy.yml b/.github/workflows/fly-deploy.yml
new file mode 100644
index 0000000..b0c246e
--- /dev/null
+++ b/.github/workflows/fly-deploy.yml
@@ -0,0 +1,18 @@
+# See https://fly.io/docs/app-guides/continuous-deployment-with-github-actions/
+
+name: Fly Deploy
+on:
+  push:
+    branches:
+      - main
+jobs:
+  deploy:
+    name: Deploy app
+    runs-on: ubuntu-latest
+    concurrency: deploy-group    # optional: ensure only one action runs at a time
+    steps:
+      - uses: actions/checkout@v4
+      - uses: superfly/flyctl-actions/setup-flyctl@master
+      - run: flyctl deploy --remote-only --build-arg GIT_SHA=${{ github.sha }}  # stamps GET /version
+        env:
+          FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 48e9367..af87f0b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,45 @@ Versioning follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [Unreleased]
 
 ### Added
+- **Per-IP rate limit on `POST /generate`** (10 requests / hour /
+  client IP, sliding window). Caps abuse cost at the
+  most-expensive route — each `/generate` triggers an Anthropic
+  narrative call costing ~$0.10–$0.30, so without a cap a single
+  abusive IP could burn the published Anthropic spend ceiling in
+  minutes. Limit lives in-process (`web.ratelimit.RateLimiter`) and
+  is keyed on `Fly-Client-IP` (Fly's edge proxy) → `X-Forwarded-For`
+  → socket peer; `request.client.host` alone would be useless on
+  Fly because it is one of Fly's load-balancer addresses. Bounded
+  to 10 000 tracked IPs (LRU-by-insert eviction) so a flood of
+  unique sources cannot OOM the process. Over-quota responses are
+  HTTP 429 with a `Retry-After` header, returned before the
+  multipart body is parsed (FastAPI dependency runs first when the
+  dep only takes `Request`). Picked over a global cap because per-IP
+  bounds the worst case from a single attacker; a global counter
+  would not have helped against a botnet sending one request per IP
+  and would have hurt legitimate concurrent use during a launch.
+- **Fly.io deployment config** (`Dockerfile`, `fly.toml`, `.dockerignore`)
+  for the web builder. The image is `python:3.12-slim` with the project
+  installed in editable mode so `trailstory.renderers.html` keeps
+  finding the top-level `templates/` directory at runtime; the renderer
+  resolves it via `Path(__file__).parents[2] / "templates"`, which only
+  works when the package source lives next to `templates/` — a
+  non-editable install would relocate `trailstory/` into site-packages
+  and break the path. `fly.toml` ships a 512 MB shared-cpu-1x VM in
+  `fra` with a `/healthz` HTTP check, `force_https`, auto-stop on idle,
+  and no persistent volume — the 30-min retention sweep runs against
+  `/tmp` and restarts wipe in-flight workspaces, which is *stronger*
+  than the published privacy promise.
+- **`/version` endpoint** that returns the running image's git SHA
+  (sourced from the `GIT_SHA` build arg, falls back to `"unknown"` for
+  local runs). Wired to the `make deploy` target so every Fly deploy
+  stamps the current commit into the image and a deploy-correlated bug
+  can be tied back to the source without log archaeology.
+- **`make docker-build` / `make deploy` targets**. `deploy` refuses to
+  ship a dirty working tree and forwards `GIT_SHA` to
+  `flyctl deploy --build-arg`, so the SHA in `/version` always matches
+  the commit Fly built from. `docker-build` exists for local smoke
+  tests before the first deploy.
 - **Streaming narrative generation via Server-Sent Events** for the web
   builder. `POST /generate` now runs the deterministic prep phase
   (parse GPX + load_photos + persist `pending.json`), wipes the raw
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..aef9d0b
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,50 @@
+# syntax=docker/dockerfile:1.7
+#
+# Trailstory web builder — single-stage image for Fly.io.
+#
+# We install the package in editable mode on purpose: the HTML renderer
+# resolves the memory template via ``Path(__file__).parents[2] / "templates"``
+# (see trailstory/renderers/html.py:27), which only works when the
+# package source lives next to the top-level templates/ directory at
+# runtime. A non-editable install would relocate trailstory/ into
+# site-packages and break that path. See CLAUDE.md → "Architecture and
+# file map" for the layout this depends on.
+
+FROM python:3.12-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1
+
+WORKDIR /app
+
+# Metadata and source are copied together because the editable install
+# below needs the package on disk; any source change re-runs pip install.
+COPY pyproject.toml README.md LICENSE ./
+COPY trailstory/ ./trailstory/
+COPY web/ ./web/
+COPY templates/ ./templates/
+
+# Editable install: dependencies + package, with trailstory/ pointing
+# back at /app/trailstory so the renderer's parents[2] still resolves
+# to /app/templates.
+RUN pip install -e .
+
+# Drop privileges. The retention sweeper writes under $TMPDIR
+# (default /tmp) which is world-writeable, so a non-root user is fine.
+RUN useradd --create-home --uid 1000 app && chown -R app:app /app
+USER app
+
+# Build identity — set by `make deploy` / `flyctl deploy --build-arg`.
+# Surfaced via GET /version so a deploy can be tied back to a commit.
+ARG GIT_SHA=unknown
+ENV GIT_SHA=${GIT_SHA}
+
+# fly.toml's [env] sets PORT at runtime; default keeps `docker run` ergonomic.
+ENV PORT=8080
+EXPOSE 8080
+
+# Exec form wrapping `sh -c` so $PORT expands; `exec` keeps uvicorn as PID 1.
+# One worker: the LLM call is I/O-bound, v0 traffic does not justify more.
+CMD ["sh", "-c", "exec uvicorn web.__main__:app --host 0.0.0.0 --port ${PORT}"]
diff --git a/Makefile b/Makefile
index b838621..72c4a4e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 .DEFAULT_GOAL := help
-.PHONY: help setup dev install install-hooks test lint format typecheck ci clean generate test-render golden-update eval eval-live eval-update-golden web web-dev
+.PHONY: help setup dev install install-hooks test lint format typecheck ci clean generate test-render golden-update eval eval-live eval-update-golden web web-dev docker-build deploy
 
 PYTHON ?= python3.12
 VENV   := .venv
@@ -109,6 +109,26 @@ web:                ## Run the FastAPI builder against the real Anthropic API (n
 web-dev:            ## Run the FastAPI builder with a fake LLM (free, deterministic narrative)
 	$(PY) -m web --fake-llm --reload
 
+# ── Deploy ─────────────────────────────────────────────────────────────────────
+
+# Resolved at make-invocation time so the same value reaches the Dockerfile
+# build arg and the /version endpoint. `--short` keeps the SHA readable.
+GIT_SHA := $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
+
+docker-build:       ## Build the production image locally (smoke test before deploy)
+	docker build --build-arg GIT_SHA=$(GIT_SHA) -t trailstory:$(GIT_SHA) -t trailstory:latest .
+
+deploy:             ## Deploy to Fly.io with the current git SHA stamped into /version
+	@command -v flyctl >/dev/null 2>&1 || { \
+		echo "→ flyctl not found. Install: brew install flyctl"; exit 1; \
+	}
+# Untracked files count as dirty too: they can still land in the build context.
+	@if [ -n "$$(git status --porcelain)" ]; then \
+		echo "→ working tree is dirty. Commit or stash before deploying."; \
+		git status --short; exit 1; \
+	fi
+	flyctl deploy --build-arg GIT_SHA=$(GIT_SHA)
+
 # ── Cleanup ────────────────────────────────────────────────────────────────────
 
 clean:              ## Remove build artifacts, cache, and generated output
diff --git a/fly.toml b/fly.toml
new file mode 100644
index 0000000..b720f4d
--- /dev/null
+++ b/fly.toml
@@ -0,0 +1,34 @@
+# fly.toml app configuration file generated for trailstory on 2026-04-30T17:16:08+02:00
+#
+# See https://fly.io/docs/reference/configuration/ for information about how to use this file.
+#
+
+app = 'trailstory'
+primary_region = 'fra'
+
+[build]
+
+[env]
+  PORT = '8080'
+
+[http_service]
+  internal_port = 8080
+  force_https = true
+  auto_stop_machines = 'stop'
+  auto_start_machines = true
+  min_machines_running = 0
+  processes = ['app']
+
+  [[http_service.checks]]
+    interval = '30s'
+    timeout = '5s'
+    grace_period = '10s'
+    method = 'GET'
+    path = '/healthz'
+
+[[vm]]
+  size = 'shared-cpu-1x'
+  memory = '512mb'
+  cpu_kind = 'shared'
+  cpus = 1
+  memory_mb = 512
diff --git a/tests/test_ratelimit.py b/tests/test_ratelimit.py
new file mode 100644
index 0000000..3ae9071
--- /dev/null
+++ b/tests/test_ratelimit.py
@@ -0,0 +1,139 @@
+"""Tests for the per-IP rate limiter on ``/generate``.
+
+The limiter unit tests use injected ``now`` timestamps so we exercise
+window expiry without ``time.sleep`` (deterministic, sub-millisecond).
+The ``client_ip`` tests use a stubbed Starlette ``Request`` rather
+than the FastAPI ``TestClient`` so we can assert each header path in
+isolation. The route-level 429 path is exercised in ``test_web.py``.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import pytest
+
+from web.ratelimit import RateLimiter, client_ip
+
+
+def _request(headers: dict[str, str], peer: str | None = "127.0.0.1") -> Any:
+    """Build a minimal stand-in for :class:`fastapi.Request`.
+
+    ``client_ip`` only reads ``request.headers`` and ``request.client``,
+    so a MagicMock is enough — we avoid spinning up a real Starlette
+    Request just for header lookup.
+    """
+    request = MagicMock()
+    request.headers = headers
+    if peer is None:
+        request.client = None
+    else:
+        client = MagicMock()
+        client.host = peer
+        request.client = client
+    return request
+
+
+# ── RateLimiter ──────────────────────────────────────────────────────────────
+
+
+def test_limiter_allows_up_to_limit() -> None:
+    rl = RateLimiter(limit=3, window_seconds=60)
+    for _ in range(3):
+        allowed, retry = rl.check("ip1", now=100.0)
+        assert allowed is True
+        assert retry == 0
+
+
+def test_limiter_rejects_over_limit_with_retry_after() -> None:
+    rl = RateLimiter(limit=2, window_seconds=60)
+    rl.check("ip1", now=100.0)
+    rl.check("ip1", now=110.0)
+    allowed, retry = rl.check("ip1", now=120.0)
+    assert allowed is False
+    # Oldest entry was at 100; window 60s → ages out at 160; +1 buffer.
+    assert retry == 41
+
+
+def test_limiter_separates_keys() -> None:
+    rl = RateLimiter(limit=1, window_seconds=60)
+    assert rl.check("a", now=100.0) == (True, 0)
+    assert rl.check("b", now=100.0) == (True, 0)
+    assert rl.check("a", now=100.0)[0] is False
+    assert rl.check("b", now=100.0)[0] is False
+
+
+def test_limiter_window_expiry_restores_quota() -> None:
+    rl = RateLimiter(limit=1, window_seconds=10)
+    assert rl.check("ip", now=100.0)[0] is True
+    assert rl.check("ip", now=105.0)[0] is False
+    # 11s past the original entry — it has aged out of the 10s window.
+    assert rl.check("ip", now=111.0)[0] is True
+
+
+def test_limiter_uses_real_clock_when_now_is_none() -> None:
+    """A bare ``check(key)`` call should still work — we just want
+    a smoke test that the wall-clock branch exercises without
+    raising. Two calls back-to-back are within the default window
+    so both are allowed under a generous limit."""
+    rl = RateLimiter(limit=10, window_seconds=60)
+    assert rl.check("ip")[0] is True
+    assert rl.check("ip")[0] is True
+
+
+def test_limiter_evicts_oldest_when_max_keys_hit() -> None:
+    """A flood of unique IPs must not grow the bucket map without bound.
+
+    With ``max_keys=2``, inserting a third key evicts the first.
+    We can't see the dict directly, but the eviction behaviour is
+    observable: the evicted key gets a fresh bucket on its next
+    request even after it was previously over-quota.
+    """
+    rl = RateLimiter(limit=1, window_seconds=60, max_keys=2)
+    rl.check("a", now=1.0)  # bucket for "a" full
+    rl.check("b", now=2.0)  # bucket for "b" full
+    rl.check("c", now=3.0)  # forces eviction of "a"
+    # If "a" had been preserved, this would be denied.
+    assert rl.check("a", now=4.0)[0] is True
+
+
+def test_limiter_rejects_zero_or_negative_limit() -> None:
+    with pytest.raises(ValueError):
+        RateLimiter(limit=0, window_seconds=60)
+    with pytest.raises(ValueError):
+        RateLimiter(limit=-1, window_seconds=60)
+    with pytest.raises(ValueError):
+        RateLimiter(limit=1, window_seconds=0)
+
+
+# ── client_ip ────────────────────────────────────────────────────────────────
+
+
+def test_client_ip_prefers_fly_client_ip_header() -> None:
+    request = _request(
+        headers={
+            "fly-client-ip": "203.0.113.1",
+            "x-forwarded-for": "198.51.100.2",
+        },
+        peer="10.0.0.1",
+    )
+    assert client_ip(request) == "203.0.113.1"
+
+
+def test_client_ip_falls_back_to_xff_first_hop() -> None:
+    request = _request(
+        headers={"x-forwarded-for": "203.0.113.1, 198.51.100.2, 10.0.0.1"},
+        peer="10.0.0.1",
+    )
+    assert client_ip(request) == "203.0.113.1"
+
+
+def test_client_ip_falls_back_to_socket_peer() -> None:
+    request = _request(headers={}, peer="127.0.0.1")
+    assert client_ip(request) == "127.0.0.1"
+
+
+def test_client_ip_returns_unknown_when_no_signal() -> None:
+    request = _request(headers={}, peer=None)
+    assert client_ip(request) == "unknown"
diff --git a/tests/test_web.py b/tests/test_web.py
index e3adc42..ac2df84 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -37,6 +37,7 @@
 from trailstory.config import Settings
 from trailstory.llm.client import AnthropicClient
 from web.app import create_app
+from web.ratelimit import RateLimiter
 from web.routes import (
     MAX_GPX_BYTES,
     MAX_PHOTO_BYTES,
@@ -130,12 +131,14 @@ def _app_with_storage(
     storage: Storage,
     *,
     client: MagicMock | None = None,
+    rate_limiter: RateLimiter | None = None,
 ) -> tuple[FastAPI, MagicMock]:
     fake = client if client is not None else _make_client()
     app = create_app(
         settings=_settings(),
         storage=storage,
         client_factory=lambda: fake,
+        rate_limiter=rate_limiter,
         enable_sweeper=False,
     )
     return app, fake
@@ -296,6 +299,26 @@ def test_healthz_returns_status_ok(client: TestClient) -> None:
     assert response.json() == {"status": "ok"}
 
 
+def test_version_reports_git_sha_from_env(
+    client: TestClient, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """``GET /version`` echoes ``GIT_SHA`` for deploy traceability."""
+    monkeypatch.setenv("GIT_SHA", "abc1234")
+    response = client.get("/version")
+    assert response.status_code == 200
+    body = response.json()
+    assert body["git_sha"] == "abc1234"
+    assert body["version"] == "0.1.0"
+
+
+def test_version_falls_back_to_unknown(client: TestClient, monkeypatch: pytest.MonkeyPatch) -> None:
+    """Local runs without ``GIT_SHA`` set still return a well-formed payload."""
+    monkeypatch.delenv("GIT_SHA", raising=False)
+    response = client.get("/version")
+    assert response.status_code == 200
+    assert response.json() == {"version": "0.1.0", "git_sha": "unknown"}
+
+
 # ── happy path ───────────────────────────────────────────────────────────────
 
 
@@ -622,6 +645,70 @@ def test_generate_rejects_oversized_photo(
     assert "Photo exceeds" in response.json()["detail"]
 
 
+# ── rate limit ───────────────────────────────────────────────────────────────
+
+
+def test_generate_returns_429_when_over_rate_limit(storage: Storage) -> None:
+    """A second call from the same client after hitting the cap gets 429.
+
+    Uses a tiny limit so the rejection path is reachable in two calls.
+    The 429 must include a positive ``Retry-After`` header — that is
+    the public contract the client UI can rely on.
+    """
+    app, _ = _app_with_storage(
+        storage,
+        rate_limiter=RateLimiter(limit=1, window_seconds=3600),
+    )
+    with TestClient(app) as c:
+        first = c.post(
+            "/generate",
+            data={
+                "description": "The fog cleared just as we reached the ridge.",
+                "style": "editorial",
+            },
+            files=_generate_files(),
+        )
+        second = c.post(
+            "/generate",
+            data={"description": "Same client trying again.", "style": "editorial"},
+            files=_generate_files(),
+        )
+    assert first.status_code == 200
+    assert second.status_code == 429
+    assert int(second.headers["retry-after"]) > 0
+    assert "Too many memory generations" in second.json()["detail"]
+
+
+def test_generate_rate_limit_keys_on_fly_client_ip(storage: Storage) -> None:
+    """Two distinct ``Fly-Client-IP`` values get independent buckets."""
+    app, _ = _app_with_storage(
+        storage,
+        rate_limiter=RateLimiter(limit=1, window_seconds=3600),
+    )
+    with TestClient(app) as c:
+        first = c.post(
+            "/generate",
+            data={"description": "Client A.", "style": "editorial"},
+            files=_generate_files(),
+            headers={"Fly-Client-IP": "203.0.113.1"},
+        )
+        second = c.post(
+            "/generate",
+            data={"description": "Client B.", "style": "editorial"},
+            files=_generate_files(),
+            headers={"Fly-Client-IP": "203.0.113.2"},
+        )
+        third = c.post(
+            "/generate",
+            data={"description": "Client A again.", "style": "editorial"},
+            files=_generate_files(),
+            headers={"Fly-Client-IP": "203.0.113.1"},
+        )
+    assert first.status_code == 200
+    assert second.status_code == 200
+    assert third.status_code == 429
+
+
 # ── memory page / carousel 404s ──────────────────────────────────────────────
 
 
diff --git a/web/app.py b/web/app.py
index 6ff905b..92e0b2c 100644
--- a/web/app.py
+++ b/web/app.py
@@ -28,6 +28,7 @@
 
 from trailstory.config import Settings, load_settings
 from trailstory.llm.client import AnthropicClient
+from web.ratelimit import GENERATE_LIMIT_PER_HOUR, GENERATE_WINDOW_SECONDS, RateLimiter
 from web.routes import router
 from web.storage import Storage
 
@@ -48,6 +49,7 @@ def create_app(
     settings: Settings | None = None,
     storage: Storage | None = None,
     client_factory: Callable[[], AnthropicClient] | None = None,
+    rate_limiter: RateLimiter | None = None,
     enable_sweeper: bool = True,
 ) -> FastAPI:
     """Build a configured FastAPI app.
@@ -61,6 +63,10 @@ def create_app(
         client_factory: Callable returning a configured
             ``AnthropicClient``. Tests inject a factory that returns a
             ``MagicMock`` so the real SDK is never reached.
+        rate_limiter: Per-IP limiter for ``/generate``. Defaults to a
+            sliding-window ``RateLimiter`` sized at
+            :data:`web.ratelimit.GENERATE_LIMIT_PER_HOUR`. Tests pass
+            a tiny limit so the 429 path is reachable in a few calls.
         enable_sweeper: When ``True`` (default), the retention sweep is
             scheduled on app startup. Tests disable this so they can
             assert sweep behaviour by calling ``Storage.sweep_expired``
@@ -71,6 +77,14 @@ def create_app(
     resolved_factory = (
         client_factory if client_factory is not None else _default_client_factory(resolved_settings)
     )
+    resolved_limiter = (
+        rate_limiter
+        if rate_limiter is not None
+        else RateLimiter(
+            limit=GENERATE_LIMIT_PER_HOUR,
+            window_seconds=GENERATE_WINDOW_SECONDS,
+        )
+    )
 
     @asynccontextmanager
     async def lifespan(app: FastAPI) -> AsyncIterator[None]:
@@ -103,6 +117,7 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
     app.state.settings = resolved_settings
     app.state.storage = resolved_storage
     app.state.client_factory = resolved_factory
+    app.state.generate_limiter = resolved_limiter
     app.state.templates = Jinja2Templates(directory=str(TEMPLATE_DIR))
 
     if STATIC_DIR.is_dir():
diff --git a/web/ratelimit.py b/web/ratelimit.py
new file mode 100644
index 0000000..7bfd506
--- /dev/null
+++ b/web/ratelimit.py
@@ -0,0 +1,166 @@
+"""In-process per-IP rate limit for the expensive ``/generate`` route.
+
+A single ``/generate`` request triggers an Anthropic narrative call
+costing roughly $0.10-$0.30. Without a cap, one abusive client could
+burn through the published Anthropic spend ceiling in minutes — Fly's
+own pay-as-you-go tier exposes no hard cap on most accounts, so the
+defense lives in the app.
+
+Implementation:
+
+* Sliding-window counter per client IP. Each ``check`` evicts
+  expired timestamps from the head of the per-IP deque before
+  comparing length against the configured limit.
+* Bounded total size — when ``max_keys`` is reached we drop the
+  oldest-inserted IP (FIFO, not true LRU: hits do not refresh a key).
+  Python's dict insertion-order semantics give us that for free, which
+  is good enough to bound RAM under a flood of unique source IPs.
+* Thread-safe via a single ``threading.Lock`` around the bucket
+  map. uvicorn's worker model means concurrent requests can land at
+  the same time; the lock hold is microseconds so contention is not
+  meaningful at v0 traffic.
+* In-process state only. A restart wipes the counters; that's fine
+  because the window is short (one hour). With Fly's HA pair the
+  same client can land on either machine, leaking at most 2x the
+  per-IP cap. Across-replica accuracy would need Redis, which is
+  more infrastructure than v0 warrants.
+
+Behind Fly's edge proxy ``request.client.host`` is one of Fly's edge
+IPs, identical for all clients. The real client lives in
+``Fly-Client-IP`` — :func:`client_ip` reads that first, falling back
+to ``X-Forwarded-For`` for non-Fly deployments and finally to the
+socket peer for local development.
+"""
+
+from __future__ import annotations
+
+import threading
+import time
+from collections import deque
+from typing import Final
+
+from fastapi import HTTPException, Request
+
+# Per-IP cap. Picked so a single client can iterate a few times on
+# their own memory (3-5 prompt tweaks plus the carousel call) without
+# friction, while bounding the worst case to roughly $2/hour of
+# Anthropic spend per IP.
+GENERATE_LIMIT_PER_HOUR: Final[int] = 10
+GENERATE_WINDOW_SECONDS: Final[int] = 60 * 60
+
+# Hard ceiling on the IP map so a flood of unique source IPs cannot
+# OOM the process. 10k entries is roughly 10 MB of deques at the
+# per-IP cap on CPython; far below the 512 MB Fly machine.
+DEFAULT_MAX_KEYS: Final[int] = 10_000
+
+
+class RateLimiter:
+    """Sliding-window per-key request counter.
+
+    One instance per app, hung off ``app.state.generate_limiter``.
+    Tests construct their own with a tiny ``limit`` so the 429 path
+    is reachable in a few requests without sleeping.
+    """
+
+    __slots__ = ("_buckets", "_limit", "_lock", "_max_keys", "_window")
+
+    def __init__(
+        self,
+        *,
+        limit: int,
+        window_seconds: int,
+        max_keys: int = DEFAULT_MAX_KEYS,
+    ) -> None:
+        if limit <= 0:
+            raise ValueError("limit must be positive")
+        if window_seconds <= 0:
+            raise ValueError("window_seconds must be positive")
+        self._limit = limit
+        self._window = window_seconds
+        self._max_keys = max_keys
+        self._buckets: dict[str, deque[float]] = {}
+        self._lock = threading.Lock()
+
+    def check(self, key: str, *, now: float | None = None) -> tuple[bool, int]:
+        """Record a request attempt for ``key`` and return the verdict.
+
+        Returns ``(allowed, retry_after_seconds)``. When ``allowed``
+        is ``False``, ``retry_after_seconds`` is the integer-second
+        wait until the oldest in-window entry ages out — suitable
+        for a ``Retry-After`` header.
+        """
+        ts = now if now is not None else time.monotonic()
+        cutoff = ts - self._window
+        with self._lock:
+            bucket = self._buckets.get(key)
+            if bucket is None:
+                if len(self._buckets) >= self._max_keys:
+                    # Drop the LRU-by-insert entry. We re-insert keys
+                    # below by ``self._buckets[key] = bucket``, which
+                    # does NOT refresh insertion order in Python's
+                    # dict — but ``check`` for an existing key
+                    # short-circuits the eviction branch entirely, so
+                    # the eviction here only fires for genuinely new
+                    # keys and the LRU semantics hold.
+                    self._buckets.pop(next(iter(self._buckets)))
+                bucket = deque()
+                self._buckets[key] = bucket
+            while bucket and bucket[0] <= cutoff:
+                bucket.popleft()
+            if len(bucket) >= self._limit:
+                # +1 so a client that obeys Retry-After lands just
+                # past the boundary instead of bouncing again on the
+                # rounding floor.
+                retry_after = max(1, int(bucket[0] + self._window - ts) + 1)
+                return False, retry_after
+            bucket.append(ts)
+            return True, 0
+
+
+def client_ip(request: Request) -> str:
+    """Best-effort extraction of the originating client IP.
+
+    Trust order:
+
+    1. ``Fly-Client-IP`` — set by Fly's edge proxy and not forwardable
+       by the client. This is the truth on Fly.
+    2. ``X-Forwarded-For`` — first hop. Used by other proxies. A
+       hostile client behind a non-Fly deployment could spoof this,
+       but the rate-limit blast radius is limited to one IP at a
+       time, so the worst case is "evades the limit" not "breaks the
+       process".
+    3. ``request.client.host`` — the socket peer. The right answer
+       for a direct connection (local dev). On Fly this would be one
+       of the edge IPs, useless for rate limiting.
+    """
+    fly_ip = request.headers.get("fly-client-ip")
+    if fly_ip:
+        return fly_ip
+    forwarded = request.headers.get("x-forwarded-for")
+    if forwarded:
+        return forwarded.split(",")[0].strip()
+    if request.client is not None:
+        return request.client.host
+    return "unknown"
+
+
+def enforce_generate_limit(request: Request) -> None:
+    """FastAPI dependency for the ``/generate`` route.
+
+    Fires before the form body is parsed (the dependency only takes
+    ``Request``, so FastAPI does not materialise multipart fields to
+    resolve it). A rate-limited client therefore gets a 429 without
+    us reading the upload — the bandwidth still costs us a little
+    but the expensive Anthropic call is cleanly avoided.
+    """
+    limiter: RateLimiter = request.app.state.generate_limiter
+    allowed, retry_after = limiter.check(client_ip(request))
+    if not allowed:
+        raise HTTPException(
+            status_code=429,
+            detail=(
+                "Too many memory generations from your address. "
+                f"Try again in {retry_after} seconds."
+            ),
+            headers={"Retry-After": str(retry_after)},
+        )
diff --git a/web/routes.py b/web/routes.py
index 6550c7c..8f6b1e3 100644
--- a/web/routes.py
+++ b/web/routes.py
@@ -1,6 +1,6 @@
 """HTTP route handlers for the web builder.
 
-Eight endpoints, all stateless from the user's point of view:
+Nine endpoints, all stateless from the user's point of view:
 
 * ``GET /``                          — landing page + builder form.
 * ``POST /generate``                 — multipart upload; runs the prep
@@ -14,6 +14,8 @@
 * ``POST /memory/{slug}/carousel``   — generates the IG carousel on demand.
 * ``GET /privacy``                   — plain-language privacy page.
 * ``GET /healthz``                   — uptime probe.
+* ``GET /version``                   — build identity (git SHA from the
+                                        deploy image).
 * ``GET /memory/{slug}/carousel/{filename}`` — serves a single slide.
 
 Heavy lifting (parse / load / narrative / render) lives in
@@ -27,12 +29,13 @@
 
 import json
 import logging
+import os
 import threading
 from collections.abc import Callable, Iterable, Iterator
 from pathlib import Path
 from typing import Annotated, Final
 
-from fastapi import APIRouter, BackgroundTasks, Form, HTTPException, Request, UploadFile
+from fastapi import APIRouter, BackgroundTasks, Depends, Form, HTTPException, Request, UploadFile
 from fastapi.responses import (
     FileResponse,
     HTMLResponse,
@@ -54,6 +57,7 @@
     render_carousel,
     stream_pipeline,
 )
+from web.ratelimit import enforce_generate_limit
 from web.storage import Storage, Workspace
 
 logger = logging.getLogger(__name__)
@@ -119,10 +123,30 @@ async def healthz() -> dict[str, str]:
     return {"status": "ok"}
 
 
+@router.get("/version")
+async def version() -> dict[str, str]:
+    """Build identity for the running image.
+
+    ``git_sha`` is injected at image build time via the ``GIT_SHA``
+    Docker build arg (see Dockerfile + ``make deploy``); local runs
+    fall through to ``"unknown"``. ``version`` mirrors the value
+    declared in ``pyproject.toml`` so a deployed bug can be tied back
+    to a specific commit + release without log archaeology.
+    """
+    return {
+        "version": "0.1.0",
+        "git_sha": os.environ.get("GIT_SHA", "unknown"),
+    }
+
+
 # ── pipeline ─────────────────────────────────────────────────────────────────
 
 
-@router.post("/generate", response_class=HTMLResponse)
+@router.post(
+    "/generate",
+    response_class=HTMLResponse,
+    dependencies=[Depends(enforce_generate_limit)],
+)
 async def generate(
     request: Request,
     background_tasks: BackgroundTasks,
@@ -139,6 +163,11 @@ async def generate(
     (``pending.json``) before responding so the SSE endpoint can pick up
     even after the BackgroundTask has wiped the raw uploads.
 
+    The route is rate-limited per client IP via
+    :func:`web.ratelimit.enforce_generate_limit`; an over-quota client
+    gets a 429 with a ``Retry-After`` header before the multipart body
+    is parsed.
+
     Raises a 4xx if the inputs are missing, oversized, or unsupported;
     pipeline parse / photo-load errors surface as 400 here rather than
     in the SSE stream so the user gets immediate feedback.