From f3b7f5fb9e7b73f3d8fb4059950efac0b722b139 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Wed, 6 May 2026 18:13:19 -0700 Subject: [PATCH 1/4] test(helm): Add kube gateway e2e tests Signed-off-by: Taylor Mutch --- e2e/rust/e2e-helm.sh | 20 ++ e2e/rust/src/harness/driver.rs | 20 ++ e2e/rust/src/harness/mod.rs | 1 + e2e/rust/tests/forward_proxy_graphql_l7.rs | 4 + e2e/rust/tests/forward_proxy_l7_bypass.rs | 7 + e2e/rust/tests/host_gateway_alias.rs | 10 + e2e/with-kube-gateway.sh | 220 +++++++++++++++++++++ tasks/test.toml | 4 + 8 files changed, 286 insertions(+) create mode 100755 e2e/rust/e2e-helm.sh create mode 100644 e2e/rust/src/harness/driver.rs create mode 100755 e2e/with-kube-gateway.sh diff --git a/e2e/rust/e2e-helm.sh b/e2e/rust/e2e-helm.sh new file mode 100755 index 000000000..7d7042c47 --- /dev/null +++ b/e2e/rust/e2e-helm.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run a Rust e2e test against a Helm-deployed OpenShell gateway. Set +# OPENSHELL_E2E_KUBE_CONTEXT to target an existing cluster; otherwise an +# ephemeral k3d cluster is created and torn down by with-kube-gateway.sh. + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +E2E_TEST="${OPENSHELL_E2E_KUBE_TEST:-smoke}" + +cargo build -p openshell-cli --features openshell-core/dev-settings + +exec "${ROOT}/e2e/with-kube-gateway.sh" \ + cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ + --features e2e \ + --test "${E2E_TEST}" \ + -- --nocapture diff --git a/e2e/rust/src/harness/driver.rs b/e2e/rust/src/harness/driver.rs new file mode 100644 index 000000000..07921e461 --- /dev/null +++ b/e2e/rust/src/harness/driver.rs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! 
Active compute-driver detection for tests with driver-specific assumptions. + +/// Returns true and prints a skip notice when running against the kube driver. +/// +/// Tests that depend on docker/podman host-network features (e.g. +/// `host.openshell.internal` reachability, sibling-container test servers) +/// can early-return when this is true. +pub fn skip_if_kube(reason: &str) -> bool { + if matches!( + std::env::var("OPENSHELL_E2E_DRIVER").as_deref(), + Ok("kubernetes") + ) { + eprintln!("skipping on kubernetes driver: {reason}"); + return true; + } + false +} diff --git a/e2e/rust/src/harness/mod.rs b/e2e/rust/src/harness/mod.rs index 5feb21c70..89a095548 100644 --- a/e2e/rust/src/harness/mod.rs +++ b/e2e/rust/src/harness/mod.rs @@ -5,6 +5,7 @@ pub mod binary; pub mod container; +pub mod driver; pub mod gateway; pub mod output; pub mod port; diff --git a/e2e/rust/tests/forward_proxy_graphql_l7.rs b/e2e/rust/tests/forward_proxy_graphql_l7.rs index aeb3648b0..bfc561a20 100644 --- a/e2e/rust/tests/forward_proxy_graphql_l7.rs +++ b/e2e/rust/tests/forward_proxy_graphql_l7.rs @@ -13,6 +13,7 @@ use std::io::Write; use openshell_e2e::harness::container::ContainerHttpServer; +use openshell_e2e::harness::driver::skip_if_kube; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; @@ -131,6 +132,9 @@ network_policies: #[tokio::test] #[allow(clippy::too_many_lines)] async fn graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths() { + if skip_if_kube("uses host.openshell.internal to reach a sibling container") { + return; + } let server = start_test_server().await.expect("start test server"); let policy = write_graphql_policy(&server.host, server.port).expect("write custom policy"); let policy_path = policy diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 6cbaca1eb..1d3f872d0 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ 
b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -11,6 +11,7 @@ use std::io::Write; use openshell_e2e::harness::container::ContainerHttpServer; +use openshell_e2e::harness::driver::skip_if_kube; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; @@ -98,6 +99,9 @@ network_policies: /// GET /allowed should succeed — the L7 policy explicitly allows it. #[tokio::test] async fn forward_proxy_allows_l7_permitted_request() { + if skip_if_kube("uses host.openshell.internal to reach a sibling container") { + return; + } let server = start_test_server().await.expect("start test server"); let policy = write_policy_with_l7_rules(&server.host, server.port).expect("write custom policy"); @@ -138,6 +142,9 @@ except Exception as e: /// POST /allowed should be denied — the L7 policy only allows GET. #[tokio::test] async fn forward_proxy_denies_l7_blocked_request() { + if skip_if_kube("uses host.openshell.internal to reach a sibling container") { + return; + } let server = start_test_server().await.expect("start test server"); let policy = write_policy_with_l7_rules(&server.host, server.port).expect("write custom policy"); diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index 2dbdbf1dc..8e58a3de1 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -8,6 +8,7 @@ use std::process::Stdio; use std::sync::Mutex; use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::driver::skip_if_kube; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; use tokio::io::AsyncReadExt; @@ -190,6 +191,9 @@ network_policies: #[tokio::test] async fn sandbox_reaches_host_openshell_internal_via_host_gateway_alias() { + if skip_if_kube("requires host.openshell.internal alias") { + return; + } let server = HostServer::start(r#"{"message":"hello-from-host"}"#) .await .expect("start host echo server"); @@ -225,6 +229,9 @@ async fn 
sandbox_reaches_host_openshell_internal_via_host_gateway_alias() { #[tokio::test] async fn sandbox_inference_local_routes_to_host_openshell_internal() { + if skip_if_kube("requires host.openshell.internal alias") { + return; + } let _inference_lock = INFERENCE_ROUTE_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); @@ -301,6 +308,9 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { #[tokio::test] async fn inference_set_supports_no_verify_for_unreachable_endpoint() { + if skip_if_kube("uses host.openshell.internal as the unreachable target") { + return; + } let _inference_lock = INFERENCE_ROUTE_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh new file mode 100755 index 000000000..d316876e6 --- /dev/null +++ b/e2e/with-kube-gateway.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run an e2e command against a Helm-deployed OpenShell gateway in Kubernetes. +# +# Modes: +# - OPENSHELL_E2E_KUBE_CONTEXT set: +# Target the named kubectl context, install the chart into an ephemeral +# namespace, and port-forward the gateway. Cluster lifecycle is the +# caller's responsibility (e.g. CI provisions kind via helm/kind-action). +# - OPENSHELL_E2E_KUBE_CONTEXT unset: +# Create a local k3d cluster via tasks/scripts/helm-k3s-local.sh, install +# the chart, port-forward, and tear the cluster down on exit. +# +# Helm e2e currently uses plaintext gateway traffic (ci/values-tls-disabled.yaml). +# +# Image source: helm install pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} +# (defaults: ghcr.io/nvidia/openshell, latest). 
CI sets IMAGE_TAG to the commit SHA; +# local devs should set it to a tag pulled from a registry the cluster can reach, +# or build and import images via a separate bootstrap step before running this script. + +set -euo pipefail + +if [ "$#" -eq 0 ]; then + echo "Usage: e2e/with-kube-gateway.sh [args...]" >&2 + exit 2 +fi + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +# shellcheck source=e2e/support/gateway-common.sh +source "${ROOT}/e2e/support/gateway-common.sh" + +WORKDIR_PARENT="${TMPDIR:-/tmp}" +WORKDIR_PARENT="${WORKDIR_PARENT%/}" +WORKDIR="$(mktemp -d "${WORKDIR_PARENT}/openshell-e2e-kube.XXXXXX")" + +CLUSTER_CREATED_BY_US=0 +CLUSTER_NAME="" +KUBE_CONTEXT="" +NAMESPACE="openshell" +RELEASE_NAME="openshell" +PORTFORWARD_PID="" +PORTFORWARD_LOG="${WORKDIR}/portforward.log" +HELM_INSTALLED=0 + +# Isolate CLI/SDK gateway metadata from the developer's real config. +export XDG_CONFIG_HOME="${WORKDIR}/config" +export XDG_DATA_HOME="${WORKDIR}/data" + +kctl() { + kubectl --context "${KUBE_CONTEXT}" "$@" +} + +helmctl() { + helm --kube-context "${KUBE_CONTEXT}" "$@" +} + +cleanup() { + local exit_code=$? 
+ + if [ -n "${PORTFORWARD_PID}" ]; then + kill "${PORTFORWARD_PID}" >/dev/null 2>&1 || true + wait "${PORTFORWARD_PID}" >/dev/null 2>&1 || true + fi + + if [ "${exit_code}" -ne 0 ] && [ -n "${KUBE_CONTEXT}" ] && [ -n "${NAMESPACE}" ]; then + if command -v kubectl >/dev/null 2>&1 \ + && kctl get namespace "${NAMESPACE}" >/dev/null 2>&1; then + echo "=== gateway pod state (preserved for debugging) ===" + kctl -n "${NAMESPACE}" get pods -o wide 2>&1 || true + echo "=== gateway events ===" + kctl -n "${NAMESPACE}" get events --sort-by=.lastTimestamp 2>&1 \ + | tail -n 80 || true + echo "=== gateway logs (last 200 lines) ===" + kctl -n "${NAMESPACE}" logs \ + -l "app.kubernetes.io/instance=${RELEASE_NAME}" --tail=200 \ + --all-containers --prefix 2>&1 || true + echo "=== end gateway debug output ===" + fi + if [ -f "${PORTFORWARD_LOG}" ]; then + echo "=== port-forward log ===" + cat "${PORTFORWARD_LOG}" || true + echo "=== end port-forward log ===" + fi + fi + + if [ "${HELM_INSTALLED}" = "1" ] && [ -n "${KUBE_CONTEXT}" ] && [ -n "${NAMESPACE}" ]; then + if command -v helm >/dev/null 2>&1; then + helmctl uninstall "${RELEASE_NAME}" --namespace "${NAMESPACE}" --wait \ + --timeout 60s >/dev/null 2>&1 || true + fi + if command -v kubectl >/dev/null 2>&1; then + kctl delete namespace "${NAMESPACE}" --wait=false \ + --ignore-not-found >/dev/null 2>&1 || true + fi + fi + + if [ "${CLUSTER_CREATED_BY_US}" = "1" ] && [ -n "${CLUSTER_NAME}" ]; then + if command -v k3d >/dev/null 2>&1 && k3d cluster list "${CLUSTER_NAME}" \ + >/dev/null 2>&1; then + echo "Deleting ephemeral k3d cluster ${CLUSTER_NAME}..." + k3d cluster delete "${CLUSTER_NAME}" >/dev/null 2>&1 || true + fi + fi + + rm -rf "${WORKDIR}" 2>/dev/null || true +} +trap cleanup EXIT + +require_cmd() { + if ! 
command -v "$1" >/dev/null 2>&1; then + echo "ERROR: $1 is required to run Helm-backed e2e tests" >&2 + exit 2 + fi +} + +require_cmd helm +require_cmd kubectl +require_cmd curl + +if [ -n "${OPENSHELL_E2E_KUBE_CONTEXT:-}" ]; then + KUBE_CONTEXT="${OPENSHELL_E2E_KUBE_CONTEXT}" + echo "Using existing kubectl context: ${KUBE_CONTEXT}" + if ! kctl cluster-info >/dev/null 2>&1; then + echo "ERROR: kubectl context '${KUBE_CONTEXT}' is not reachable." >&2 + exit 2 + fi +else + require_cmd k3d + CLUSTER_NAME="oshe2e-$$-$(date +%s | tail -c 8)" + echo "Creating ephemeral k3d cluster ${CLUSTER_NAME}..." + HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ + HELM_K3S_KUBECONFIG="${WORKDIR}/kubeconfig" \ + bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create + CLUSTER_CREATED_BY_US=1 + export KUBECONFIG="${WORKDIR}/kubeconfig" + KUBE_CONTEXT="k3d-${CLUSTER_NAME}" +fi + +IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}" +REGISTRY_VALUE="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}" +REGISTRY_VALUE="${REGISTRY_VALUE%/}" + +# When this script created the cluster, import locally-available gateway and +# supervisor images so devs without a registry login can iterate. Best-effort: +# missing images fall through to the cluster's pull behavior at install time. +if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then + for image in \ + "${REGISTRY_VALUE}/gateway:${IMAGE_TAG_VALUE}" \ + "${REGISTRY_VALUE}/supervisor:${IMAGE_TAG_VALUE}"; do + if docker image inspect "${image}" >/dev/null 2>&1; then + echo "Importing ${image} into k3d cluster ${CLUSTER_NAME}..." + k3d image import "${image}" --cluster "${CLUSTER_NAME}" \ + --mode direct >/dev/null + fi + done +fi + +# The Kubernetes compute driver creates and watches Sandbox CRs reconciled +# by the upstream agent-sandbox-controller. Without the CRD + controller, +# every gateway K8s call 404s and CreateSandbox never produces a Pod. +echo "Installing agent-sandbox CRDs and controller..." 
+kctl apply -f "${ROOT}/deploy/kube/manifests/agent-sandbox.yaml" +kctl wait --for=condition=Established crd/sandboxes.agents.x-k8s.io --timeout=120s +kctl -n agent-sandbox-system rollout status statefulset/agent-sandbox-controller --timeout=300s + +echo "Installing Helm chart (release=${RELEASE_NAME}, namespace=${NAMESPACE}, tag=${IMAGE_TAG_VALUE})..." +helmctl install "${RELEASE_NAME}" "${ROOT}/deploy/helm/openshell" \ + --namespace "${NAMESPACE}" --create-namespace \ + --values "${ROOT}/deploy/helm/openshell/ci/values-tls-disabled.yaml" \ + --set "fullnameOverride=openshell" \ + --set "image.repository=${REGISTRY_VALUE}/gateway" \ + --set "image.tag=${IMAGE_TAG_VALUE}" \ + --set "supervisor.image.repository=${REGISTRY_VALUE}/supervisor" \ + --set "supervisor.image.tag=${IMAGE_TAG_VALUE}" \ + --wait --timeout 5m +HELM_INSTALLED=1 + +LOCAL_PORT="$(e2e_pick_port)" +echo "Starting kubectl port-forward svc/openshell ${LOCAL_PORT}:8080..." +kctl -n "${NAMESPACE}" port-forward "svc/openshell" \ + "${LOCAL_PORT}:8080" >"${PORTFORWARD_LOG}" 2>&1 & +PORTFORWARD_PID=$! + +elapsed=0 +timeout=30 +while [ "${elapsed}" -lt "${timeout}" ]; do + if ! 
kill -0 "${PORTFORWARD_PID}" 2>/dev/null; then + echo "ERROR: kubectl port-forward exited before becoming reachable" >&2 + cat "${PORTFORWARD_LOG}" >&2 || true + exit 1 + fi + if curl -s -o /dev/null --connect-timeout 1 "http://127.0.0.1:${LOCAL_PORT}"; then + break + fi + sleep 1 + elapsed=$((elapsed + 1)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: port-forward did not accept TCP within ${timeout}s" >&2 + cat "${PORTFORWARD_LOG}" >&2 || true + exit 1 +fi + +GATEWAY_NAME="openshell-e2e-kube-${LOCAL_PORT}" +GATEWAY_ENDPOINT="http://127.0.0.1:${LOCAL_PORT}" +e2e_register_plaintext_gateway \ + "${XDG_CONFIG_HOME}" \ + "${GATEWAY_NAME}" \ + "${GATEWAY_ENDPOINT}" \ + "${LOCAL_PORT}" + +export OPENSHELL_GATEWAY="${GATEWAY_NAME}" +export OPENSHELL_E2E_DRIVER="kubernetes" +export OPENSHELL_E2E_SANDBOX_NAMESPACE="${NAMESPACE}" +export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-300}" + +echo "Running e2e command against ${GATEWAY_ENDPOINT}: $*" +"$@" diff --git a/tasks/test.toml b/tasks/test.toml index bf5741c72..c9e1dc817 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -50,6 +50,10 @@ run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m g description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" +["e2e:helm"] +description = "Run smoke e2e against a Helm-deployed gateway (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster)" +run = "e2e/rust/e2e-helm.sh" + ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" From 58c3a5a0177e5e17f9f0b6692eddf1e39cc0e4ec Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 7 May 2026 16:50:02 -0700 Subject: [PATCH 2/4] ci(helm): add Branch Helm E2E workflow gated on test:e2e-helm Adds a label-gated GitHub Actions workflow that exercises the Helm chart end-to-end against the Rust e2e suite via `mise run 
e2e:helm`. Pipeline: - pr_metadata gates on the `test:e2e-helm` label via the pr-gate action. - build-gateway / build-supervisor build and push Docker images using the reusable docker-build.yml workflow. - helm-e2e (bare runner): apt-installs z3 build deps so cargo can compile the openshell-policy crate's z3-sys backend, creates a kind cluster via helm/kind-action, materializes the kind kubeconfig at the path mise's [env] block expects, side-loads the freshly built gateway/supervisor images, applies deploy/kube/manifests/agent-sandbox.yaml so the sandboxes.agents.x-k8s.io CRD and reconciling StatefulSet are in place, and finally runs `mise run e2e:helm`. Also expands the `e2e:helm` task to run the full Rust e2e suite (matching `e2e:podman`) instead of only the smoke test, with OPENSHELL_E2E_KUBE_TEST as an opt-in single-test override for local debugging. Extends the e2e-label-help workflow so applying `test:e2e-helm` posts the next-step hint pointing at this workflow. Signed-off-by: Taylor Mutch --- .github/workflows/branch-helm-e2e.yml | 126 ++++++++++++++++++++++++++ .github/workflows/e2e-label-help.yml | 3 +- e2e/rust/e2e-helm.sh | 13 ++- tasks/test.toml | 2 +- 4 files changed, 139 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/branch-helm-e2e.yml diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml new file mode 100644 index 000000000..926874a08 --- /dev/null +++ b/.github/workflows/branch-helm-e2e.yml @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: Branch Helm E2E + +on: + push: + branches: + - "pull-request/[0-9]+" + workflow_dispatch: {} + +permissions: {} + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + with: + required_label: test:e2e-helm + + build-gateway: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: linux/amd64 + + build-supervisor: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: supervisor + platform: linux/amd64 + + helm-e2e: + name: Helm E2E (Rust smoke) + needs: [pr_metadata, build-gateway, build-supervisor] + if: needs.pr_metadata.outputs.should_run == 'true' + # Bare runner: running kind-in-container hits nested-Docker / kubeconfig + # complications. The runner has Docker; mise installs helm, kubectl, and + # the Rust toolchain. + runs-on: linux-amd64-cpu8 + timeout-minutes: 60 + permissions: + contents: read + packages: read + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + KIND_CLUSTER_NAME: helm-e2e-${{ github.run_id }} + steps: + - uses: actions/checkout@v6 + + - name: Install mise + run: | + curl https://mise.run | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH" + + - name: Install tools + run: mise install --locked + + # The openshell-policy crate transitively pulls in z3-sys, whose + # build script needs the z3 C/C++ headers and clang/bindgen to + # compile. 
The bare runner doesn't ship them; the CI container + # image used by other Rust e2e jobs does, but we can't run helm-e2e + # there (the runner's container handler injects its own --network + # bridge, which conflicts with the --network host we need so kind's + # API server is reachable from the test process). + - name: Install z3 build deps + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends libz3-dev clang + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait: 120s + + # mise.toml sets KUBECONFIG="{{config_root}}/kubeconfig"; helm/kind-action + # writes to ~/.kube/config. Materialize the kind context at the mise path + # so `mise run e2e:helm` (and the wrapper's `kubectl --context=…`) finds + # the kind cluster. + - name: Export kind kubeconfig to mise path + run: | + set -euo pipefail + kind get kubeconfig --name "$KIND_CLUSTER_NAME" > "$GITHUB_WORKSPACE/kubeconfig" + chmod 600 "$GITHUB_WORKSPACE/kubeconfig" + + # Pre-pull and side-load: kind nodes don't have ghcr credentials, and + # tagging IMAGE_TAG to a SHA means the chart's IfNotPresent pull policy + # is satisfied once the image is loaded into the node's containerd. 
+ - name: Load gateway and supervisor images into kind + run: | + set -euo pipefail + for component in gateway supervisor; do + image="ghcr.io/nvidia/openshell/${component}:${{ github.sha }}" + docker pull "$image" + kind load docker-image "$image" --name "$KIND_CLUSTER_NAME" + done + + - name: Run Helm E2E (Rust smoke) + env: + OPENSHELL_E2E_KUBE_CONTEXT: kind-${{ env.KIND_CLUSTER_NAME }} + IMAGE_TAG: ${{ github.sha }} + OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell + run: mise run --no-deps --skip-deps e2e:helm diff --git a/.github/workflows/e2e-label-help.yml b/.github/workflows/e2e-label-help.yml index 2a61660d2..a5463f986 100644 --- a/.github/workflows/e2e-label-help.yml +++ b/.github/workflows/e2e-label-help.yml @@ -19,7 +19,7 @@ permissions: {} jobs: hint: name: Post next-step hint for E2E label - if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' + if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' || github.event.label.name == 'test:e2e-helm' runs-on: ubuntu-latest permissions: pull-requests: write @@ -40,6 +40,7 @@ jobs: case "$LABEL_NAME" in test:e2e) workflow_file=branch-e2e.yml; workflow_name="Branch E2E Checks" ;; test:e2e-gpu) workflow_file=test-gpu.yml; workflow_name="GPU Test" ;; + test:e2e-helm) workflow_file=branch-helm-e2e.yml; workflow_name="Branch Helm E2E" ;; *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;; esac diff --git a/e2e/rust/e2e-helm.sh b/e2e/rust/e2e-helm.sh index 7d7042c47..6b161f344 100755 --- a/e2e/rust/e2e-helm.sh +++ b/e2e/rust/e2e-helm.sh @@ -2,19 +2,26 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Run a Rust e2e test against a Helm-deployed OpenShell gateway. Set +# Run the Rust e2e suite against a Helm-deployed OpenShell gateway. 
Set # OPENSHELL_E2E_KUBE_CONTEXT to target an existing cluster; otherwise an # ephemeral k3d cluster is created and torn down by with-kube-gateway.sh. +# Set OPENSHELL_E2E_KUBE_TEST to scope to a single integration test +# (e.g. smoke) for local debugging. set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -E2E_TEST="${OPENSHELL_E2E_KUBE_TEST:-smoke}" cargo build -p openshell-cli --features openshell-core/dev-settings +test_filter=() +if [ -n "${OPENSHELL_E2E_KUBE_TEST:-}" ]; then + test_filter+=(--test "${OPENSHELL_E2E_KUBE_TEST}") +fi + exec "${ROOT}/e2e/with-kube-gateway.sh" \ cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ --features e2e \ - --test "${E2E_TEST}" \ + --no-fail-fast \ + ${test_filter[@]+"${test_filter[@]}"} \ -- --nocapture diff --git a/tasks/test.toml b/tasks/test.toml index c9e1dc817..00a6823b2 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -51,7 +51,7 @@ description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" ["e2e:helm"] -description = "Run smoke e2e against a Helm-deployed gateway (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster)" +description = "Run Rust CLI e2e tests against a Helm-deployed gateway (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster; set OPENSHELL_E2E_KUBE_TEST= to scope to one test)" run = "e2e/rust/e2e-helm.sh" ["e2e:vm"] From dfa99476faf4aa8f268e03cf537cc2f79d6536df Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 8 May 2026 10:19:04 -0700 Subject: [PATCH 3/4] feat(observability): add gateway OTLP traces and Helm monitoring surface Adds opt-in OpenTelemetry trace export and a Prometheus ServiceMonitor to the gateway Helm chart. The exporter and chart toggles are independent from the existing /metrics surface and the OCSF sandbox log fan-out. 
- Gateway: append a tracing-opentelemetry layer to TracingLogBus when an OTLP/gRPC endpoint is configured; flush spans on shutdown. CLI gains --otlp-endpoint; standard OTEL_* env vars drive sampling and resource attributes. - Helm: monitoring.serviceMonitor.* renders a Prometheus-Operator ServiceMonitor; monitoring.tracing.* projects OTEL_* env vars onto the gateway container. Both default off. - Tooling: observability:k8s:{setup,teardown,port-forward} mise tasks install kube-prometheus-stack + Jaeger all-in-one for local dev. - Docs: new docs/kubernetes/monitoring.mdx; cross-links from observability overview and architecture/gateway.md; helm-dev-environment and debug-openshell-cluster skills updated. --- .../skills/debug-openshell-cluster/SKILL.md | 11 ++ .agents/skills/helm-dev-environment/SKILL.md | 33 ++++ Cargo.lock | 85 ++++++++++ Cargo.toml | 6 + architecture/gateway.md | 17 ++ crates/openshell-server/Cargo.toml | 6 + crates/openshell-server/src/cli.rs | 14 +- crates/openshell-server/src/lib.rs | 3 + crates/openshell-server/src/tracing_bus.rs | 160 +++++++++++++++++- deploy/helm/openshell/README.md | 10 ++ .../helm/openshell/ci/values-monitoring.yaml | 31 ++++ deploy/helm/openshell/skaffold.yaml | 28 +++ deploy/helm/openshell/templates/_helpers.tpl | 11 ++ .../openshell/templates/servicemonitor.yaml | 27 +++ .../helm/openshell/templates/statefulset.yaml | 17 ++ deploy/helm/openshell/values.yaml | 36 ++++ docs/kubernetes/monitoring.mdx | 135 +++++++++++++++ docs/observability/overview.mdx | 4 + tasks/observability.toml | 19 +++ tasks/scripts/observability-k8s-setup.sh | 96 +++++++++++ tasks/scripts/observability-k8s-teardown.sh | 27 +++ tasks/scripts/observability-port-forward.sh | 58 +++++++ 22 files changed, 829 insertions(+), 5 deletions(-) create mode 100644 deploy/helm/openshell/ci/values-monitoring.yaml create mode 100644 deploy/helm/openshell/templates/servicemonitor.yaml create mode 100644 docs/kubernetes/monitoring.mdx create mode 100644 
tasks/observability.toml create mode 100755 tasks/scripts/observability-k8s-setup.sh create mode 100755 tasks/scripts/observability-k8s-teardown.sh create mode 100755 tasks/scripts/observability-port-forward.sh diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 16158c0dc..34d407e69 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -189,6 +189,17 @@ openshell status openshell logs ``` +## Telemetry Signals + +Before drilling into logs, check whether the gateway is exporting telemetry — the pull-based metrics surface and the push-based trace export are the fastest signals that the control plane is alive and that requests are reaching it. + +| Signal | Where it shows up | When to use it | +|---|---|---| +| Prometheus metrics on `/metrics` | A scrape target via the chart's `ServiceMonitor` (`monitoring.serviceMonitor.enabled`). Local: `kubectl -n openshell port-forward statefulset/openshell :`. | Confirm the gateway listener is up and gRPC requests are landing. `up{job="openshell"} == 1` in Prometheus is a quick liveness ping. | +| OTLP traces | Jaeger / Tempo / OTel backend (`monitoring.tracing.enabled`). Look for service `openshell-gateway`. | Confirm an inbound request reached the multiplex layer; spans carry `method`, `path`, `request_id`. Missing traces under load means OTLP export is misconfigured or the endpoint is unreachable. | + +If the chart's `monitoring.serviceMonitor.enabled` or `monitoring.tracing.enabled` were not set, those signals are unavailable — fall back to gateway logs. See [Monitoring the Gateway](../../../docs/kubernetes/monitoring.mdx) for setup. 
+ ## Common Failure Patterns | Symptom | Likely cause | Check | diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 623efb2e6..1899b459f 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -169,6 +169,39 @@ To remove Keycloak: mise run keycloak:k8s:teardown ``` +### Monitoring (Prometheus + Grafana + Jaeger) + +One-time setup — installs `kube-prometheus-stack` (slimmed: no Alertmanager, +node-exporter, or kube-state-metrics) and a Jaeger all-in-one Pod: + +```bash +mise run observability:k8s:setup +``` + +Then activate monitoring on the gateway: + +1. Uncomment `#- ci/values-monitoring.yaml` in `skaffold.yaml` +2. Redeploy: `mise run helm:skaffold:run` + +Forward UIs to localhost: + +```bash +mise run observability:port-forward +# Grafana http://localhost:3000 (admin / admin) +# Prometheus http://localhost:9090 +# Jaeger UI http://localhost:16686 +``` + +Teardown: + +```bash +mise run observability:k8s:teardown +``` + +The chart's `monitoring.serviceMonitor.enabled` creates a `ServiceMonitor` +that Prometheus scrapes, and `monitoring.tracing.enabled` projects `OTEL_*` +env vars onto the gateway so it exports OTLP/gRPC traces to Jaeger. 
+ --- ## Cluster Lifecycle (suspend/resume) diff --git a/Cargo.lock b/Cargo.lock index 808956cd9..84d3a48b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,6 +3645,9 @@ dependencies = [ "openshell-policy", "openshell-providers", "openshell-router", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry_sdk", "petname", "pin-project-lite", "prost", @@ -3669,6 +3672,7 @@ dependencies = [ "tower 0.5.3", "tower-http 0.6.8", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "uuid", "wiremock", @@ -3726,6 +3730,69 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "opentelemetry" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e87237e2775f74896f9ad219d26a2081751187eb7c9f5c58dde20a23b95d16c" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d899720fe06916ccba71c01d04ecd77312734e2de3467fd30d9d580c8ce85656" +dependencies = [ + "futures-core", + "http", + "opentelemetry", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "thiserror 2.0.18", + "tokio", + "tonic", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c40da242381435e18570d5b9d50aca2a4f4f4d8e146231adb4e7768023309b3" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afdefb21d1d47394abc1ba6c57363ab141be19e27cc70d0e422b7f303e4d290b" +dependencies = [ + "futures-channel", + "futures-executor", + "futures-util", + "glob", + 
"opentelemetry", + "percent-encoding", + "rand 0.9.4", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tracing", +] + [[package]] name = "ordered-float" version = "2.10.1" @@ -6254,6 +6321,24 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd8e764bd6f5813fd8bebc3117875190c5b0415be8f7f8059bffb6ecd979c444" +dependencies = [ + "js-sys", + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + [[package]] name = "tracing-serde" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 9bc3f9ea2..3f29a33d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,6 +58,12 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tracing-appender = "0.2" +# OpenTelemetry — pinned to a tonic-0.12 / prost-0.13 compatible release set. +opentelemetry = "0.29" +opentelemetry_sdk = { version = "0.29", features = ["rt-tokio"] } +opentelemetry-otlp = { version = "0.29", default-features = false, features = ["grpc-tonic", "trace"] } +tracing-opentelemetry = "0.30" + # Metrics metrics = "0.24" metrics-exporter-prometheus = { version = "0.18", default-features = false, features = ["http-listener"] } diff --git a/architecture/gateway.md b/architecture/gateway.md index d89706e64..bee7aab97 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -54,6 +54,23 @@ Domain objects use shared metadata: stable server-generated IDs, human-readable names, creation timestamps, and labels. Crate-level details live in `crates/openshell-core/README.md`. 
+### Observability surface + +The gateway exposes three independent telemetry surfaces, each with its own +configuration knob and consumer: + +| Surface | Direction | Configured by | Consumers | +|---|---|---|---| +| Prometheus metrics on `/metrics` | Pull | `--metrics-port` (CLI), `monitoring.serviceMonitor.*` (Helm) | Prometheus / kube-prometheus-stack via `ServiceMonitor`. | +| OpenTelemetry traces over OTLP/gRPC | Push | `--otlp-endpoint` / `OTEL_EXPORTER_OTLP_*` env, `monitoring.tracing.*` (Helm) | Any OTLP backend (Jaeger, Tempo, OTel Collector). The per-request span set up by `TraceLayer` becomes the OTLP root. | +| Sandbox log fan-out | Push (gRPC stream) | Always on per sandbox subscription | CLI / TUI / SDK consumers via `WatchSandbox` and `GetSandboxLogs`; OCSF JSONL when enabled inside the sandbox. | + +Trace export is opt-in: the gateway only installs the OpenTelemetry layer +when an OTLP endpoint is supplied. Spans flush on `SIGTERM` via an explicit +`shutdown()` in the gateway shutdown path. See +[Monitoring the Gateway](../docs/kubernetes/monitoring.mdx) for the operator +guide. + ## Persistence The gateway persistence layer is a protobuf object store. 
Domain services store diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 9cba99045..2bbd21305 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -64,6 +64,12 @@ anyhow = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +# OpenTelemetry tracing export (opt-in, configured via env) +opentelemetry = { workspace = true } +opentelemetry_sdk = { workspace = true } +opentelemetry-otlp = { workspace = true } +tracing-opentelemetry = { workspace = true } + # Metrics metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 534e3da37..577a46454 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -17,7 +17,10 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; -use crate::{run_server, tracing_bus::TracingLogBus}; +use crate::{ + run_server, + tracing_bus::{OtlpTracingConfig, TracingLogBus}, +}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. /// @@ -305,6 +308,13 @@ struct RunArgs { /// Keycloak: "scope". Okta: "scp". Leave empty to disable scope enforcement. #[arg(long, env = "OPENSHELL_OIDC_SCOPES_CLAIM", default_value = "")] oidc_scopes_claim: String, + + /// OTLP/gRPC endpoint for OpenTelemetry trace export (e.g. + /// `http://jaeger-collector.observability.svc:4317`). When unset, no + /// traces are exported. The signal-specific + /// `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` takes precedence over this flag. 
+    #[arg(long, env = "OPENSHELL_OTLP_ENDPOINT")]
+    otlp_endpoint: Option<String>,
 }
 
 pub fn command() -> Command {
@@ -328,8 +338,10 @@ pub async fn run_cli() -> Result<()> {
 
 async fn run_from_args(args: RunArgs) -> Result<()> {
     let tracing_log_bus = TracingLogBus::new();
+    let otlp = OtlpTracingConfig::resolve(args.otlp_endpoint.clone());
     tracing_log_bus.install_subscriber(
         EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)),
+        otlp,
     );
 
     let bind = SocketAddr::new(args.bind_address, args.port);
diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs
index eaca911e4..beb247a17 100644
--- a/crates/openshell-server/src/lib.rs
+++ b/crates/openshell-server/src/lib.rs
@@ -324,6 +324,9 @@ pub async fn run_server(
         .await
         .map_err(|err| Error::execution(format!("gateway shutdown cleanup failed: {err}")))?;
 
+    // Flush any pending OTLP spans. No-op when OTLP export is not configured.
+    state.tracing_log_bus.shutdown();
+
     Ok(())
 }
 
diff --git a/crates/openshell-server/src/tracing_bus.rs b/crates/openshell-server/src/tracing_bus.rs
index cf168e306..34403f194 100644
--- a/crates/openshell-server/src/tracing_bus.rs
+++ b/crates/openshell-server/src/tracing_bus.rs
@@ -4,17 +4,49 @@
 //! Capture openshell-server tracing logs for streaming over gRPC.
 use std::collections::{HashMap, VecDeque};
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, Mutex, OnceLock};
 use std::time::{SystemTime, UNIX_EPOCH};
 
 use openshell_core::proto::{SandboxLogLine, SandboxStreamEvent};
 use openshell_ocsf::OCSF_TARGET;
+use opentelemetry::KeyValue;
+use opentelemetry::trace::TracerProvider;
+use opentelemetry_otlp::{SpanExporter, WithExportConfig};
+use opentelemetry_sdk::Resource;
+use opentelemetry_sdk::trace::{Sampler, SdkTracerProvider};
 use tokio::sync::broadcast;
 use tracing::{Event, Subscriber};
 use tracing_subscriber::layer::Context;
 use tracing_subscriber::prelude::*;
 use tracing_subscriber::{EnvFilter, Layer};
 
+/// OTLP tracing exporter configuration. Endpoint is the only required field;
+/// service name, resource attributes, and sampling ratio are picked up from
+/// standard `OTEL_*` env vars by the OpenTelemetry SDK.
+#[derive(Debug, Clone)]
+pub struct OtlpTracingConfig {
+    pub endpoint: String,
+}
+
+impl OtlpTracingConfig {
+    /// Resolve OTLP endpoint from (in order): the signal-specific
+    /// `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, the shared
+    /// `OTEL_EXPORTER_OTLP_ENDPOINT`, then the supplied CLI argument.
+    /// Returns `None` if no endpoint is configured.
+    pub fn resolve(arg_endpoint: Option<String>) -> Option<Self> {
+        let endpoint = std::env::var("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
+            .ok()
+            .or_else(|| std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").ok())
+            .or(arg_endpoint)
+            .map(|s| s.trim().to_string())
+            .filter(|s| !s.is_empty())?;
+        Some(Self { endpoint })
+    }
+}
+
+/// Process-wide tracer provider, retained so spans can be flushed on shutdown.
+static OTEL_TRACER_PROVIDER: OnceLock<SdkTracerProvider> = OnceLock::new();
+
 /// Bus that publishes server log lines keyed by sandbox id.
 #[derive(Debug, Clone)]
 pub struct TracingLogBus {
@@ -47,19 +79,48 @@ impl TracingLogBus {
     }
 
     /// Install a tracing subscriber that logs to stdout and publishes events into this bus.
-    pub fn install_subscriber(&self, env_filter: EnvFilter) {
-        let layer = SandboxLogLayer {
+    ///
+    /// When `otlp` is provided, an OpenTelemetry OTLP/gRPC trace exporter is attached
+    /// after the env filter so `OPENSHELL_LOG_LEVEL` continues to gate exported spans.
+    /// The `tower_http::trace::TraceLayer` per-request span set up in
+    /// `multiplex.rs` becomes the OTLP root span automatically.
+    pub fn install_subscriber(&self, env_filter: EnvFilter, otlp: Option<OtlpTracingConfig>) {
+        let bus_layer = SandboxLogLayer {
             bus: self.clone(),
             default_tail: Self::DEFAULT_TAIL,
         };
+        let otel_layer = match otlp {
+            Some(cfg) => match build_otel_layer(&cfg) {
+                Ok(layer) => Some(layer),
+                Err(err) => {
+                    eprintln!(
+                        "openshell-gateway: failed to enable OTLP trace export to {}: {err}",
+                        cfg.endpoint
+                    );
+                    None
+                }
+            },
+            None => None,
+        };
+
         tracing_subscriber::registry()
             .with(env_filter)
             .with(tracing_subscriber::fmt::layer())
-            .with(layer)
+            .with(bus_layer)
+            .with(otel_layer)
             .init();
     }
 
+    /// Flush and shut down the OTLP tracer provider, if installed. Idempotent.
+    pub fn shutdown(&self) {
+        if let Some(provider) = OTEL_TRACER_PROVIDER.get()
+            && let Err(err) = provider.shutdown()
+        {
+            tracing::warn!(error = %err, "OpenTelemetry tracer provider shutdown failed");
+        }
+    }
+
     fn sender_for(&self, sandbox_id: &str) -> broadcast::Sender<SandboxStreamEvent> {
         let mut inner = self.inner.lock().expect("tracing bus lock poisoned");
         inner
@@ -198,6 +259,70 @@ fn current_time_ms() -> Option<i64> {
     i64::try_from(now.as_millis()).ok()
 }
 
+/// Build an `OpenTelemetry` `tracing` layer that exports spans to the
+/// configured OTLP/gRPC endpoint. The resulting layer can be `with(...)`'d
+/// onto the subscriber registry.
+fn build_otel_layer<S>(
+    cfg: &OtlpTracingConfig,
+) -> Result<
+    tracing_opentelemetry::OpenTelemetryLayer<S, opentelemetry_sdk::trace::Tracer>,
+    Box<dyn std::error::Error + Send + Sync>,
+>
+where
+    S: Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>,
+{
+    let exporter = SpanExporter::builder()
+        .with_tonic()
+        .with_endpoint(&cfg.endpoint)
+        .build()?;
+
+    let resource = Resource::builder()
+        .with_service_name("openshell-gateway")
+        .with_attributes([KeyValue::new("service.version", openshell_core::VERSION)])
+        .build();
+
+    let sampler = sampler_from_env();
+
+    let provider = SdkTracerProvider::builder()
+        .with_batch_exporter(exporter)
+        .with_resource(resource)
+        .with_sampler(sampler)
+        .build();
+
+    let tracer = provider.tracer("openshell-gateway");
+
+    // Retain the provider so shutdown() can flush spans on SIGTERM.
+    let _ = OTEL_TRACER_PROVIDER.set(provider);
+
+    Ok(tracing_opentelemetry::layer().with_tracer(tracer))
+}
+
+/// Resolve a sampler from `OTEL_TRACES_SAMPLER` / `OTEL_TRACES_SAMPLER_ARG`,
+/// defaulting to `parent_based(traceidratio=1.0)` — record all spans, respect
+/// upstream parent sampling decisions.
+fn sampler_from_env() -> Sampler {
+    let ratio = std::env::var("OTEL_TRACES_SAMPLER_ARG")
+        .ok()
+        .and_then(|s| s.parse::<f64>().ok())
+        .map_or(1.0, |r| r.clamp(0.0, 1.0));
+
+    match std::env::var("OTEL_TRACES_SAMPLER")
+        .ok()
+        .as_deref()
+        .map(str::trim)
+    {
+        Some("always_on") => Sampler::AlwaysOn,
+        Some("always_off") => Sampler::AlwaysOff,
+        Some("traceidratio") => Sampler::TraceIdRatioBased(ratio),
+        Some("parentbased_always_off") => Sampler::ParentBased(Box::new(Sampler::AlwaysOff)),
+        Some("parentbased_traceidratio") => {
+            Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased(ratio)))
+        }
+        // "parentbased_always_on", unset, or unrecognized
+        _ => Sampler::ParentBased(Box::new(Sampler::AlwaysOn)),
+    }
+}
+
 fn display_level(target: &str, level: &str) -> String {
     if target == OCSF_TARGET {
         "OCSF".to_string()
@@ -387,6 +512,33 @@ mod tests {
         assert!(events.is_empty());
     }
 
+    #[test]
+    fn otlp_config_resolve_prefers_traces_endpoint_then_shared_then_arg() {
+        // Each branch is exercised in isolation to avoid env-var coupling
+        // between cases. We only assert that the non-empty value wins; the
+        // env-var precedence test would need a process-wide lock to be safe.
+        let cfg = OtlpTracingConfig::resolve(Some("http://arg:4317".into()));
+        assert!(cfg.is_some());
+        assert_eq!(cfg.unwrap().endpoint, "http://arg:4317");
+
+        let cfg = OtlpTracingConfig::resolve(Some(" ".into()));
+        assert!(cfg.is_none());
+
+        let cfg = OtlpTracingConfig::resolve(None);
+        // May be Some or None depending on inherited env; only assert that
+        // when Some, the endpoint is non-empty.
+        if let Some(c) = cfg {
+            assert!(!c.endpoint.is_empty());
+        }
+    }
+
+    #[test]
+    fn sampler_from_env_returns_a_sampler() {
+        // The function shape is documented in the function body; this test
+        // exercises construction without coupling to inherited env state.
+ let _ = sampler_from_env(); + } + #[test] fn platform_event_bus_remove_clears_tail() { let bus = PlatformEventBus::new(); diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index cc856731d..6e3687c67 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -52,6 +52,7 @@ See [`values.yaml`](values.yaml) for configurable values. Selected overlays: - [`ci/values-gateway.yaml`](ci/values-gateway.yaml) — gateway-only configuration - [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) — cert-manager integration - [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) — Keycloak OIDC integration +- [`ci/values-monitoring.yaml`](ci/values-monitoring.yaml) — Prometheus `ServiceMonitor` + OTLP traces (local-dev defaults) ## PKI bootstrap @@ -70,3 +71,12 @@ The Job is idempotent: Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager, external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the cert-manager alternative. + +## Monitoring + +The chart can opt into two independent observability surfaces: + +- `monitoring.serviceMonitor.enabled` — creates a Prometheus-Operator `ServiceMonitor` scraping the gateway's `/metrics` endpoint. Requires the `monitoring.coreos.com/v1` CRD (ships with `kube-prometheus-stack`). +- `monitoring.tracing.enabled` — projects standard `OTEL_*` env vars onto the gateway container so it exports OTLP/gRPC traces to the configured `monitoring.tracing.endpoint`. + +Both are off by default. See [Monitoring the Gateway](../../../docs/kubernetes/monitoring.mdx) for the operator guide and `mise run observability:k8s:setup` for the local-dev `kube-prometheus-stack` + Jaeger bundle. 
diff --git a/deploy/helm/openshell/ci/values-monitoring.yaml b/deploy/helm/openshell/ci/values-monitoring.yaml new file mode 100644 index 000000000..683808f7b --- /dev/null +++ b/deploy/helm/openshell/ci/values-monitoring.yaml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Local-dev overlay enabling the ServiceMonitor (scraped by kube-prometheus-stack) +# and OTLP trace export to a Jaeger all-in-one Service. +# +# Prerequisite: install the cluster monitoring add-ons one time: +# mise run observability:k8s:setup +# +# Then uncomment values-monitoring.yaml in skaffold.yaml or pass it explicitly: +# helm upgrade --install openshell . \ +# -f values.yaml -f ci/values-skaffold.yaml -f ci/values-monitoring.yaml + +monitoring: + serviceMonitor: + enabled: true + interval: 15s + scrapeTimeout: 10s + # kube-prometheus-stack's default Prometheus instance selects ServiceMonitors + # that carry `release: kube-prometheus-stack`. The setup task installs the + # bundle under that release name. + labels: + release: kube-prometheus-stack + tracing: + enabled: true + # Jaeger all-in-one OTLP/gRPC receiver, installed by the setup task. + endpoint: "http://jaeger-collector.observability.svc.cluster.local:4317" + protocol: grpc + samplingRatio: "1.0" + resourceAttributes: + deployment.environment: dev diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index 2de9ee4e6..80366ecf4 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -81,6 +81,30 @@ deploy: # # wait ensures Gateway API CRDs are registered before the openshell # # release attempts to create Gateway and HTTPRoute resources. # wait: true + # Monitoring add-ons — comment in along with ci/values-monitoring.yaml + # below to scrape the gateway from Prometheus and export traces to Jaeger. 
+ # Prefer running the dedicated mise task (`mise run observability:k8s:setup`) + # for the initial install; these blocks are kept for parity. + #- name: kube-prometheus-stack + # repo: https://prometheus-community.github.io/helm-charts + # remoteChart: kube-prometheus-stack + # version: 75.0.0 + # namespace: monitoring + # createNamespace: true + # wait: true + #- name: jaeger + # repo: https://jaegertracing.github.io/helm-charts + # remoteChart: jaeger + # version: 3.4.0 + # namespace: observability + # createNamespace: true + # setValues: + # allInOne.enabled: true + # storage.type: memory + # provisionDataStore.cassandra: false + # agent.enabled: false + # collector.enabled: false + # query.enabled: false - name: openshell chartPath: . namespace: openshell @@ -97,6 +121,10 @@ deploy: #- ci/values-keycloak.yaml # To enable the Gateway API HTTPRoute (requires Envoy Gateway above): #- ci/values-gateway.yaml + # To enable Prometheus scraping + OTLP traces → Jaeger: + # mise run observability:k8s:setup + # then uncomment the line below. + #- ci/values-monitoring.yaml setValueTemplates: image.repository: '{{.IMAGE_REPO_openshell_gateway}}' image.tag: '{{.IMAGE_TAG_openshell_gateway}}' diff --git a/deploy/helm/openshell/templates/_helpers.tpl b/deploy/helm/openshell/templates/_helpers.tpl index 93eff90a9..4a4e69a2d 100644 --- a/deploy/helm/openshell/templates/_helpers.tpl +++ b/deploy/helm/openshell/templates/_helpers.tpl @@ -97,3 +97,14 @@ override. {{- printf "%s://%s.%s.svc.cluster.local:%d" $scheme (include "openshell.fullname" .) .Release.Namespace (int .Values.service.port) -}} {{- end -}} {{- end }} + +{{/* +Render the user-supplied monitoring.tracing.resourceAttributes map as a +comma-prefixed `key=value` list suitable for appending to OTEL_RESOURCE_ATTRIBUTES. +Returns an empty string when no attributes are configured. +*/}} +{{- define "openshell.tracingResourceAttributes" -}} +{{- with .Values.monitoring.tracing.resourceAttributes }} +{{- range $k, $v := . 
}},{{ $k }}={{ $v }}{{- end }} +{{- end }} +{{- end }} diff --git a/deploy/helm/openshell/templates/servicemonitor.yaml b/deploy/helm/openshell/templates/servicemonitor.yaml new file mode 100644 index 000000000..8eb356576 --- /dev/null +++ b/deploy/helm/openshell/templates/servicemonitor.yaml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.monitoring.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "openshell.fullname" . }} + namespace: {{ default .Release.Namespace .Values.monitoring.serviceMonitor.namespace }} + labels: + {{- include "openshell.labels" . | nindent 4 }} + {{- with .Values.monitoring.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + selector: + matchLabels: + {{- include "openshell.selectorLabels" . 
| nindent 6 }} + endpoints: + - port: metrics + path: /metrics + interval: {{ .Values.monitoring.serviceMonitor.interval }} + scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout }} +{{- end }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 2d3f731af..6e5c66a3b 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -140,6 +140,23 @@ spec: value: {{ .Values.server.oidc.scopesClaim | quote }} {{- end }} {{- end }} + {{- if and .Values.monitoring .Values.monitoring.tracing.enabled }} + - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + value: {{ required "monitoring.tracing.endpoint is required when monitoring.tracing.enabled is true" .Values.monitoring.tracing.endpoint | quote }} + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: {{ .Values.monitoring.tracing.protocol | quote }} + - name: OTEL_SERVICE_NAME + value: "openshell-gateway" + - name: OTEL_TRACES_SAMPLER + value: "parentbased_traceidratio" + - name: OTEL_TRACES_SAMPLER_ARG + value: {{ .Values.monitoring.tracing.samplingRatio | quote }} + - name: OTEL_RESOURCE_ATTRIBUTES + value: {{ printf "service.namespace=%s,service.version=%s%s" + .Release.Namespace + .Chart.AppVersion + (include "openshell.tracingResourceAttributes" .) | quote }} + {{- end }} volumeMounts: - name: openshell-data mountPath: /var/openshell diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 7630554f2..8e7efeb19 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -229,3 +229,39 @@ grpcRoute: protocol: HTTP # "Same" restricts attached routes to the release namespace; "All" allows any namespace. allowedRoutes: Same + +# Observability: Prometheus ServiceMonitor and OpenTelemetry trace export. +# Both subsections are independent and disabled by default. 
The chart never +# bundles a monitoring stack; operators run kube-prometheus-stack and Jaeger +# (or any OTLP backend) themselves and point the gateway at them here. +monitoring: + serviceMonitor: + # Create a Prometheus-Operator ServiceMonitor scraping the gateway's + # /metrics endpoint. Requires the monitoring.coreos.com/v1 CRDs. + enabled: false + interval: 30s + scrapeTimeout: 10s + # Extra labels added to the ServiceMonitor (commonly required to match + # the Prometheus instance's serviceMonitorSelector — kube-prometheus-stack + # defaults to selecting on `release: `). + labels: {} + # Namespace where the ServiceMonitor is created. Empty = release namespace. + namespace: "" + tracing: + # Project OTEL_* env vars onto the gateway container so the in-process + # OTLP exporter starts up. The gateway exports OTLP/gRPC; Helm currently + # only supports the gRPC protocol. + enabled: false + # OTLP/gRPC collector endpoint (host:port or full URL). Required when + # tracing.enabled is true. + # e.g. http://jaeger-collector.observability.svc:4317 + endpoint: "" + # OTLP transport. Currently only "grpc" is supported by the gateway. + protocol: grpc + # Trace sampling. parent_based(traceidratio=...) is the recommended + # production default — record according to the upstream parent's + # decision, fall back to the configured ratio for new traces. + samplingRatio: "1.0" + # Extra resource attributes appended to OTEL_RESOURCE_ATTRIBUTES, + # e.g. {deployment.environment: production}. + resourceAttributes: {} diff --git a/docs/kubernetes/monitoring.mdx b/docs/kubernetes/monitoring.mdx new file mode 100644 index 000000000..4bd0fd440 --- /dev/null +++ b/docs/kubernetes/monitoring.mdx @@ -0,0 +1,135 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Monitoring the Gateway" +sidebar-title: "Monitoring" +description: "Scrape the OpenShell gateway with Prometheus and export traces over OTLP from a Helm-managed deployment." +keywords: "Generative AI, Cybersecurity, Monitoring, Prometheus, OpenTelemetry, OTLP, Jaeger, Grafana, Helm, Kubernetes" +position: 6 +--- + +The OpenShell gateway exposes two telemetry surfaces operators usually plug directly into a cluster monitoring stack: + +- **Prometheus metrics** at `/metrics` on a dedicated port — scraped over HTTP. +- **OpenTelemetry traces** exported over OTLP/gRPC — pushed to a collector or backend. + +Both are off by default in the chart. This page enables them on a real cluster, and shows the local-dev path that bundles `kube-prometheus-stack` and Jaeger. + + +For sandbox-internal logs (OCSF), see [Observability → Sandbox Logging](/observability/logging). That surface is independent of the gateway control-plane telemetry described here. + + +## Prerequisites + +| Prerequisite | Required for | Notes | +|---|---|---| +| Prometheus Operator | Metrics | The chart creates a `monitoring.coreos.com/v1` `ServiceMonitor`. The CRD ships with `kube-prometheus-stack`. | +| OTLP-compatible backend | Traces | Jaeger, Tempo, an OpenTelemetry Collector, or any vendor backend that accepts OTLP/gRPC on `:4317`. | +| Helm chart values access | Both | Either `--set` flags on the install command, or a values file passed via `-f`. | + +## Enable on the Helm release + +Add a `monitoring` block to your values file: + +```yaml +monitoring: + serviceMonitor: + enabled: true + interval: 30s + # Match the Prometheus instance's serviceMonitorSelector. The + # kube-prometheus-stack default selector is `release: `. 
+ labels: + release: kube-prometheus-stack + tracing: + enabled: true + endpoint: "http://otel-collector.observability.svc.cluster.local:4317" + protocol: grpc + samplingRatio: "1.0" + resourceAttributes: + deployment.environment: production +``` + +Then upgrade the release: + +```shell +helm upgrade --install openshell oci://ghcr.io/nvidia/openshell/charts/openshell \ + -f values.yaml +``` + +When `monitoring.tracing.enabled` is `true`, the chart projects the standard `OTEL_*` env vars onto the gateway container. The gateway initializes the OTLP exporter at startup and flushes spans on `SIGTERM`. + + +The gateway currently exports OTLP over **gRPC only**. Setting `monitoring.tracing.protocol` to anything other than `grpc` is not supported and the value is ignored at the gateway. + + +## Verify it works + +### Metrics + +Confirm the `ServiceMonitor` was created and Prometheus picked it up: + +```shell +kubectl get servicemonitor -n +kubectl exec -n monitoring deploy/prometheus-server -- \ + promtool query instant http://localhost:9090 'up{job="openshell"}' +``` + +Then query a metric the gateway exports while you exercise the API: + +```shell +openshell sandbox create --name probe +``` + +```promql +rate(openshell_server_grpc_requests_total[1m]) +``` + +### Traces + +Create a sandbox via the CLI to drive at least one inbound request, then look in your trace UI for the `openshell-gateway` service. You should see a `request` span with `method`, `path`, and `request_id` attributes. + +```shell +openshell sandbox create --name trace-probe +``` + +In Jaeger UI: select **Service: openshell-gateway** and **Find Traces**. In Grafana Tempo / OTLP backends: query `service.name="openshell-gateway"`. + +## Local development + +The repo ships a one-shot mise task that installs `kube-prometheus-stack` (slimmed-down: no Alertmanager, node-exporter, or kube-state) and a Jaeger all-in-one Pod into the local k3s cluster. + +```shell +# 1. 
Bring up the cluster (skip if already running): +mise run helm:k3s:create + +# 2. Install Prometheus + Grafana + Jaeger: +mise run observability:k8s:setup + +# 3. Open ci/values-monitoring.yaml on the openshell release: +# Uncomment `- ci/values-monitoring.yaml` in deploy/helm/openshell/skaffold.yaml + +# 4. Deploy / restart the openshell release: +mise run helm:skaffold:dev + +# 5. Forward UIs to localhost: +mise run observability:port-forward +``` + +Then visit: + +- Grafana — http://localhost:3000 (admin / admin) +- Prometheus — http://localhost:9090 +- Jaeger UI — http://localhost:16686 + +Tear it all down with: + +```shell +mise run observability:k8s:teardown +``` + +## Production guidance + +- **OpenTelemetry Collector** is the recommended pattern in front of multi-backend trace pipelines. Point `monitoring.tracing.endpoint` at the Collector, then route from the Collector to Jaeger / Tempo / your vendor. +- **Sampling**: `samplingRatio: "1.0"` (record everything) is fine for low-traffic gateways. Drop to `"0.1"` or lower for high-throughput deployments. The chart sets `parentbased_traceidratio` so upstream parent decisions are honored when present. +- **Resource attributes**: Add `deployment.environment`, `cluster.name`, or other identifiers via `monitoring.tracing.resourceAttributes`. They appear on every span as part of the OTel Resource. +- **Prometheus selector**: The chart-rendered `ServiceMonitor` carries the standard chart labels. Most Prometheus instances also require an explicit selector label (often `release: `) — set it via `monitoring.serviceMonitor.labels`. diff --git a/docs/observability/overview.mdx b/docs/observability/overview.mdx index 69fde7a06..d548c5374 100644 --- a/docs/observability/overview.mdx +++ b/docs/observability/overview.mdx @@ -15,3 +15,7 @@ This section covers: - **[Sandbox Logging](/observability/logging)**: How the two log formats work, where logs are stored, and how to read them. 
- **[Accessing Logs](/observability/accessing-logs)**: How to view logs through the CLI, TUI, and directly on the sandbox filesystem. - **[OCSF JSON Export](/observability/ocsf-json-export)**: How to enable full OCSF JSON output for integration with SIEMs, log aggregators, and compliance tools. + +## Sandbox logs vs. gateway telemetry + +Sandbox-internal logs (the OCSF surface above) are distinct from the gateway control-plane telemetry — Prometheus metrics on `/metrics` and OpenTelemetry traces over OTLP. The two pipelines are independent and configured separately. For scraping the gateway from Prometheus or shipping traces to Jaeger / Tempo / an OTel Collector, see [Monitoring the Gateway](/kubernetes/monitoring). diff --git a/tasks/observability.toml b/tasks/observability.toml new file mode 100644 index 000000000..969bfaaaa --- /dev/null +++ b/tasks/observability.toml @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Cluster observability add-ons (Prometheus + Grafana + Jaeger). +# These tasks target the local k3s cluster created by `mise run helm:k3s:create`. +# Production users install kube-prometheus-stack and an OTLP backend separately +# and point the gateway at them via Helm values; see docs/kubernetes/monitoring.mdx. 
+ +["observability:k8s:setup"] +description = "Install kube-prometheus-stack + Jaeger all-in-one into the local k3s cluster (one-time)" +run = "tasks/scripts/observability-k8s-setup.sh" + +["observability:k8s:teardown"] +description = "Remove kube-prometheus-stack and Jaeger from the local k3s cluster" +run = "tasks/scripts/observability-k8s-teardown.sh" + +["observability:port-forward"] +description = "Port-forward Grafana (3000), Prometheus (9090), and Jaeger UI (16686) until interrupted" +run = "tasks/scripts/observability-port-forward.sh" diff --git a/tasks/scripts/observability-k8s-setup.sh b/tasks/scripts/observability-k8s-setup.sh new file mode 100755 index 000000000..3e31bc47b --- /dev/null +++ b/tasks/scripts/observability-k8s-setup.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# One-time install of kube-prometheus-stack + Jaeger all-in-one into the local +# k3s cluster created by `mise run helm:k3s:create`. +# +# - kube-prometheus-stack provides Prometheus, Grafana, and the +# ServiceMonitor/PodMonitor CRDs the openshell chart uses. +# - Jaeger all-in-one provides an OTLP/gRPC receiver (:4317) and UI (:16686). +# +# Re-running is safe; both releases use `helm upgrade --install`. +# +# Usage: +# mise run observability:k8s:setup +# +# After setup, enable monitoring on the openshell release by uncommenting +# `ci/values-monitoring.yaml` in `deploy/helm/openshell/skaffold.yaml`, then +# rerun skaffold. 
+ +set -euo pipefail + +MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" +OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" +PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" +PROMSTACK_VERSION="${PROMSTACK_VERSION:-75.0.0}" +JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" +JAEGER_VERSION="${JAEGER_VERSION:-3.4.0}" +HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-180}" + +# --------------------------------------------------------------------------- +# Helm repos +# --------------------------------------------------------------------------- + +echo "Adding Helm repos..." +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true +helm repo add jaegertracing https://jaegertracing.github.io/helm-charts >/dev/null 2>&1 || true +helm repo update prometheus-community jaegertracing >/dev/null + +# --------------------------------------------------------------------------- +# kube-prometheus-stack +# --------------------------------------------------------------------------- +# +# Slimmed-down install: keep Prometheus + Grafana + Operator (the parts the +# openshell chart's ServiceMonitor needs), drop Alertmanager and the +# node/kube-state metrics exporters to keep k3d resource usage down. Real +# clusters get the full bundle via the published docs. + +echo "Installing ${PROMSTACK_RELEASE} into namespace ${MONITORING_NAMESPACE}..." 
+helm upgrade --install "${PROMSTACK_RELEASE}" prometheus-community/kube-prometheus-stack \ + --version "${PROMSTACK_VERSION}" \ + --namespace "${MONITORING_NAMESPACE}" \ + --create-namespace \ + --set alertmanager.enabled=false \ + --set nodeExporter.enabled=false \ + --set kubeStateMetrics.enabled=false \ + --set grafana.adminPassword=admin \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --wait --timeout "${HEALTH_TIMEOUT}s" + +# --------------------------------------------------------------------------- +# Jaeger all-in-one +# --------------------------------------------------------------------------- + +echo "Installing ${JAEGER_RELEASE} into namespace ${OBSERVABILITY_NAMESPACE}..." +helm upgrade --install "${JAEGER_RELEASE}" jaegertracing/jaeger \ + --version "${JAEGER_VERSION}" \ + --namespace "${OBSERVABILITY_NAMESPACE}" \ + --create-namespace \ + --set allInOne.enabled=true \ + --set storage.type=memory \ + --set provisionDataStore.cassandra=false \ + --set agent.enabled=false \ + --set collector.enabled=false \ + --set query.enabled=false \ + --wait --timeout "${HEALTH_TIMEOUT}s" + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- + +echo "" +echo "Cluster monitoring stack is ready." +echo "" +echo " Grafana: http://localhost:3000 (admin / admin)" +echo " Prometheus: http://localhost:9090" +echo " Jaeger UI: http://localhost:16686" +echo "" +echo " Start port-forwards: mise run observability:port-forward" +echo "" +echo " Enable on the openshell release:" +echo " 1. Uncomment 'ci/values-monitoring.yaml' in deploy/helm/openshell/skaffold.yaml" +echo " 2. 
mise run helm:skaffold:dev" +echo "" +echo " Teardown: mise run observability:k8s:teardown" +echo "" diff --git a/tasks/scripts/observability-k8s-teardown.sh b/tasks/scripts/observability-k8s-teardown.sh new file mode 100755 index 000000000..f9b3ee501 --- /dev/null +++ b/tasks/scripts/observability-k8s-teardown.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Remove the local cluster monitoring add-ons installed by +# observability-k8s-setup.sh. +# +# Usage: +# mise run observability:k8s:teardown + +set -euo pipefail + +MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" +OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" +PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" +JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" + +echo "Uninstalling ${PROMSTACK_RELEASE} from ${MONITORING_NAMESPACE}..." +helm uninstall "${PROMSTACK_RELEASE}" --namespace "${MONITORING_NAMESPACE}" --ignore-not-found + +echo "Uninstalling ${JAEGER_RELEASE} from ${OBSERVABILITY_NAMESPACE}..." +helm uninstall "${JAEGER_RELEASE}" --namespace "${OBSERVABILITY_NAMESPACE}" --ignore-not-found + +echo "Deleting namespaces..." +kubectl delete namespace "${MONITORING_NAMESPACE}" "${OBSERVABILITY_NAMESPACE}" --ignore-not-found + +echo "Done." diff --git a/tasks/scripts/observability-port-forward.sh b/tasks/scripts/observability-port-forward.sh new file mode 100755 index 000000000..4a6ed761d --- /dev/null +++ b/tasks/scripts/observability-port-forward.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Background port-forwards for Grafana, Prometheus, and the Jaeger UI. +# Runs until interrupted; trap ensures the kubectl background processes are +# cleaned up on Ctrl+C / SIGTERM. 
+# +# Usage: +# mise run observability:port-forward + +set -euo pipefail + +MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" +OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" +PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" +JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" + +GRAFANA_LOCAL_PORT="${GRAFANA_LOCAL_PORT:-3000}" +PROMETHEUS_LOCAL_PORT="${PROMETHEUS_LOCAL_PORT:-9090}" +JAEGER_UI_LOCAL_PORT="${JAEGER_UI_LOCAL_PORT:-16686}" + +PIDS=() + +cleanup() { + if [[ ${#PIDS[@]} -gt 0 ]]; then + echo "" + echo "Stopping port-forwards..." + kill "${PIDS[@]}" 2>/dev/null || true + wait "${PIDS[@]}" 2>/dev/null || true + fi +} +trap cleanup EXIT INT TERM + +forward() { + local namespace="$1" + local target="$2" + local local_port="$3" + local remote_port="$4" + kubectl --namespace "${namespace}" port-forward "${target}" \ + "${local_port}:${remote_port}" >/dev/null 2>&1 & + PIDS+=("$!") +} + +echo "Starting port-forwards..." +forward "${MONITORING_NAMESPACE}" "svc/${PROMSTACK_RELEASE}-grafana" "${GRAFANA_LOCAL_PORT}" 80 +forward "${MONITORING_NAMESPACE}" "svc/${PROMSTACK_RELEASE}-prometheus" "${PROMETHEUS_LOCAL_PORT}" 9090 +forward "${OBSERVABILITY_NAMESPACE}" "svc/${JAEGER_RELEASE}-query" "${JAEGER_UI_LOCAL_PORT}" 16686 + +echo "" +echo " Grafana: http://localhost:${GRAFANA_LOCAL_PORT} (admin / admin)" +echo " Prometheus: http://localhost:${PROMETHEUS_LOCAL_PORT}" +echo " Jaeger UI: http://localhost:${JAEGER_UI_LOCAL_PORT}" +echo "" +echo "Press Ctrl+C to stop." + +# Block until any forwarder exits or signal is received. 
+wait -n From c6463bfee547a1eb548b2ac29591e0323a608f57 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 8 May 2026 11:00:57 -0700 Subject: [PATCH 4/4] refactor(observability): extract local-dev Helm values into separate files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kube-prometheus-stack and Jaeger releases were configured via long chains of `--set` flags, which obscure the configuration and make the script hard to extend. Extract them into two checked-in values files the setup script consumes via `--values`. - tasks/scripts/observability-prometheus-values.yaml — slim chart config plus Grafana auto-provisioning of a Jaeger datasource (stable uid so dashboards can reference it). - tasks/scripts/observability-jaeger-values.yaml — all-in-one Jaeger. - PROMSTACK_VALUES and JAEGER_VALUES env vars allow pointing at custom files for local experimentation. --- .../scripts/observability-jaeger-values.yaml | 25 +++++++++++++ tasks/scripts/observability-k8s-setup.sh | 17 ++++----- .../observability-prometheus-values.yaml | 35 +++++++++++++++++++ 3 files changed, 66 insertions(+), 11 deletions(-) create mode 100644 tasks/scripts/observability-jaeger-values.yaml create mode 100644 tasks/scripts/observability-prometheus-values.yaml diff --git a/tasks/scripts/observability-jaeger-values.yaml b/tasks/scripts/observability-jaeger-values.yaml new file mode 100644 index 000000000..f89576f3f --- /dev/null +++ b/tasks/scripts/observability-jaeger-values.yaml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Helm values for the local-dev jaegertracing/jaeger install. +# Consumed by tasks/scripts/observability-k8s-setup.sh. +# +# All-in-one mode: single pod with in-memory storage, OTLP/gRPC receiver on +# :4317, OTLP/HTTP on :4318, UI on :16686. 
Enough for dashboards and trace +# inspection during dev; nothing persists across pod restarts. + +allInOne: + enabled: true + +storage: + type: memory + +provisionDataStore: + cassandra: false + +agent: + enabled: false +collector: + enabled: false +query: + enabled: false diff --git a/tasks/scripts/observability-k8s-setup.sh b/tasks/scripts/observability-k8s-setup.sh index 3e31bc47b..c771ad157 100755 --- a/tasks/scripts/observability-k8s-setup.sh +++ b/tasks/scripts/observability-k8s-setup.sh @@ -20,12 +20,16 @@ set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" PROMSTACK_VERSION="${PROMSTACK_VERSION:-75.0.0}" +PROMSTACK_VALUES="${PROMSTACK_VALUES:-${SCRIPT_DIR}/observability-prometheus-values.yaml}" JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" JAEGER_VERSION="${JAEGER_VERSION:-3.4.0}" +JAEGER_VALUES="${JAEGER_VALUES:-${SCRIPT_DIR}/observability-jaeger-values.yaml}" HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-180}" # --------------------------------------------------------------------------- @@ -51,11 +55,7 @@ helm upgrade --install "${PROMSTACK_RELEASE}" prometheus-community/kube-promethe --version "${PROMSTACK_VERSION}" \ --namespace "${MONITORING_NAMESPACE}" \ --create-namespace \ - --set alertmanager.enabled=false \ - --set nodeExporter.enabled=false \ - --set kubeStateMetrics.enabled=false \ - --set grafana.adminPassword=admin \ - --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --values "${PROMSTACK_VALUES}" \ --wait --timeout "${HEALTH_TIMEOUT}s" # --------------------------------------------------------------------------- @@ -67,12 +67,7 @@ helm upgrade --install "${JAEGER_RELEASE}" jaegertracing/jaeger \ --version "${JAEGER_VERSION}" \ --namespace "${OBSERVABILITY_NAMESPACE}" \ --create-namespace \ - --set 
allInOne.enabled=true \
-  --set storage.type=memory \
-  --set provisionDataStore.cassandra=false \
-  --set agent.enabled=false \
-  --set collector.enabled=false \
-  --set query.enabled=false \
+  --values "${JAEGER_VALUES}" \
   --wait --timeout "${HEALTH_TIMEOUT}s"
 
 # ---------------------------------------------------------------------------
diff --git a/tasks/scripts/observability-prometheus-values.yaml b/tasks/scripts/observability-prometheus-values.yaml
new file mode 100644
index 000000000..5d4813712
--- /dev/null
+++ b/tasks/scripts/observability-prometheus-values.yaml
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Helm values for the local-dev kube-prometheus-stack install.
+# Consumed by tasks/scripts/observability-k8s-setup.sh.
+#
+# Slimmed down for k3d: drops Alertmanager + node/kube-state exporters.
+# Real-cluster operators install the bundle separately and follow the
+# docs/kubernetes/monitoring.mdx guide.
+
+alertmanager:
+  enabled: false
+nodeExporter:
+  enabled: false
+kubeStateMetrics:
+  enabled: false
+
+prometheus:
+  prometheusSpec:
+    # Don't restrict to ServiceMonitors carrying the Helm release label —
+    # the openshell chart sets its own selector via monitoring.serviceMonitor.labels.
+    serviceMonitorSelectorNilUsesHelmValues: false
+
+grafana:
+  adminPassword: admin
+  # Auto-provision Jaeger as a trace datasource. The Prometheus datasource is
+  # added by the chart automatically.
+  additionalDataSources:
+    - name: Jaeger
+      uid: jaeger
+      type: jaeger
+      access: proxy
+      # Service created by the Jaeger release configured via observability-jaeger-values.yaml.
+      url: http://jaeger-query.observability.svc.cluster.local:16686
+      editable: true