From f3b7f5fb9e7b73f3d8fb4059950efac0b722b139 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Wed, 6 May 2026 18:13:19 -0700 Subject: [PATCH 1/4] test(helm): Add kube gateway e2e tests Signed-off-by: Taylor Mutch --- e2e/rust/e2e-helm.sh | 20 ++ e2e/rust/src/harness/driver.rs | 20 ++ e2e/rust/src/harness/mod.rs | 1 + e2e/rust/tests/forward_proxy_graphql_l7.rs | 4 + e2e/rust/tests/forward_proxy_l7_bypass.rs | 7 + e2e/rust/tests/host_gateway_alias.rs | 10 + e2e/with-kube-gateway.sh | 220 +++++++++++++++++++++ tasks/test.toml | 4 + 8 files changed, 286 insertions(+) create mode 100755 e2e/rust/e2e-helm.sh create mode 100644 e2e/rust/src/harness/driver.rs create mode 100755 e2e/with-kube-gateway.sh diff --git a/e2e/rust/e2e-helm.sh b/e2e/rust/e2e-helm.sh new file mode 100755 index 000000000..7d7042c47 --- /dev/null +++ b/e2e/rust/e2e-helm.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run a Rust e2e test against a Helm-deployed OpenShell gateway. Set +# OPENSHELL_E2E_KUBE_CONTEXT to target an existing cluster; otherwise an +# ephemeral k3d cluster is created and torn down by with-kube-gateway.sh. + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +E2E_TEST="${OPENSHELL_E2E_KUBE_TEST:-smoke}" + +cargo build -p openshell-cli --features openshell-core/dev-settings + +exec "${ROOT}/e2e/with-kube-gateway.sh" \ + cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ + --features e2e \ + --test "${E2E_TEST}" \ + -- --nocapture diff --git a/e2e/rust/src/harness/driver.rs b/e2e/rust/src/harness/driver.rs new file mode 100644 index 000000000..07921e461 --- /dev/null +++ b/e2e/rust/src/harness/driver.rs @@ -0,0 +1,20 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! 
Active compute-driver detection for tests with driver-specific assumptions. + +/// Returns true and prints a skip notice when running against the kube driver. +/// +/// Tests that depend on docker/podman host-network features (e.g. +/// `host.openshell.internal` reachability, sibling-container test servers) +/// can early-return when this is true. +pub fn skip_if_kube(reason: &str) -> bool { + if matches!( + std::env::var("OPENSHELL_E2E_DRIVER").as_deref(), + Ok("kubernetes") + ) { + eprintln!("skipping on kubernetes driver: {reason}"); + return true; + } + false +} diff --git a/e2e/rust/src/harness/mod.rs b/e2e/rust/src/harness/mod.rs index 5feb21c70..89a095548 100644 --- a/e2e/rust/src/harness/mod.rs +++ b/e2e/rust/src/harness/mod.rs @@ -5,6 +5,7 @@ pub mod binary; pub mod container; +pub mod driver; pub mod gateway; pub mod output; pub mod port; diff --git a/e2e/rust/tests/forward_proxy_graphql_l7.rs b/e2e/rust/tests/forward_proxy_graphql_l7.rs index aeb3648b0..bfc561a20 100644 --- a/e2e/rust/tests/forward_proxy_graphql_l7.rs +++ b/e2e/rust/tests/forward_proxy_graphql_l7.rs @@ -13,6 +13,7 @@ use std::io::Write; use openshell_e2e::harness::container::ContainerHttpServer; +use openshell_e2e::harness::driver::skip_if_kube; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; @@ -131,6 +132,9 @@ network_policies: #[tokio::test] #[allow(clippy::too_many_lines)] async fn graphql_l7_enforces_allow_and_deny_rules_on_forward_and_connect_paths() { + if skip_if_kube("uses host.openshell.internal to reach a sibling container") { + return; + } let server = start_test_server().await.expect("start test server"); let policy = write_graphql_policy(&server.host, server.port).expect("write custom policy"); let policy_path = policy diff --git a/e2e/rust/tests/forward_proxy_l7_bypass.rs b/e2e/rust/tests/forward_proxy_l7_bypass.rs index 6cbaca1eb..1d3f872d0 100644 --- a/e2e/rust/tests/forward_proxy_l7_bypass.rs +++ 
b/e2e/rust/tests/forward_proxy_l7_bypass.rs @@ -11,6 +11,7 @@ use std::io::Write; use openshell_e2e::harness::container::ContainerHttpServer; +use openshell_e2e::harness::driver::skip_if_kube; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; @@ -98,6 +99,9 @@ network_policies: /// GET /allowed should succeed — the L7 policy explicitly allows it. #[tokio::test] async fn forward_proxy_allows_l7_permitted_request() { + if skip_if_kube("uses host.openshell.internal to reach a sibling container") { + return; + } let server = start_test_server().await.expect("start test server"); let policy = write_policy_with_l7_rules(&server.host, server.port).expect("write custom policy"); @@ -138,6 +142,9 @@ except Exception as e: /// POST /allowed should be denied — the L7 policy only allows GET. #[tokio::test] async fn forward_proxy_denies_l7_blocked_request() { + if skip_if_kube("uses host.openshell.internal to reach a sibling container") { + return; + } let server = start_test_server().await.expect("start test server"); let policy = write_policy_with_l7_rules(&server.host, server.port).expect("write custom policy"); diff --git a/e2e/rust/tests/host_gateway_alias.rs b/e2e/rust/tests/host_gateway_alias.rs index 2dbdbf1dc..8e58a3de1 100644 --- a/e2e/rust/tests/host_gateway_alias.rs +++ b/e2e/rust/tests/host_gateway_alias.rs @@ -8,6 +8,7 @@ use std::process::Stdio; use std::sync::Mutex; use openshell_e2e::harness::binary::openshell_cmd; +use openshell_e2e::harness::driver::skip_if_kube; use openshell_e2e::harness::sandbox::SandboxGuard; use tempfile::NamedTempFile; use tokio::io::AsyncReadExt; @@ -190,6 +191,9 @@ network_policies: #[tokio::test] async fn sandbox_reaches_host_openshell_internal_via_host_gateway_alias() { + if skip_if_kube("requires host.openshell.internal alias") { + return; + } let server = HostServer::start(r#"{"message":"hello-from-host"}"#) .await .expect("start host echo server"); @@ -225,6 +229,9 @@ async fn 
sandbox_reaches_host_openshell_internal_via_host_gateway_alias() { #[tokio::test] async fn sandbox_inference_local_routes_to_host_openshell_internal() { + if skip_if_kube("requires host.openshell.internal alias") { + return; + } let _inference_lock = INFERENCE_ROUTE_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); @@ -301,6 +308,9 @@ async fn sandbox_inference_local_routes_to_host_openshell_internal() { #[tokio::test] async fn inference_set_supports_no_verify_for_unreachable_endpoint() { + if skip_if_kube("uses host.openshell.internal as the unreachable target") { + return; + } let _inference_lock = INFERENCE_ROUTE_LOCK .lock() .unwrap_or_else(std::sync::PoisonError::into_inner); diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh new file mode 100755 index 000000000..d316876e6 --- /dev/null +++ b/e2e/with-kube-gateway.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Run an e2e command against a Helm-deployed OpenShell gateway in Kubernetes. +# +# Modes: +# - OPENSHELL_E2E_KUBE_CONTEXT set: +# Target the named kubectl context, install the chart into an ephemeral +# namespace, and port-forward the gateway. Cluster lifecycle is the +# caller's responsibility (e.g. CI provisions kind via helm/kind-action). +# - OPENSHELL_E2E_KUBE_CONTEXT unset: +# Create a local k3d cluster via tasks/scripts/helm-k3s-local.sh, install +# the chart, port-forward, and tear the cluster down on exit. +# +# Helm e2e currently uses plaintext gateway traffic (ci/values-tls-disabled.yaml). +# +# Image source: helm install pulls from ${OPENSHELL_REGISTRY}/{gateway,supervisor}:${IMAGE_TAG} +# (defaults: ghcr.io/nvidia/openshell, latest). 
CI sets IMAGE_TAG to the commit SHA; +# local devs should set it to a tag pulled from a registry the cluster can reach, +# or build and import images via a separate bootstrap step before running this script. + +set -euo pipefail + +if [ "$#" -eq 0 ]; then + echo "Usage: e2e/with-kube-gateway.sh [args...]" >&2 + exit 2 +fi + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +# shellcheck source=e2e/support/gateway-common.sh +source "${ROOT}/e2e/support/gateway-common.sh" + +WORKDIR_PARENT="${TMPDIR:-/tmp}" +WORKDIR_PARENT="${WORKDIR_PARENT%/}" +WORKDIR="$(mktemp -d "${WORKDIR_PARENT}/openshell-e2e-kube.XXXXXX")" + +CLUSTER_CREATED_BY_US=0 +CLUSTER_NAME="" +KUBE_CONTEXT="" +NAMESPACE="openshell" +RELEASE_NAME="openshell" +PORTFORWARD_PID="" +PORTFORWARD_LOG="${WORKDIR}/portforward.log" +HELM_INSTALLED=0 + +# Isolate CLI/SDK gateway metadata from the developer's real config. +export XDG_CONFIG_HOME="${WORKDIR}/config" +export XDG_DATA_HOME="${WORKDIR}/data" + +kctl() { + kubectl --context "${KUBE_CONTEXT}" "$@" +} + +helmctl() { + helm --kube-context "${KUBE_CONTEXT}" "$@" +} + +cleanup() { + local exit_code=$? 
+ + if [ -n "${PORTFORWARD_PID}" ]; then + kill "${PORTFORWARD_PID}" >/dev/null 2>&1 || true + wait "${PORTFORWARD_PID}" >/dev/null 2>&1 || true + fi + + if [ "${exit_code}" -ne 0 ] && [ -n "${KUBE_CONTEXT}" ] && [ -n "${NAMESPACE}" ]; then + if command -v kubectl >/dev/null 2>&1 \ + && kctl get namespace "${NAMESPACE}" >/dev/null 2>&1; then + echo "=== gateway pod state (preserved for debugging) ===" + kctl -n "${NAMESPACE}" get pods -o wide 2>&1 || true + echo "=== gateway events ===" + kctl -n "${NAMESPACE}" get events --sort-by=.lastTimestamp 2>&1 \ + | tail -n 80 || true + echo "=== gateway logs (last 200 lines) ===" + kctl -n "${NAMESPACE}" logs \ + -l "app.kubernetes.io/instance=${RELEASE_NAME}" --tail=200 \ + --all-containers --prefix 2>&1 || true + echo "=== end gateway debug output ===" + fi + if [ -f "${PORTFORWARD_LOG}" ]; then + echo "=== port-forward log ===" + cat "${PORTFORWARD_LOG}" || true + echo "=== end port-forward log ===" + fi + fi + + if [ "${HELM_INSTALLED}" = "1" ] && [ -n "${KUBE_CONTEXT}" ] && [ -n "${NAMESPACE}" ]; then + if command -v helm >/dev/null 2>&1; then + helmctl uninstall "${RELEASE_NAME}" --namespace "${NAMESPACE}" --wait \ + --timeout 60s >/dev/null 2>&1 || true + fi + if command -v kubectl >/dev/null 2>&1; then + kctl delete namespace "${NAMESPACE}" --wait=false \ + --ignore-not-found >/dev/null 2>&1 || true + fi + fi + + if [ "${CLUSTER_CREATED_BY_US}" = "1" ] && [ -n "${CLUSTER_NAME}" ]; then + if command -v k3d >/dev/null 2>&1 && k3d cluster list "${CLUSTER_NAME}" \ + >/dev/null 2>&1; then + echo "Deleting ephemeral k3d cluster ${CLUSTER_NAME}..." + k3d cluster delete "${CLUSTER_NAME}" >/dev/null 2>&1 || true + fi + fi + + rm -rf "${WORKDIR}" 2>/dev/null || true +} +trap cleanup EXIT + +require_cmd() { + if ! 
command -v "$1" >/dev/null 2>&1; then + echo "ERROR: $1 is required to run Helm-backed e2e tests" >&2 + exit 2 + fi +} + +require_cmd helm +require_cmd kubectl +require_cmd curl + +if [ -n "${OPENSHELL_E2E_KUBE_CONTEXT:-}" ]; then + KUBE_CONTEXT="${OPENSHELL_E2E_KUBE_CONTEXT}" + echo "Using existing kubectl context: ${KUBE_CONTEXT}" + if ! kctl cluster-info >/dev/null 2>&1; then + echo "ERROR: kubectl context '${KUBE_CONTEXT}' is not reachable." >&2 + exit 2 + fi +else + require_cmd k3d + CLUSTER_NAME="oshe2e-$$-$(date +%s | tail -c 8)" + echo "Creating ephemeral k3d cluster ${CLUSTER_NAME}..." + HELM_K3S_CLUSTER_NAME="${CLUSTER_NAME}" \ + HELM_K3S_KUBECONFIG="${WORKDIR}/kubeconfig" \ + bash "${ROOT}/tasks/scripts/helm-k3s-local.sh" create + CLUSTER_CREATED_BY_US=1 + export KUBECONFIG="${WORKDIR}/kubeconfig" + KUBE_CONTEXT="k3d-${CLUSTER_NAME}" +fi + +IMAGE_TAG_VALUE="${IMAGE_TAG:-latest}" +REGISTRY_VALUE="${OPENSHELL_REGISTRY:-ghcr.io/nvidia/openshell}" +REGISTRY_VALUE="${REGISTRY_VALUE%/}" + +# When this script created the cluster, import locally-available gateway and +# supervisor images so devs without a registry login can iterate. Best-effort: +# missing images fall through to the cluster's pull behavior at install time. +if [ "${CLUSTER_CREATED_BY_US}" = "1" ]; then + for image in \ + "${REGISTRY_VALUE}/gateway:${IMAGE_TAG_VALUE}" \ + "${REGISTRY_VALUE}/supervisor:${IMAGE_TAG_VALUE}"; do + if docker image inspect "${image}" >/dev/null 2>&1; then + echo "Importing ${image} into k3d cluster ${CLUSTER_NAME}..." + k3d image import "${image}" --cluster "${CLUSTER_NAME}" \ + --mode direct >/dev/null + fi + done +fi + +# The Kubernetes compute driver creates and watches Sandbox CRs reconciled +# by the upstream agent-sandbox-controller. Without the CRD + controller, +# every gateway K8s call 404s and CreateSandbox never produces a Pod. +echo "Installing agent-sandbox CRDs and controller..." 
+kctl apply -f "${ROOT}/deploy/kube/manifests/agent-sandbox.yaml" +kctl wait --for=condition=Established crd/sandboxes.agents.x-k8s.io --timeout=120s +kctl -n agent-sandbox-system rollout status statefulset/agent-sandbox-controller --timeout=300s + +echo "Installing Helm chart (release=${RELEASE_NAME}, namespace=${NAMESPACE}, tag=${IMAGE_TAG_VALUE})..." +helmctl install "${RELEASE_NAME}" "${ROOT}/deploy/helm/openshell" \ + --namespace "${NAMESPACE}" --create-namespace \ + --values "${ROOT}/deploy/helm/openshell/ci/values-tls-disabled.yaml" \ + --set "fullnameOverride=openshell" \ + --set "image.repository=${REGISTRY_VALUE}/gateway" \ + --set "image.tag=${IMAGE_TAG_VALUE}" \ + --set "supervisor.image.repository=${REGISTRY_VALUE}/supervisor" \ + --set "supervisor.image.tag=${IMAGE_TAG_VALUE}" \ + --wait --timeout 5m +HELM_INSTALLED=1 + +LOCAL_PORT="$(e2e_pick_port)" +echo "Starting kubectl port-forward svc/openshell ${LOCAL_PORT}:8080..." +kctl -n "${NAMESPACE}" port-forward "svc/openshell" \ + "${LOCAL_PORT}:8080" >"${PORTFORWARD_LOG}" 2>&1 & +PORTFORWARD_PID=$! + +elapsed=0 +timeout=30 +while [ "${elapsed}" -lt "${timeout}" ]; do + if ! 
kill -0 "${PORTFORWARD_PID}" 2>/dev/null; then + echo "ERROR: kubectl port-forward exited before becoming reachable" >&2 + cat "${PORTFORWARD_LOG}" >&2 || true + exit 1 + fi + if curl -s -o /dev/null --connect-timeout 1 "http://127.0.0.1:${LOCAL_PORT}"; then + break + fi + sleep 1 + elapsed=$((elapsed + 1)) +done +if [ "${elapsed}" -ge "${timeout}" ]; then + echo "ERROR: port-forward did not accept TCP within ${timeout}s" >&2 + cat "${PORTFORWARD_LOG}" >&2 || true + exit 1 +fi + +GATEWAY_NAME="openshell-e2e-kube-${LOCAL_PORT}" +GATEWAY_ENDPOINT="http://127.0.0.1:${LOCAL_PORT}" +e2e_register_plaintext_gateway \ + "${XDG_CONFIG_HOME}" \ + "${GATEWAY_NAME}" \ + "${GATEWAY_ENDPOINT}" \ + "${LOCAL_PORT}" + +export OPENSHELL_GATEWAY="${GATEWAY_NAME}" +export OPENSHELL_E2E_DRIVER="kubernetes" +export OPENSHELL_E2E_SANDBOX_NAMESPACE="${NAMESPACE}" +export OPENSHELL_PROVISION_TIMEOUT="${OPENSHELL_PROVISION_TIMEOUT:-300}" + +echo "Running e2e command against ${GATEWAY_ENDPOINT}: $*" +"$@" diff --git a/tasks/test.toml b/tasks/test.toml index bf5741c72..c9e1dc817 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -50,6 +50,10 @@ run = "e2e/with-docker-gateway.sh uv run pytest -o python_files='test_*.py' -m g description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" +["e2e:helm"] +description = "Run smoke e2e against a Helm-deployed gateway (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster)" +run = "e2e/rust/e2e-helm.sh" + ["e2e:vm"] description = "Start openshell-gateway with the VM compute driver and run the cluster-agnostic smoke e2e" run = "e2e/rust/e2e-vm.sh" From 58c3a5a0177e5e17f9f0b6692eddf1e39cc0e4ec Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Thu, 7 May 2026 16:50:02 -0700 Subject: [PATCH 2/4] ci(helm): add Branch Helm E2E workflow gated on test:e2e-helm Adds a label-gated GitHub Actions workflow that exercises the Helm chart end-to-end against the Rust e2e suite via `mise run 
e2e:helm`. Pipeline: - pr_metadata gates on the `test:e2e-helm` label via the pr-gate action. - build-gateway / build-supervisor build and push Docker images using the reusable docker-build.yml workflow. - helm-e2e (bare runner): apt-installs z3 build deps so cargo can compile the openshell-policy crate's z3-sys backend, creates a kind cluster via helm/kind-action, materializes the kind kubeconfig at the path mise's [env] block expects, side-loads the freshly built gateway/supervisor images, applies deploy/kube/manifests/agent-sandbox.yaml so the sandboxes.agents.x-k8s.io CRD and reconciling StatefulSet are in place, and finally runs `mise run e2e:helm`. Also expands the `e2e:helm` task to run the full Rust e2e suite (matching `e2e:podman`) instead of only the smoke test, with OPENSHELL_E2E_KUBE_TEST as an opt-in single-test override for local debugging. Extends the e2e-label-help workflow so applying `test:e2e-helm` posts the next-step hint pointing at this workflow. Signed-off-by: Taylor Mutch --- .github/workflows/branch-helm-e2e.yml | 126 ++++++++++++++++++++++++++ .github/workflows/e2e-label-help.yml | 3 +- e2e/rust/e2e-helm.sh | 13 ++- tasks/test.toml | 2 +- 4 files changed, 139 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/branch-helm-e2e.yml diff --git a/.github/workflows/branch-helm-e2e.yml b/.github/workflows/branch-helm-e2e.yml new file mode 100644 index 000000000..926874a08 --- /dev/null +++ b/.github/workflows/branch-helm-e2e.yml @@ -0,0 +1,126 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +name: Branch Helm E2E + +on: + push: + branches: + - "pull-request/[0-9]+" + workflow_dispatch: {} + +permissions: {} + +jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + with: + required_label: test:e2e-helm + + build-gateway: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: gateway + platform: linux/amd64 + + build-supervisor: + needs: [pr_metadata] + if: needs.pr_metadata.outputs.should_run == 'true' + permissions: + contents: read + packages: write + uses: ./.github/workflows/docker-build.yml + with: + component: supervisor + platform: linux/amd64 + + helm-e2e: + name: Helm E2E (Rust smoke) + needs: [pr_metadata, build-gateway, build-supervisor] + if: needs.pr_metadata.outputs.should_run == 'true' + # Bare runner: running kind-in-container hits nested-Docker / kubeconfig + # complications. The runner has Docker; mise installs helm, kubectl, and + # the Rust toolchain. + runs-on: linux-amd64-cpu8 + timeout-minutes: 60 + permissions: + contents: read + packages: read + env: + MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + KIND_CLUSTER_NAME: helm-e2e-${{ github.run_id }} + steps: + - uses: actions/checkout@v6 + + - name: Install mise + run: | + curl https://mise.run | sh + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + echo "$HOME/.local/share/mise/shims" >> "$GITHUB_PATH" + + - name: Install tools + run: mise install --locked + + # The openshell-policy crate transitively pulls in z3-sys, whose + # build script needs the z3 C/C++ headers and clang/bindgen to + # compile. 
The bare runner doesn't ship them; the CI container + # image used by other Rust e2e jobs does, but we can't run helm-e2e + # there (the runner's container handler injects its own --network + # bridge, which conflicts with the --network host we need so kind's + # API server is reachable from the test process). + - name: Install z3 build deps + run: sudo apt-get update && sudo apt-get install -y --no-install-recommends libz3-dev clang + + - name: Log in to GHCR + run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u "${{ github.actor }}" --password-stdin + + - name: Create kind cluster + uses: helm/kind-action@v1 + with: + cluster_name: ${{ env.KIND_CLUSTER_NAME }} + wait: 120s + + # mise.toml sets KUBECONFIG="{{config_root}}/kubeconfig"; helm/kind-action + # writes to ~/.kube/config. Materialize the kind context at the mise path + # so `mise run e2e:helm` (and the wrapper's `kubectl --context=…`) finds + # the kind cluster. + - name: Export kind kubeconfig to mise path + run: | + set -euo pipefail + kind get kubeconfig --name "$KIND_CLUSTER_NAME" > "$GITHUB_WORKSPACE/kubeconfig" + chmod 600 "$GITHUB_WORKSPACE/kubeconfig" + + # Pre-pull and side-load: kind nodes don't have ghcr credentials, and + # tagging IMAGE_TAG to a SHA means the chart's IfNotPresent pull policy + # is satisfied once the image is loaded into the node's containerd. 
+ - name: Load gateway and supervisor images into kind + run: | + set -euo pipefail + for component in gateway supervisor; do + image="ghcr.io/nvidia/openshell/${component}:${{ github.sha }}" + docker pull "$image" + kind load docker-image "$image" --name "$KIND_CLUSTER_NAME" + done + + - name: Run Helm E2E (Rust smoke) + env: + OPENSHELL_E2E_KUBE_CONTEXT: kind-${{ env.KIND_CLUSTER_NAME }} + IMAGE_TAG: ${{ github.sha }} + OPENSHELL_REGISTRY: ghcr.io/nvidia/openshell + run: mise run --no-deps --skip-deps e2e:helm diff --git a/.github/workflows/e2e-label-help.yml b/.github/workflows/e2e-label-help.yml index 2a61660d2..a5463f986 100644 --- a/.github/workflows/e2e-label-help.yml +++ b/.github/workflows/e2e-label-help.yml @@ -19,7 +19,7 @@ permissions: {} jobs: hint: name: Post next-step hint for E2E label - if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' + if: github.event.label.name == 'test:e2e' || github.event.label.name == 'test:e2e-gpu' || github.event.label.name == 'test:e2e-helm' runs-on: ubuntu-latest permissions: pull-requests: write @@ -40,6 +40,7 @@ jobs: case "$LABEL_NAME" in test:e2e) workflow_file=branch-e2e.yml; workflow_name="Branch E2E Checks" ;; test:e2e-gpu) workflow_file=test-gpu.yml; workflow_name="GPU Test" ;; + test:e2e-helm) workflow_file=branch-helm-e2e.yml; workflow_name="Branch Helm E2E" ;; *) echo "Unrecognized label $LABEL_NAME"; exit 1 ;; esac diff --git a/e2e/rust/e2e-helm.sh b/e2e/rust/e2e-helm.sh index 7d7042c47..6b161f344 100755 --- a/e2e/rust/e2e-helm.sh +++ b/e2e/rust/e2e-helm.sh @@ -2,19 +2,26 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Run a Rust e2e test against a Helm-deployed OpenShell gateway. Set +# Run the Rust e2e suite against a Helm-deployed OpenShell gateway. 
Set # OPENSHELL_E2E_KUBE_CONTEXT to target an existing cluster; otherwise an # ephemeral k3d cluster is created and torn down by with-kube-gateway.sh. +# Set OPENSHELL_E2E_KUBE_TEST to scope to a single integration test +# (e.g. smoke) for local debugging. set -euo pipefail ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -E2E_TEST="${OPENSHELL_E2E_KUBE_TEST:-smoke}" cargo build -p openshell-cli --features openshell-core/dev-settings +test_filter=() +if [ -n "${OPENSHELL_E2E_KUBE_TEST:-}" ]; then + test_filter+=(--test "${OPENSHELL_E2E_KUBE_TEST}") +fi + exec "${ROOT}/e2e/with-kube-gateway.sh" \ cargo test --manifest-path "${ROOT}/e2e/rust/Cargo.toml" \ --features e2e \ - --test "${E2E_TEST}" \ + --no-fail-fast \ + ${test_filter[@]+"${test_filter[@]}"} \ -- --nocapture diff --git a/tasks/test.toml b/tasks/test.toml index c9e1dc817..00a6823b2 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -51,7 +51,7 @@ description = "Run Rust CLI e2e tests against a Podman-backed gateway" run = "e2e/rust/e2e-podman.sh" ["e2e:helm"] -description = "Run smoke e2e against a Helm-deployed gateway (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster)" +description = "Run Rust CLI e2e tests against a Helm-deployed gateway (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster, otherwise creates a local k3d cluster; set OPENSHELL_E2E_KUBE_TEST= to scope to one test)" run = "e2e/rust/e2e-helm.sh" ["e2e:vm"] From dfa99476faf4aa8f268e03cf537cc2f79d6536df Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 8 May 2026 10:19:04 -0700 Subject: [PATCH 3/4] feat(observability): add gateway OTLP traces and Helm monitoring surface Adds opt-in OpenTelemetry trace export and a Prometheus ServiceMonitor to the gateway Helm chart. The exporter and chart toggles are independent from the existing /metrics surface and the OCSF sandbox log fan-out. 
- Gateway: append a tracing-opentelemetry layer to TracingLogBus when an OTLP/gRPC endpoint is configured; flush spans on shutdown. CLI gains --otlp-endpoint; standard OTEL_* env vars drive sampling and resource attributes. - Helm: monitoring.serviceMonitor.* renders a Prometheus-Operator ServiceMonitor; monitoring.tracing.* projects OTEL_* env vars onto the gateway container. Both default off. - Tooling: observability:k8s:{setup,teardown,port-forward} mise tasks install kube-prometheus-stack + Jaeger all-in-one for local dev. - Docs: new docs/kubernetes/monitoring.mdx; cross-links from observability overview and architecture/gateway.md; helm-dev-environment and debug-openshell-cluster skills updated. --- .../skills/debug-openshell-cluster/SKILL.md | 11 ++ .agents/skills/helm-dev-environment/SKILL.md | 33 ++++ Cargo.lock | 85 ++++++++++ Cargo.toml | 6 + architecture/gateway.md | 17 ++ crates/openshell-server/Cargo.toml | 6 + crates/openshell-server/src/cli.rs | 14 +- crates/openshell-server/src/lib.rs | 3 + crates/openshell-server/src/tracing_bus.rs | 160 +++++++++++++++++- deploy/helm/openshell/README.md | 10 ++ .../helm/openshell/ci/values-monitoring.yaml | 31 ++++ deploy/helm/openshell/skaffold.yaml | 28 +++ deploy/helm/openshell/templates/_helpers.tpl | 11 ++ .../openshell/templates/servicemonitor.yaml | 27 +++ .../helm/openshell/templates/statefulset.yaml | 17 ++ deploy/helm/openshell/values.yaml | 36 ++++ docs/kubernetes/monitoring.mdx | 135 +++++++++++++++ docs/observability/overview.mdx | 4 + tasks/observability.toml | 19 +++ tasks/scripts/observability-k8s-setup.sh | 96 +++++++++++ tasks/scripts/observability-k8s-teardown.sh | 27 +++ tasks/scripts/observability-port-forward.sh | 58 +++++++ 22 files changed, 829 insertions(+), 5 deletions(-) create mode 100644 deploy/helm/openshell/ci/values-monitoring.yaml create mode 100644 deploy/helm/openshell/templates/servicemonitor.yaml create mode 100644 docs/kubernetes/monitoring.mdx create mode 100644 
tasks/observability.toml create mode 100755 tasks/scripts/observability-k8s-setup.sh create mode 100755 tasks/scripts/observability-k8s-teardown.sh create mode 100755 tasks/scripts/observability-port-forward.sh diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 16158c0dc..34d407e69 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -189,6 +189,17 @@ openshell status openshell logs ``` +## Telemetry Signals + +Before drilling into logs, check whether the gateway is exporting telemetry — the pull-based metrics surface and the push-based trace export are the fastest signals that the control plane is alive and that requests are reaching it. + +| Signal | Where it shows up | When to use it | +|---|---|---| +| Prometheus metrics on `/metrics` | A scrape target via the chart's `ServiceMonitor` (`monitoring.serviceMonitor.enabled`). Local: `kubectl -n openshell port-forward statefulset/openshell :`. | Confirm the gateway listener is up and gRPC requests are landing. `up{job="openshell"} == 1` in Prometheus is a quick liveness ping. | +| OTLP traces | Jaeger / Tempo / OTel backend (`monitoring.tracing.enabled`). Look for service `openshell-gateway`. | Confirm an inbound request reached the multiplex layer; spans carry `method`, `path`, `request_id`. Missing traces under load means OTLP export is misconfigured or the endpoint is unreachable. | + +If the chart's `monitoring.serviceMonitor.enabled` or `monitoring.tracing.enabled` were not set, those signals are unavailable — fall back to gateway logs. See [Monitoring the Gateway](../../../docs/kubernetes/monitoring.mdx) for setup. 
+ ## Common Failure Patterns | Symptom | Likely cause | Check | diff --git a/.agents/skills/helm-dev-environment/SKILL.md b/.agents/skills/helm-dev-environment/SKILL.md index 623efb2e6..1899b459f 100644 --- a/.agents/skills/helm-dev-environment/SKILL.md +++ b/.agents/skills/helm-dev-environment/SKILL.md @@ -169,6 +169,39 @@ To remove Keycloak: mise run keycloak:k8s:teardown ``` +### Monitoring (Prometheus + Grafana + Jaeger) + +One-time setup — installs `kube-prometheus-stack` (slimmed: no Alertmanager, +node-exporter, or kube-state-metrics) and a Jaeger all-in-one Pod: + +```bash +mise run observability:k8s:setup +``` + +Then activate monitoring on the gateway: + +1. Uncomment `#- ci/values-monitoring.yaml` in `skaffold.yaml` +2. Redeploy: `mise run helm:skaffold:run` + +Forward UIs to localhost: + +```bash +mise run observability:port-forward +# Grafana http://localhost:3000 (admin / admin) +# Prometheus http://localhost:9090 +# Jaeger UI http://localhost:16686 +``` + +Teardown: + +```bash +mise run observability:k8s:teardown +``` + +The chart's `monitoring.serviceMonitor.enabled` creates a `ServiceMonitor` +that Prometheus scrapes, and `monitoring.tracing.enabled` projects `OTEL_*` +env vars onto the gateway so it exports OTLP/gRPC traces to Jaeger. 
+ --- ## Cluster Lifecycle (suspend/resume) diff --git a/Cargo.lock b/Cargo.lock index 808956cd9..84d3a48b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3645,6 +3645,9 @@ dependencies = [ "openshell-policy", "openshell-providers", "openshell-router", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry_sdk", "petname", "pin-project-lite", "prost", @@ -3669,6 +3672,7 @@ dependencies = [ "tower 0.5.3", "tower-http 0.6.8", "tracing", + "tracing-opentelemetry", "tracing-subscriber", "uuid", "wiremock", @@ -3726,6 +3730,69 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "opentelemetry" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e87237e2775f74896f9ad219d26a2081751187eb7c9f5c58dde20a23b95d16c" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d899720fe06916ccba71c01d04ecd77312734e2de3467fd30d9d580c8ce85656" +dependencies = [ + "futures-core", + "http", + "opentelemetry", + "opentelemetry-proto", + "opentelemetry_sdk", + "prost", + "thiserror 2.0.18", + "tokio", + "tonic", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c40da242381435e18570d5b9d50aca2a4f4f4d8e146231adb4e7768023309b3" +dependencies = [ + "opentelemetry", + "opentelemetry_sdk", + "prost", + "tonic", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afdefb21d1d47394abc1ba6c57363ab141be19e27cc70d0e422b7f303e4d290b" +dependencies = [ + "futures-channel", + "futures-executor", + "futures-util", + "glob", + 
"opentelemetry", + "percent-encoding", + "rand 0.9.4", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tokio-stream", + "tracing", +] + [[package]] name = "ordered-float" version = "2.10.1" @@ -6254,6 +6321,24 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.30.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd8e764bd6f5813fd8bebc3117875190c5b0415be8f7f8059bffb6ecd979c444" +dependencies = [ + "js-sys", + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "smallvec", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "web-time", +] + [[package]] name = "tracing-serde" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 9bc3f9ea2..3f29a33d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -58,6 +58,12 @@ tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tracing-appender = "0.2" +# OpenTelemetry — pinned to a tonic-0.12 / prost-0.13 compatible release set. +opentelemetry = "0.29" +opentelemetry_sdk = { version = "0.29", features = ["rt-tokio"] } +opentelemetry-otlp = { version = "0.29", default-features = false, features = ["grpc-tonic", "trace"] } +tracing-opentelemetry = "0.30" + # Metrics metrics = "0.24" metrics-exporter-prometheus = { version = "0.18", default-features = false, features = ["http-listener"] } diff --git a/architecture/gateway.md b/architecture/gateway.md index d89706e64..bee7aab97 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -54,6 +54,23 @@ Domain objects use shared metadata: stable server-generated IDs, human-readable names, creation timestamps, and labels. Crate-level details live in `crates/openshell-core/README.md`. 
+### Observability surface + +The gateway exposes three independent telemetry surfaces, each with its own +configuration knob and consumer: + +| Surface | Direction | Configured by | Consumers | +|---|---|---|---| +| Prometheus metrics on `/metrics` | Pull | `--metrics-port` (CLI), `monitoring.serviceMonitor.*` (Helm) | Prometheus / kube-prometheus-stack via `ServiceMonitor`. | +| OpenTelemetry traces over OTLP/gRPC | Push | `--otlp-endpoint` / `OTEL_EXPORTER_OTLP_*` env, `monitoring.tracing.*` (Helm) | Any OTLP backend (Jaeger, Tempo, OTel Collector). The per-request span set up by `TraceLayer` becomes the OTLP root. | +| Sandbox log fan-out | Push (gRPC stream) | Always on per sandbox subscription | CLI / TUI / SDK consumers via `WatchSandbox` and `GetSandboxLogs`; OCSF JSONL when enabled inside the sandbox. | + +Trace export is opt-in: the gateway only installs the OpenTelemetry layer +when an OTLP endpoint is supplied. Spans flush on `SIGTERM` via an explicit +`shutdown()` in the gateway shutdown path. See +[Monitoring the Gateway](../docs/kubernetes/monitoring.mdx) for the operator +guide. + ## Persistence The gateway persistence layer is a protobuf object store. 
Domain services store diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 9cba99045..2bbd21305 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -64,6 +64,12 @@ anyhow = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } +# OpenTelemetry tracing export (opt-in, configured via env) +opentelemetry = { workspace = true } +opentelemetry_sdk = { workspace = true } +opentelemetry-otlp = { workspace = true } +tracing-opentelemetry = { workspace = true } + # Metrics metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 534e3da37..577a46454 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -17,7 +17,10 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; -use crate::{run_server, tracing_bus::TracingLogBus}; +use crate::{ + run_server, + tracing_bus::{OtlpTracingConfig, TracingLogBus}, +}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. /// @@ -305,6 +308,13 @@ struct RunArgs { /// Keycloak: "scope". Okta: "scp". Leave empty to disable scope enforcement. #[arg(long, env = "OPENSHELL_OIDC_SCOPES_CLAIM", default_value = "")] oidc_scopes_claim: String, + + /// OTLP/gRPC endpoint for OpenTelemetry trace export (e.g. + /// `http://jaeger-collector.observability.svc:4317`). When unset, no + /// traces are exported. The signal-specific + /// `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` takes precedence over this flag. 
+    #[arg(long, env = "OPENSHELL_OTLP_ENDPOINT")]
+    otlp_endpoint: Option<String>,
 }
 
 pub fn command() -> Command {
@@ -328,8 +338,10 @@ pub async fn run_cli() -> Result<()> {
 
 async fn run_from_args(args: RunArgs) -> Result<()> {
     let tracing_log_bus = TracingLogBus::new();
+    let otlp = OtlpTracingConfig::resolve(args.otlp_endpoint.clone());
     tracing_log_bus.install_subscriber(
         EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)),
+        otlp,
     );
 
     let bind = SocketAddr::new(args.bind_address, args.port);
diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs
index eaca911e4..beb247a17 100644
--- a/crates/openshell-server/src/lib.rs
+++ b/crates/openshell-server/src/lib.rs
@@ -324,6 +324,9 @@ pub async fn run_server(
         .await
         .map_err(|err| Error::execution(format!("gateway shutdown cleanup failed: {err}")))?;
 
+    // Flush any pending OTLP spans. No-op when OTLP export is not configured.
+    state.tracing_log_bus.shutdown();
+
     Ok(())
 }
 
diff --git a/crates/openshell-server/src/tracing_bus.rs b/crates/openshell-server/src/tracing_bus.rs
index cf168e306..34403f194 100644
--- a/crates/openshell-server/src/tracing_bus.rs
+++ b/crates/openshell-server/src/tracing_bus.rs
@@ -4,17 +4,49 @@
 //! Capture openshell-server tracing logs for streaming over gRPC.
 use std::collections::{HashMap, VecDeque};
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, Mutex, OnceLock};
 use std::time::{SystemTime, UNIX_EPOCH};
 
 use openshell_core::proto::{SandboxLogLine, SandboxStreamEvent};
 use openshell_ocsf::OCSF_TARGET;
+use opentelemetry::KeyValue;
+use opentelemetry::trace::TracerProvider;
+use opentelemetry_otlp::{SpanExporter, WithExportConfig};
+use opentelemetry_sdk::Resource;
+use opentelemetry_sdk::trace::{Sampler, SdkTracerProvider};
 use tokio::sync::broadcast;
 use tracing::{Event, Subscriber};
 use tracing_subscriber::layer::Context;
 use tracing_subscriber::prelude::*;
 use tracing_subscriber::{EnvFilter, Layer};
 
+/// OTLP tracing exporter configuration. Endpoint is the only required field;
+/// service name, resource attributes, and sampling ratio are picked up from
+/// standard `OTEL_*` env vars by the OpenTelemetry SDK.
+#[derive(Debug, Clone)]
+pub struct OtlpTracingConfig {
+    pub endpoint: String,
+}
+
+impl OtlpTracingConfig {
+    /// Resolve OTLP endpoint from (in order): the signal-specific
+    /// `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`, the shared
+    /// `OTEL_EXPORTER_OTLP_ENDPOINT`, then the supplied CLI argument.
+    /// Returns `None` if no endpoint is configured.
+    pub fn resolve(arg_endpoint: Option<String>) -> Option<Self> {
+        let endpoint = std::env::var("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
+            .ok()
+            .or_else(|| std::env::var("OTEL_EXPORTER_OTLP_ENDPOINT").ok())
+            .or(arg_endpoint)
+            .map(|s| s.trim().to_string())
+            .filter(|s| !s.is_empty())?;
+        Some(Self { endpoint })
+    }
+}
+
+/// Process-wide tracer provider, retained so spans can be flushed on shutdown.
+static OTEL_TRACER_PROVIDER: OnceLock<SdkTracerProvider> = OnceLock::new();
+
 /// Bus that publishes server log lines keyed by sandbox id.
 #[derive(Debug, Clone)]
 pub struct TracingLogBus {
@@ -47,19 +79,48 @@ impl TracingLogBus {
     }
 
     /// Install a tracing subscriber that logs to stdout and publishes events into this bus.
-    pub fn install_subscriber(&self, env_filter: EnvFilter) {
-        let layer = SandboxLogLayer {
+    ///
+    /// When `otlp` is provided, an OpenTelemetry OTLP/gRPC trace exporter is attached
+    /// after the env filter so `OPENSHELL_LOG_LEVEL` continues to gate exported spans.
+    /// The `tower_http::trace::TraceLayer` per-request span set up in
+    /// `multiplex.rs` becomes the OTLP root span automatically.
+    pub fn install_subscriber(&self, env_filter: EnvFilter, otlp: Option<OtlpTracingConfig>) {
+        let bus_layer = SandboxLogLayer {
             bus: self.clone(),
             default_tail: Self::DEFAULT_TAIL,
         };
+        let otel_layer = match otlp {
+            Some(cfg) => match build_otel_layer(&cfg) {
+                Ok(layer) => Some(layer),
+                Err(err) => {
+                    eprintln!(
+                        "openshell-gateway: failed to enable OTLP trace export to {}: {err}",
+                        cfg.endpoint
+                    );
+                    None
+                }
+            },
+            None => None,
+        };
+
         tracing_subscriber::registry()
             .with(env_filter)
             .with(tracing_subscriber::fmt::layer())
-            .with(layer)
+            .with(bus_layer)
+            .with(otel_layer)
             .init();
     }
 
+    /// Flush and shut down the OTLP tracer provider, if installed. Idempotent.
+    pub fn shutdown(&self) {
+        if let Some(provider) = OTEL_TRACER_PROVIDER.get()
+            && let Err(err) = provider.shutdown()
+        {
+            tracing::warn!(error = %err, "OpenTelemetry tracer provider shutdown failed");
+        }
+    }
+
     fn sender_for(&self, sandbox_id: &str) -> broadcast::Sender<SandboxStreamEvent> {
         let mut inner = self.inner.lock().expect("tracing bus lock poisoned");
         inner
@@ -198,6 +259,70 @@ fn current_time_ms() -> Option<i64> {
     i64::try_from(now.as_millis()).ok()
 }
 
+/// Build an `OpenTelemetry` `tracing` layer that exports spans to the
+/// configured OTLP/gRPC endpoint. The resulting layer can be `with(...)`'d
+/// onto the subscriber registry.
+fn build_otel_layer<S>(
+    cfg: &OtlpTracingConfig,
+) -> Result<
+    tracing_opentelemetry::OpenTelemetryLayer<S, opentelemetry_sdk::trace::Tracer>,
+    Box<dyn std::error::Error + Send + Sync>,
+>
+where
+    S: Subscriber + for<'span> tracing_subscriber::registry::LookupSpan<'span>,
+{
+    let exporter = SpanExporter::builder()
+        .with_tonic()
+        .with_endpoint(&cfg.endpoint)
+        .build()?;
+
+    let resource = Resource::builder()
+        .with_service_name("openshell-gateway")
+        .with_attributes([KeyValue::new("service.version", openshell_core::VERSION)])
+        .build();
+
+    let sampler = sampler_from_env();
+
+    let provider = SdkTracerProvider::builder()
+        .with_batch_exporter(exporter)
+        .with_resource(resource)
+        .with_sampler(sampler)
+        .build();
+
+    let tracer = provider.tracer("openshell-gateway");
+
+    // Retain the provider so shutdown() can flush spans on SIGTERM.
+    let _ = OTEL_TRACER_PROVIDER.set(provider);
+
+    Ok(tracing_opentelemetry::layer().with_tracer(tracer))
+}
+
+/// Resolve a sampler from `OTEL_TRACES_SAMPLER` / `OTEL_TRACES_SAMPLER_ARG`,
+/// defaulting to `parent_based(traceidratio=1.0)` — record all spans, respect
+/// upstream parent sampling decisions.
+fn sampler_from_env() -> Sampler {
+    let ratio = std::env::var("OTEL_TRACES_SAMPLER_ARG")
+        .ok()
+        .and_then(|s| s.parse::<f64>().ok())
+        .map_or(1.0, |r| r.clamp(0.0, 1.0));
+
+    match std::env::var("OTEL_TRACES_SAMPLER")
+        .ok()
+        .as_deref()
+        .map(str::trim)
+    {
+        Some("always_on") => Sampler::AlwaysOn,
+        Some("always_off") => Sampler::AlwaysOff,
+        Some("traceidratio") => Sampler::TraceIdRatioBased(ratio),
+        Some("parentbased_always_off") => Sampler::ParentBased(Box::new(Sampler::AlwaysOff)),
+        Some("parentbased_traceidratio") => {
+            Sampler::ParentBased(Box::new(Sampler::TraceIdRatioBased(ratio)))
+        }
+        // "parentbased_always_on", unset, or unrecognized
+        _ => Sampler::ParentBased(Box::new(Sampler::AlwaysOn)),
+    }
+}
+
 fn display_level(target: &str, level: &str) -> String {
     if target == OCSF_TARGET {
         "OCSF".to_string()
@@ -387,6 +512,33 @@ mod tests {
         assert!(events.is_empty());
     }
 
+    #[test]
+    fn otlp_config_resolve_prefers_traces_endpoint_then_shared_then_arg() {
+        // Each branch is exercised in isolation to avoid env-var coupling
+        // between cases. We only assert that the non-empty value wins; the
+        // env-var precedence test would need a process-wide lock to be safe.
+        let cfg = OtlpTracingConfig::resolve(Some("http://arg:4317".into()));
+        assert!(cfg.is_some());
+        assert_eq!(cfg.unwrap().endpoint, "http://arg:4317");
+
+        let cfg = OtlpTracingConfig::resolve(Some(" ".into()));
+        assert!(cfg.is_none());
+
+        let cfg = OtlpTracingConfig::resolve(None);
+        // May be Some or None depending on inherited env; only assert that
+        // when Some, the endpoint is non-empty.
+        if let Some(c) = cfg {
+            assert!(!c.endpoint.is_empty());
+        }
+    }
+
+    #[test]
+    fn sampler_from_env_returns_a_sampler() {
+        // The function shape is documented in the function body; this test
+        // exercises construction without coupling to inherited env state.
+ let _ = sampler_from_env(); + } + #[test] fn platform_event_bus_remove_clears_tail() { let bus = PlatformEventBus::new(); diff --git a/deploy/helm/openshell/README.md b/deploy/helm/openshell/README.md index cc856731d..6e3687c67 100644 --- a/deploy/helm/openshell/README.md +++ b/deploy/helm/openshell/README.md @@ -52,6 +52,7 @@ See [`values.yaml`](values.yaml) for configurable values. Selected overlays: - [`ci/values-gateway.yaml`](ci/values-gateway.yaml) — gateway-only configuration - [`ci/values-cert-manager.yaml`](ci/values-cert-manager.yaml) — cert-manager integration - [`ci/values-keycloak.yaml`](ci/values-keycloak.yaml) — Keycloak OIDC integration +- [`ci/values-monitoring.yaml`](ci/values-monitoring.yaml) — Prometheus `ServiceMonitor` + OTLP traces (local-dev defaults) ## PKI bootstrap @@ -70,3 +71,12 @@ The Job is idempotent: Disable with `--set pkiInitJob.enabled=false` when bringing your own PKI (cert-manager, external CA, or pre-created Secrets). See `certManager.*` in `values.yaml` for the cert-manager alternative. + +## Monitoring + +The chart can opt into two independent observability surfaces: + +- `monitoring.serviceMonitor.enabled` — creates a Prometheus-Operator `ServiceMonitor` scraping the gateway's `/metrics` endpoint. Requires the `monitoring.coreos.com/v1` CRD (ships with `kube-prometheus-stack`). +- `monitoring.tracing.enabled` — projects standard `OTEL_*` env vars onto the gateway container so it exports OTLP/gRPC traces to the configured `monitoring.tracing.endpoint`. + +Both are off by default. See [Monitoring the Gateway](../../../docs/kubernetes/monitoring.mdx) for the operator guide and `mise run observability:k8s:setup` for the local-dev `kube-prometheus-stack` + Jaeger bundle. 
diff --git a/deploy/helm/openshell/ci/values-monitoring.yaml b/deploy/helm/openshell/ci/values-monitoring.yaml new file mode 100644 index 000000000..683808f7b --- /dev/null +++ b/deploy/helm/openshell/ci/values-monitoring.yaml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Local-dev overlay enabling the ServiceMonitor (scraped by kube-prometheus-stack) +# and OTLP trace export to a Jaeger all-in-one Service. +# +# Prerequisite: install the cluster monitoring add-ons one time: +# mise run observability:k8s:setup +# +# Then uncomment values-monitoring.yaml in skaffold.yaml or pass it explicitly: +# helm upgrade --install openshell . \ +# -f values.yaml -f ci/values-skaffold.yaml -f ci/values-monitoring.yaml + +monitoring: + serviceMonitor: + enabled: true + interval: 15s + scrapeTimeout: 10s + # kube-prometheus-stack's default Prometheus instance selects ServiceMonitors + # that carry `release: kube-prometheus-stack`. The setup task installs the + # bundle under that release name. + labels: + release: kube-prometheus-stack + tracing: + enabled: true + # Jaeger all-in-one OTLP/gRPC receiver, installed by the setup task. + endpoint: "http://jaeger-collector.observability.svc.cluster.local:4317" + protocol: grpc + samplingRatio: "1.0" + resourceAttributes: + deployment.environment: dev diff --git a/deploy/helm/openshell/skaffold.yaml b/deploy/helm/openshell/skaffold.yaml index 2de9ee4e6..80366ecf4 100644 --- a/deploy/helm/openshell/skaffold.yaml +++ b/deploy/helm/openshell/skaffold.yaml @@ -81,6 +81,30 @@ deploy: # # wait ensures Gateway API CRDs are registered before the openshell # # release attempts to create Gateway and HTTPRoute resources. # wait: true + # Monitoring add-ons — comment in along with ci/values-monitoring.yaml + # below to scrape the gateway from Prometheus and export traces to Jaeger. 
+ # Prefer running the dedicated mise task (`mise run observability:k8s:setup`) + # for the initial install; these blocks are kept for parity. + #- name: kube-prometheus-stack + # repo: https://prometheus-community.github.io/helm-charts + # remoteChart: kube-prometheus-stack + # version: 75.0.0 + # namespace: monitoring + # createNamespace: true + # wait: true + #- name: jaeger + # repo: https://jaegertracing.github.io/helm-charts + # remoteChart: jaeger + # version: 3.4.0 + # namespace: observability + # createNamespace: true + # setValues: + # allInOne.enabled: true + # storage.type: memory + # provisionDataStore.cassandra: false + # agent.enabled: false + # collector.enabled: false + # query.enabled: false - name: openshell chartPath: . namespace: openshell @@ -97,6 +121,10 @@ deploy: #- ci/values-keycloak.yaml # To enable the Gateway API HTTPRoute (requires Envoy Gateway above): #- ci/values-gateway.yaml + # To enable Prometheus scraping + OTLP traces → Jaeger: + # mise run observability:k8s:setup + # then uncomment the line below. + #- ci/values-monitoring.yaml setValueTemplates: image.repository: '{{.IMAGE_REPO_openshell_gateway}}' image.tag: '{{.IMAGE_TAG_openshell_gateway}}' diff --git a/deploy/helm/openshell/templates/_helpers.tpl b/deploy/helm/openshell/templates/_helpers.tpl index 93eff90a9..4a4e69a2d 100644 --- a/deploy/helm/openshell/templates/_helpers.tpl +++ b/deploy/helm/openshell/templates/_helpers.tpl @@ -97,3 +97,14 @@ override. {{- printf "%s://%s.%s.svc.cluster.local:%d" $scheme (include "openshell.fullname" .) .Release.Namespace (int .Values.service.port) -}} {{- end -}} {{- end }} + +{{/* +Render the user-supplied monitoring.tracing.resourceAttributes map as a +comma-prefixed `key=value` list suitable for appending to OTEL_RESOURCE_ATTRIBUTES. +Returns an empty string when no attributes are configured. +*/}} +{{- define "openshell.tracingResourceAttributes" -}} +{{- with .Values.monitoring.tracing.resourceAttributes }} +{{- range $k, $v := . 
}},{{ $k }}={{ $v }}{{- end }} +{{- end }} +{{- end }} diff --git a/deploy/helm/openshell/templates/servicemonitor.yaml b/deploy/helm/openshell/templates/servicemonitor.yaml new file mode 100644 index 000000000..8eb356576 --- /dev/null +++ b/deploy/helm/openshell/templates/servicemonitor.yaml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +{{- if .Values.monitoring.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "openshell.fullname" . }} + namespace: {{ default .Release.Namespace .Values.monitoring.serviceMonitor.namespace }} + labels: + {{- include "openshell.labels" . | nindent 4 }} + {{- with .Values.monitoring.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + selector: + matchLabels: + {{- include "openshell.selectorLabels" . 
| nindent 6 }} + endpoints: + - port: metrics + path: /metrics + interval: {{ .Values.monitoring.serviceMonitor.interval }} + scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout }} +{{- end }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 2d3f731af..6e5c66a3b 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -140,6 +140,23 @@ spec: value: {{ .Values.server.oidc.scopesClaim | quote }} {{- end }} {{- end }} + {{- if and .Values.monitoring .Values.monitoring.tracing.enabled }} + - name: OTEL_EXPORTER_OTLP_TRACES_ENDPOINT + value: {{ required "monitoring.tracing.endpoint is required when monitoring.tracing.enabled is true" .Values.monitoring.tracing.endpoint | quote }} + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: {{ .Values.monitoring.tracing.protocol | quote }} + - name: OTEL_SERVICE_NAME + value: "openshell-gateway" + - name: OTEL_TRACES_SAMPLER + value: "parentbased_traceidratio" + - name: OTEL_TRACES_SAMPLER_ARG + value: {{ .Values.monitoring.tracing.samplingRatio | quote }} + - name: OTEL_RESOURCE_ATTRIBUTES + value: {{ printf "service.namespace=%s,service.version=%s%s" + .Release.Namespace + .Chart.AppVersion + (include "openshell.tracingResourceAttributes" .) | quote }} + {{- end }} volumeMounts: - name: openshell-data mountPath: /var/openshell diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 7630554f2..8e7efeb19 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -229,3 +229,39 @@ grpcRoute: protocol: HTTP # "Same" restricts attached routes to the release namespace; "All" allows any namespace. allowedRoutes: Same + +# Observability: Prometheus ServiceMonitor and OpenTelemetry trace export. +# Both subsections are independent and disabled by default. 
The chart never +# bundles a monitoring stack; operators run kube-prometheus-stack and Jaeger +# (or any OTLP backend) themselves and point the gateway at them here. +monitoring: + serviceMonitor: + # Create a Prometheus-Operator ServiceMonitor scraping the gateway's + # /metrics endpoint. Requires the monitoring.coreos.com/v1 CRDs. + enabled: false + interval: 30s + scrapeTimeout: 10s + # Extra labels added to the ServiceMonitor (commonly required to match + # the Prometheus instance's serviceMonitorSelector — kube-prometheus-stack + # defaults to selecting on `release: `). + labels: {} + # Namespace where the ServiceMonitor is created. Empty = release namespace. + namespace: "" + tracing: + # Project OTEL_* env vars onto the gateway container so the in-process + # OTLP exporter starts up. The gateway exports OTLP/gRPC; Helm currently + # only supports the gRPC protocol. + enabled: false + # OTLP/gRPC collector endpoint (host:port or full URL). Required when + # tracing.enabled is true. + # e.g. http://jaeger-collector.observability.svc:4317 + endpoint: "" + # OTLP transport. Currently only "grpc" is supported by the gateway. + protocol: grpc + # Trace sampling. parent_based(traceidratio=...) is the recommended + # production default — record according to the upstream parent's + # decision, fall back to the configured ratio for new traces. + samplingRatio: "1.0" + # Extra resource attributes appended to OTEL_RESOURCE_ATTRIBUTES, + # e.g. {deployment.environment: production}. + resourceAttributes: {} diff --git a/docs/kubernetes/monitoring.mdx b/docs/kubernetes/monitoring.mdx new file mode 100644 index 000000000..4bd0fd440 --- /dev/null +++ b/docs/kubernetes/monitoring.mdx @@ -0,0 +1,135 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +title: "Monitoring the Gateway" +sidebar-title: "Monitoring" +description: "Scrape the OpenShell gateway with Prometheus and export traces over OTLP from a Helm-managed deployment." +keywords: "Generative AI, Cybersecurity, Monitoring, Prometheus, OpenTelemetry, OTLP, Jaeger, Grafana, Helm, Kubernetes" +position: 6 +--- + +The OpenShell gateway exposes two telemetry surfaces operators usually plug directly into a cluster monitoring stack: + +- **Prometheus metrics** at `/metrics` on a dedicated port — scraped over HTTP. +- **OpenTelemetry traces** exported over OTLP/gRPC — pushed to a collector or backend. + +Both are off by default in the chart. This page enables them on a real cluster, and shows the local-dev path that bundles `kube-prometheus-stack` and Jaeger. + + +For sandbox-internal logs (OCSF), see [Observability → Sandbox Logging](/observability/logging). That surface is independent of the gateway control-plane telemetry described here. + + +## Prerequisites + +| Prerequisite | Required for | Notes | +|---|---|---| +| Prometheus Operator | Metrics | The chart creates a `monitoring.coreos.com/v1` `ServiceMonitor`. The CRD ships with `kube-prometheus-stack`. | +| OTLP-compatible backend | Traces | Jaeger, Tempo, an OpenTelemetry Collector, or any vendor backend that accepts OTLP/gRPC on `:4317`. | +| Helm chart values access | Both | Either `--set` flags on the install command, or a values file passed via `-f`. | + +## Enable on the Helm release + +Add a `monitoring` block to your values file: + +```yaml +monitoring: + serviceMonitor: + enabled: true + interval: 30s + # Match the Prometheus instance's serviceMonitorSelector. The + # kube-prometheus-stack default selector is `release: `. 
+ labels: + release: kube-prometheus-stack + tracing: + enabled: true + endpoint: "http://otel-collector.observability.svc.cluster.local:4317" + protocol: grpc + samplingRatio: "1.0" + resourceAttributes: + deployment.environment: production +``` + +Then upgrade the release: + +```shell +helm upgrade --install openshell oci://ghcr.io/nvidia/openshell/charts/openshell \ + -f values.yaml +``` + +When `monitoring.tracing.enabled` is `true`, the chart projects the standard `OTEL_*` env vars onto the gateway container. The gateway initializes the OTLP exporter at startup and flushes spans on `SIGTERM`. + + +The gateway currently exports OTLP over **gRPC only**. Setting `monitoring.tracing.protocol` to anything other than `grpc` is not supported and the value is ignored at the gateway. + + +## Verify it works + +### Metrics + +Confirm the `ServiceMonitor` was created and Prometheus picked it up: + +```shell +kubectl get servicemonitor -n +kubectl exec -n monitoring deploy/prometheus-server -- \ + promtool query instant http://localhost:9090 'up{job="openshell"}' +``` + +Then query a metric the gateway exports while you exercise the API: + +```shell +openshell sandbox create --name probe +``` + +```promql +rate(openshell_server_grpc_requests_total[1m]) +``` + +### Traces + +Create a sandbox via the CLI to drive at least one inbound request, then look in your trace UI for the `openshell-gateway` service. You should see a `request` span with `method`, `path`, and `request_id` attributes. + +```shell +openshell sandbox create --name trace-probe +``` + +In Jaeger UI: select **Service: openshell-gateway** and **Find Traces**. In Grafana Tempo / OTLP backends: query `service.name="openshell-gateway"`. + +## Local development + +The repo ships a one-shot mise task that installs `kube-prometheus-stack` (slimmed-down: no Alertmanager, node-exporter, or kube-state) and a Jaeger all-in-one Pod into the local k3s cluster. + +```shell +# 1. 
Bring up the cluster (skip if already running): +mise run helm:k3s:create + +# 2. Install Prometheus + Grafana + Jaeger: +mise run observability:k8s:setup + +# 3. Open ci/values-monitoring.yaml on the openshell release: +# Uncomment `- ci/values-monitoring.yaml` in deploy/helm/openshell/skaffold.yaml + +# 4. Deploy / restart the openshell release: +mise run helm:skaffold:dev + +# 5. Forward UIs to localhost: +mise run observability:port-forward +``` + +Then visit: + +- Grafana — http://localhost:3000 (admin / admin) +- Prometheus — http://localhost:9090 +- Jaeger UI — http://localhost:16686 + +Tear it all down with: + +```shell +mise run observability:k8s:teardown +``` + +## Production guidance + +- **OpenTelemetry Collector** is the recommended pattern in front of multi-backend trace pipelines. Point `monitoring.tracing.endpoint` at the Collector, then route from the Collector to Jaeger / Tempo / your vendor. +- **Sampling**: `samplingRatio: "1.0"` (record everything) is fine for low-traffic gateways. Drop to `"0.1"` or lower for high-throughput deployments. The chart sets `parentbased_traceidratio` so upstream parent decisions are honored when present. +- **Resource attributes**: Add `deployment.environment`, `cluster.name`, or other identifiers via `monitoring.tracing.resourceAttributes`. They appear on every span as part of the OTel Resource. +- **Prometheus selector**: The chart-rendered `ServiceMonitor` carries the standard chart labels. Most Prometheus instances also require an explicit selector label (often `release: `) — set it via `monitoring.serviceMonitor.labels`. diff --git a/docs/observability/overview.mdx b/docs/observability/overview.mdx index 69fde7a06..d548c5374 100644 --- a/docs/observability/overview.mdx +++ b/docs/observability/overview.mdx @@ -15,3 +15,7 @@ This section covers: - **[Sandbox Logging](/observability/logging)**: How the two log formats work, where logs are stored, and how to read them. 
- **[Accessing Logs](/observability/accessing-logs)**: How to view logs through the CLI, TUI, and directly on the sandbox filesystem. - **[OCSF JSON Export](/observability/ocsf-json-export)**: How to enable full OCSF JSON output for integration with SIEMs, log aggregators, and compliance tools. + +## Sandbox logs vs. gateway telemetry + +Sandbox-internal logs (the OCSF surface above) are distinct from the gateway control-plane telemetry — Prometheus metrics on `/metrics` and OpenTelemetry traces over OTLP. The two pipelines are independent and configured separately. For scraping the gateway from Prometheus or shipping traces to Jaeger / Tempo / an OTel Collector, see [Monitoring the Gateway](/kubernetes/monitoring). diff --git a/tasks/observability.toml b/tasks/observability.toml new file mode 100644 index 000000000..969bfaaaa --- /dev/null +++ b/tasks/observability.toml @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Cluster observability add-ons (Prometheus + Grafana + Jaeger). +# These tasks target the local k3s cluster created by `mise run helm:k3s:create`. +# Production users install kube-prometheus-stack and an OTLP backend separately +# and point the gateway at them via Helm values; see docs/kubernetes/monitoring.mdx. 
+ +["observability:k8s:setup"] +description = "Install kube-prometheus-stack + Jaeger all-in-one into the local k3s cluster (one-time)" +run = "tasks/scripts/observability-k8s-setup.sh" + +["observability:k8s:teardown"] +description = "Remove kube-prometheus-stack and Jaeger from the local k3s cluster" +run = "tasks/scripts/observability-k8s-teardown.sh" + +["observability:port-forward"] +description = "Port-forward Grafana (3000), Prometheus (9090), and Jaeger UI (16686) until interrupted" +run = "tasks/scripts/observability-port-forward.sh" diff --git a/tasks/scripts/observability-k8s-setup.sh b/tasks/scripts/observability-k8s-setup.sh new file mode 100755 index 000000000..3e31bc47b --- /dev/null +++ b/tasks/scripts/observability-k8s-setup.sh @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# One-time install of kube-prometheus-stack + Jaeger all-in-one into the local +# k3s cluster created by `mise run helm:k3s:create`. +# +# - kube-prometheus-stack provides Prometheus, Grafana, and the +# ServiceMonitor/PodMonitor CRDs the openshell chart uses. +# - Jaeger all-in-one provides an OTLP/gRPC receiver (:4317) and UI (:16686). +# +# Re-running is safe; both releases use `helm upgrade --install`. +# +# Usage: +# mise run observability:k8s:setup +# +# After setup, enable monitoring on the openshell release by uncommenting +# `ci/values-monitoring.yaml` in `deploy/helm/openshell/skaffold.yaml`, then +# rerun skaffold. 
+ +set -euo pipefail + +MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" +OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" +PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" +PROMSTACK_VERSION="${PROMSTACK_VERSION:-75.0.0}" +JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" +JAEGER_VERSION="${JAEGER_VERSION:-3.4.0}" +HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-180}" + +# --------------------------------------------------------------------------- +# Helm repos +# --------------------------------------------------------------------------- + +echo "Adding Helm repos..." +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true +helm repo add jaegertracing https://jaegertracing.github.io/helm-charts >/dev/null 2>&1 || true +helm repo update prometheus-community jaegertracing >/dev/null + +# --------------------------------------------------------------------------- +# kube-prometheus-stack +# --------------------------------------------------------------------------- +# +# Slimmed-down install: keep Prometheus + Grafana + Operator (the parts the +# openshell chart's ServiceMonitor needs), drop Alertmanager and the +# node/kube-state metrics exporters to keep k3d resource usage down. Real +# clusters get the full bundle via the published docs. + +echo "Installing ${PROMSTACK_RELEASE} into namespace ${MONITORING_NAMESPACE}..." 
+helm upgrade --install "${PROMSTACK_RELEASE}" prometheus-community/kube-prometheus-stack \ + --version "${PROMSTACK_VERSION}" \ + --namespace "${MONITORING_NAMESPACE}" \ + --create-namespace \ + --set alertmanager.enabled=false \ + --set nodeExporter.enabled=false \ + --set kubeStateMetrics.enabled=false \ + --set grafana.adminPassword=admin \ + --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --wait --timeout "${HEALTH_TIMEOUT}s" + +# --------------------------------------------------------------------------- +# Jaeger all-in-one +# --------------------------------------------------------------------------- + +echo "Installing ${JAEGER_RELEASE} into namespace ${OBSERVABILITY_NAMESPACE}..." +helm upgrade --install "${JAEGER_RELEASE}" jaegertracing/jaeger \ + --version "${JAEGER_VERSION}" \ + --namespace "${OBSERVABILITY_NAMESPACE}" \ + --create-namespace \ + --set allInOne.enabled=true \ + --set storage.type=memory \ + --set provisionDataStore.cassandra=false \ + --set agent.enabled=false \ + --set collector.enabled=false \ + --set query.enabled=false \ + --wait --timeout "${HEALTH_TIMEOUT}s" + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- + +echo "" +echo "Cluster monitoring stack is ready." +echo "" +echo " Grafana: http://localhost:3000 (admin / admin)" +echo " Prometheus: http://localhost:9090" +echo " Jaeger UI: http://localhost:16686" +echo "" +echo " Start port-forwards: mise run observability:port-forward" +echo "" +echo " Enable on the openshell release:" +echo " 1. Uncomment 'ci/values-monitoring.yaml' in deploy/helm/openshell/skaffold.yaml" +echo " 2. 
mise run helm:skaffold:dev" +echo "" +echo " Teardown: mise run observability:k8s:teardown" +echo "" diff --git a/tasks/scripts/observability-k8s-teardown.sh b/tasks/scripts/observability-k8s-teardown.sh new file mode 100755 index 000000000..f9b3ee501 --- /dev/null +++ b/tasks/scripts/observability-k8s-teardown.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Remove the local cluster monitoring add-ons installed by +# observability-k8s-setup.sh. +# +# Usage: +# mise run observability:k8s:teardown + +set -euo pipefail + +MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" +OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" +PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" +JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" + +echo "Uninstalling ${PROMSTACK_RELEASE} from ${MONITORING_NAMESPACE}..." +helm uninstall "${PROMSTACK_RELEASE}" --namespace "${MONITORING_NAMESPACE}" --ignore-not-found + +echo "Uninstalling ${JAEGER_RELEASE} from ${OBSERVABILITY_NAMESPACE}..." +helm uninstall "${JAEGER_RELEASE}" --namespace "${OBSERVABILITY_NAMESPACE}" --ignore-not-found + +echo "Deleting namespaces..." +kubectl delete namespace "${MONITORING_NAMESPACE}" "${OBSERVABILITY_NAMESPACE}" --ignore-not-found + +echo "Done." diff --git a/tasks/scripts/observability-port-forward.sh b/tasks/scripts/observability-port-forward.sh new file mode 100755 index 000000000..4a6ed761d --- /dev/null +++ b/tasks/scripts/observability-port-forward.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Background port-forwards for Grafana, Prometheus, and the Jaeger UI. +# Runs until interrupted; trap ensures the kubectl background processes are +# cleaned up on Ctrl+C / SIGTERM. 
+# +# Usage: +# mise run observability:port-forward + +set -euo pipefail + +MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" +OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" +PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" +JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" + +GRAFANA_LOCAL_PORT="${GRAFANA_LOCAL_PORT:-3000}" +PROMETHEUS_LOCAL_PORT="${PROMETHEUS_LOCAL_PORT:-9090}" +JAEGER_UI_LOCAL_PORT="${JAEGER_UI_LOCAL_PORT:-16686}" + +PIDS=() + +cleanup() { + if [[ ${#PIDS[@]} -gt 0 ]]; then + echo "" + echo "Stopping port-forwards..." + kill "${PIDS[@]}" 2>/dev/null || true + wait "${PIDS[@]}" 2>/dev/null || true + fi +} +trap cleanup EXIT INT TERM + +forward() { + local namespace="$1" + local target="$2" + local local_port="$3" + local remote_port="$4" + kubectl --namespace "${namespace}" port-forward "${target}" \ + "${local_port}:${remote_port}" >/dev/null 2>&1 & + PIDS+=("$!") +} + +echo "Starting port-forwards..." +forward "${MONITORING_NAMESPACE}" "svc/${PROMSTACK_RELEASE}-grafana" "${GRAFANA_LOCAL_PORT}" 80 +forward "${MONITORING_NAMESPACE}" "svc/${PROMSTACK_RELEASE}-prometheus" "${PROMETHEUS_LOCAL_PORT}" 9090 +forward "${OBSERVABILITY_NAMESPACE}" "svc/${JAEGER_RELEASE}-query" "${JAEGER_UI_LOCAL_PORT}" 16686 + +echo "" +echo " Grafana: http://localhost:${GRAFANA_LOCAL_PORT} (admin / admin)" +echo " Prometheus: http://localhost:${PROMETHEUS_LOCAL_PORT}" +echo " Jaeger UI: http://localhost:${JAEGER_UI_LOCAL_PORT}" +echo "" +echo "Press Ctrl+C to stop." + +# Block until any forwarder exits or signal is received. 
+wait -n From c6463bfee547a1eb548b2ac29591e0323a608f57 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 8 May 2026 11:00:57 -0700 Subject: [PATCH 4/4] refactor(observability): extract local-dev Helm values into separate files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kube-prometheus-stack and Jaeger releases were configured via long chains of `--set` flags, which obscure the configuration and make the script hard to extend. Extract them into two checked-in values files the setup script consumes via `--values`. - tasks/scripts/observability-prometheus-values.yaml — slim chart config plus Grafana auto-provisioning of a Jaeger datasource (stable uid so dashboards can reference it). - tasks/scripts/observability-jaeger-values.yaml — all-in-one Jaeger. - PROMSTACK_VALUES and JAEGER_VALUES env vars allow pointing at custom files for local experimentation. --- .../scripts/observability-jaeger-values.yaml | 25 +++++++++++++ tasks/scripts/observability-k8s-setup.sh | 17 ++++----- .../observability-prometheus-values.yaml | 35 +++++++++++++++++++ 3 files changed, 66 insertions(+), 11 deletions(-) create mode 100644 tasks/scripts/observability-jaeger-values.yaml create mode 100644 tasks/scripts/observability-prometheus-values.yaml diff --git a/tasks/scripts/observability-jaeger-values.yaml b/tasks/scripts/observability-jaeger-values.yaml new file mode 100644 index 000000000..f89576f3f --- /dev/null +++ b/tasks/scripts/observability-jaeger-values.yaml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Helm values for the local-dev jaegertracing/jaeger install. +# Consumed by tasks/scripts/observability-k8s-setup.sh. +# +# All-in-one mode: single pod with in-memory storage, OTLP/gRPC receiver on +# :4317, OTLP/HTTP on :4318, UI on :16686. 
Enough for dashboards and trace +# inspection during dev; nothing persists across pod restarts. + +allInOne: + enabled: true + +storage: + type: memory + +provisionDataStore: + cassandra: false + +agent: + enabled: false +collector: + enabled: false +query: + enabled: false diff --git a/tasks/scripts/observability-k8s-setup.sh b/tasks/scripts/observability-k8s-setup.sh index 3e31bc47b..c771ad157 100755 --- a/tasks/scripts/observability-k8s-setup.sh +++ b/tasks/scripts/observability-k8s-setup.sh @@ -20,12 +20,16 @@ set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + MONITORING_NAMESPACE="${MONITORING_NAMESPACE:-monitoring}" OBSERVABILITY_NAMESPACE="${OBSERVABILITY_NAMESPACE:-observability}" PROMSTACK_RELEASE="${PROMSTACK_RELEASE:-kube-prometheus-stack}" PROMSTACK_VERSION="${PROMSTACK_VERSION:-75.0.0}" +PROMSTACK_VALUES="${PROMSTACK_VALUES:-${SCRIPT_DIR}/observability-prometheus-values.yaml}" JAEGER_RELEASE="${JAEGER_RELEASE:-jaeger}" JAEGER_VERSION="${JAEGER_VERSION:-3.4.0}" +JAEGER_VALUES="${JAEGER_VALUES:-${SCRIPT_DIR}/observability-jaeger-values.yaml}" HEALTH_TIMEOUT="${HEALTH_TIMEOUT:-180}" # --------------------------------------------------------------------------- @@ -51,11 +55,7 @@ helm upgrade --install "${PROMSTACK_RELEASE}" prometheus-community/kube-promethe --version "${PROMSTACK_VERSION}" \ --namespace "${MONITORING_NAMESPACE}" \ --create-namespace \ - --set alertmanager.enabled=false \ - --set nodeExporter.enabled=false \ - --set kubeStateMetrics.enabled=false \ - --set grafana.adminPassword=admin \ - --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ + --values "${PROMSTACK_VALUES}" \ --wait --timeout "${HEALTH_TIMEOUT}s" # --------------------------------------------------------------------------- @@ -67,12 +67,7 @@ helm upgrade --install "${JAEGER_RELEASE}" jaegertracing/jaeger \ --version "${JAEGER_VERSION}" \ --namespace "${OBSERVABILITY_NAMESPACE}" \ --create-namespace \ - --set 
allInOne.enabled=true \
-  --set storage.type=memory \
-  --set provisionDataStore.cassandra=false \
-  --set agent.enabled=false \
-  --set collector.enabled=false \
-  --set query.enabled=false \
+  --values "${JAEGER_VALUES}" \
   --wait --timeout "${HEALTH_TIMEOUT}s"
 
 # ---------------------------------------------------------------------------
diff --git a/tasks/scripts/observability-prometheus-values.yaml b/tasks/scripts/observability-prometheus-values.yaml
new file mode 100644
index 000000000..5d4813712
--- /dev/null
+++ b/tasks/scripts/observability-prometheus-values.yaml
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Helm values for the local-dev kube-prometheus-stack install.
+# Consumed by tasks/scripts/observability-k8s-setup.sh.
+#
+# Slimmed down for k3d: drops Alertmanager + node/kube-state exporters.
+# Real-cluster operators install the bundle separately and follow the
+# docs/kubernetes/monitoring.mdx guide.
+
+alertmanager:
+  enabled: false
+nodeExporter:
+  enabled: false
+kubeStateMetrics:
+  enabled: false
+
+prometheus:
+  prometheusSpec:
+    # Don't restrict to ServiceMonitors carrying the Helm release label —
+    # the openshell chart sets its own selector via monitoring.serviceMonitor.labels.
+    serviceMonitorSelectorNilUsesHelmValues: false
+
+grafana:
+  adminPassword: admin
+  # Auto-provision Jaeger as a trace datasource. The Prometheus datasource is
+  # added by the chart automatically.
+  additionalDataSources:
+    - name: Jaeger
+      uid: jaeger
+      type: jaeger
+      access: proxy
+      # Service created by the Jaeger release configured via observability-jaeger-values.yaml.
+      url: http://jaeger-query.observability.svc.cluster.local:16686
+      editable: true