diff --git a/Cargo.lock b/Cargo.lock index beac03238..cba681774 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3476,6 +3476,7 @@ dependencies = [ "bytes", "futures", "openshell-core", + "serde", "tar", "tempfile", "tokio", @@ -3498,6 +3499,7 @@ dependencies = [ "openshell-core", "prost", "prost-types", + "serde", "serde_json", "thiserror 2.0.18", "tokio", @@ -3729,6 +3731,7 @@ dependencies = [ "tokio-rustls", "tokio-stream", "tokio-tungstenite 0.26.2", + "toml", "tonic", "tower 0.5.3", "tower-http 0.6.8", @@ -5201,6 +5204,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -6138,6 +6150,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap 2.14.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tonic" version = "0.12.3" @@ 
-7227,6 +7280,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wiremock" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index 195544431..3fea379a2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ nix = { version = "0.29", features = ["signal", "process", "user", "fs", "term"] serde = { version = "1", features = ["derive"] } serde_json = "1" serde_yml = "0.0.12" +toml = "0.8" apollo-parser = "0.8.5" # HTTP client diff --git a/architecture/gateway.md b/architecture/gateway.md index 68832d0cf..e9cbe187d 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -193,6 +193,44 @@ pre-created Secrets) disable the Helm hook via `pkiInitJob.enabled=false`. The chart also ships a `certManager.*` path that produces equivalent Secrets through cert-manager `Issuer`/`Certificate` resources. +## Configuration + +The gateway reads its configuration from three sources, merged in this +precedence (highest first): + +``` +Gateway CLI flag > gateway OPENSHELL_* env var > TOML file > built-in default +``` + +The TOML file is opt-in via `--config <path>` / `OPENSHELL_GATEWAY_CONFIG`. +Driver implementation settings live in the TOML driver tables. See +`docs/reference/gateway-config.mdx` for worked per-driver examples and RFC +0003 for the full schema. + +`database_url` is env-only and rejected when present in the file +(`OPENSHELL_DB_URL` / `--db-url`).
+ +### Driver inheritance + +`[openshell.gateway]` carries a small set of values (`sandbox_namespace`, +`default_image`, +`supervisor_image`, `guest_tls_ca/cert/key`, `client_tls_secret_name`, +`host_gateway_ip`, `enable_user_namespaces`) that are inherited into each +driver's `[openshell.drivers.<name>]` table when the driver-specific table +does not override them. The allowlist is per-driver so a gateway-wide +default cannot land in a driver that does not understand it (e.g. +`client_tls_secret_name` is K8s-only). + +`image_pull_policy` is intentionally **not** inheritable: Kubernetes uses +`Always | IfNotPresent | Never` (passed verbatim to the K8s API) while +Podman uses the lowercase enum `always | missing | never | newer`. No +single value means the same thing in both, so the key lives only under each +driver's own table. + +Driver-specific values that are not part of the inheritance allowlist +(e.g. Podman `socket_path`, VM `vcpus`) only come from the driver's own +table. + ## Operational Constraints - Gateway TLS and client certificate distribution are deployment concerns owned diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index cfa625139..e045d0a52 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -21,15 +21,12 @@ use std::str::FromStr; /// Default SSH port inside sandbox containers. pub const DEFAULT_SSH_PORT: u16 = 2222; -/// Default server / SSH gateway port. +/// Default gateway server port. pub const DEFAULT_SERVER_PORT: u16 = 8080; /// Default container stop timeout in seconds (SIGTERM → SIGKILL). pub const DEFAULT_STOP_TIMEOUT_SECS: u32 = 10; -/// Default Podman bridge network name. -pub const DEFAULT_NETWORK_NAME: &str = "openshell"; - /// Default Docker bridge network name for local sandboxes.
pub const DEFAULT_DOCKER_NETWORK_NAME: &str = "openshell-docker"; @@ -39,12 +36,6 @@ pub const DEFAULT_SERVICE_ROUTING_DOMAIN: &str = "openshell.localhost"; /// Default OCI image for the openshell-sandbox supervisor binary. pub const DEFAULT_SUPERVISOR_IMAGE: &str = "openshell/supervisor:latest"; -/// Default image pull policy for sandbox images. -pub const DEFAULT_IMAGE_PULL_POLICY: &str = "missing"; - -/// Default Kubernetes namespace for sandbox resources. -pub const DEFAULT_K8S_NAMESPACE: &str = "openshell"; - /// CDI device identifier for requesting all NVIDIA GPUs. pub const CDI_GPU_DEVICE_ALL: &str = "nvidia.com/gpu=all"; @@ -225,75 +216,10 @@ pub struct Config { #[serde(default)] pub compute_drivers: Vec, - /// Kubernetes namespace for sandboxes. - #[serde(default = "default_sandbox_namespace")] - pub sandbox_namespace: String, - - /// Default container image for sandboxes. - #[serde(default = "default_sandbox_image")] - pub sandbox_image: String, - - /// Kubernetes `imagePullPolicy` for sandbox pods (e.g. `Always`, - /// `IfNotPresent`, `Never`). Defaults to empty, which lets Kubernetes - /// apply its own default (`:latest` → `Always`, anything else → - /// `IfNotPresent`). - #[serde(default)] - pub sandbox_image_pull_policy: String, - - /// gRPC endpoint for sandboxes to connect back to `OpenShell`. - /// Used by sandbox pods to fetch their policy at startup. - #[serde(default)] - pub grpc_endpoint: String, - - /// Public gateway host for SSH proxy connections. - #[serde(default = "default_ssh_gateway_host")] - pub ssh_gateway_host: String, - - /// Public gateway port for SSH proxy connections. - #[serde(default = "default_ssh_gateway_port")] - pub ssh_gateway_port: u16, - - /// SSH listen port inside sandbox containers that expose a TCP endpoint. - #[serde(default = "default_sandbox_ssh_port")] - pub sandbox_ssh_port: u16, - - /// Filesystem path where the sandbox supervisor binds its SSH Unix - /// socket. 
The supervisor is passed this path via - /// `OPENSHELL_SSH_SOCKET_PATH` / `--ssh-socket-path` and connects its - /// relay bridge to the same path. - /// - /// When the gateway orchestrates sandboxes that each live in their own - /// filesystem (K8s pod, libkrun VM, etc.), the default is safe. For - /// local dev where multiple supervisors share `/run`, override this to - /// something unique per sandbox. - #[serde(default = "default_sandbox_ssh_socket_path")] - pub sandbox_ssh_socket_path: String, - /// TTL for SSH session tokens, in seconds. 0 disables expiry. #[serde(default = "default_ssh_session_ttl_secs")] pub ssh_session_ttl_secs: u64, - /// Kubernetes secret name containing client TLS materials for sandbox pods. - /// When set, sandbox pods get this secret mounted so they can connect to - /// the server over mTLS. - #[serde(default)] - pub client_tls_secret_name: String, - - /// Host gateway IP for sandbox pod hostAliases. - /// When set, sandbox pods get hostAliases entries mapping - /// `host.docker.internal` and `host.openshell.internal` to this IP, - /// allowing them to reach services running on the Docker host. - #[serde(default)] - pub host_gateway_ip: String, - - /// Enable Kubernetes user namespace isolation (`hostUsers: false`) for - /// sandbox pods. When enabled, container UID 0 maps to an unprivileged - /// host UID and capabilities become namespaced. Requires Kubernetes 1.33+ - /// with user namespace support available (beta through 1.35, GA in 1.36+), - /// plus a supporting container runtime and Linux 5.12+. - #[serde(default)] - pub enable_user_namespaces: bool, - /// Browser-facing sandbox service routing configuration. 
#[serde(default)] pub service_routing: ServiceRoutingConfig, @@ -416,18 +342,7 @@ impl Config { oidc: None, database_url: String::new(), compute_drivers: vec![], - sandbox_namespace: default_sandbox_namespace(), - sandbox_image: default_sandbox_image(), - sandbox_image_pull_policy: String::new(), - grpc_endpoint: String::new(), - ssh_gateway_host: default_ssh_gateway_host(), - ssh_gateway_port: default_ssh_gateway_port(), - sandbox_ssh_port: default_sandbox_ssh_port(), - sandbox_ssh_socket_path: default_sandbox_ssh_socket_path(), ssh_session_ttl_secs: default_ssh_session_ttl_secs(), - client_tls_secret_name: String::new(), - host_gateway_ip: String::new(), - enable_user_namespaces: false, service_routing: ServiceRoutingConfig::default(), } } @@ -488,55 +403,6 @@ impl Config { self } - /// Create a new configuration with a sandbox namespace. - #[must_use] - pub fn with_sandbox_namespace(mut self, namespace: impl Into) -> Self { - self.sandbox_namespace = namespace.into(); - self - } - - /// Create a new configuration with a default sandbox image. - #[must_use] - pub fn with_sandbox_image(mut self, image: impl Into) -> Self { - self.sandbox_image = image.into(); - self - } - - /// Create a new configuration with a sandbox image pull policy. - #[must_use] - pub fn with_sandbox_image_pull_policy(mut self, policy: impl Into) -> Self { - self.sandbox_image_pull_policy = policy.into(); - self - } - - /// Create a new configuration with a gRPC endpoint for sandbox callback. - #[must_use] - pub fn with_grpc_endpoint(mut self, endpoint: impl Into) -> Self { - self.grpc_endpoint = endpoint.into(); - self - } - - /// Create a new configuration with the SSH gateway host. - #[must_use] - pub fn with_ssh_gateway_host(mut self, host: impl Into) -> Self { - self.ssh_gateway_host = host.into(); - self - } - - /// Create a new configuration with the SSH gateway port. 
- #[must_use] - pub const fn with_ssh_gateway_port(mut self, port: u16) -> Self { - self.ssh_gateway_port = port; - self - } - - /// Create a new configuration with the sandbox SSH port. - #[must_use] - pub const fn with_sandbox_ssh_port(mut self, port: u16) -> Self { - self.sandbox_ssh_port = port; - self - } - /// Create a new configuration with the SSH session TTL. #[must_use] pub const fn with_ssh_session_ttl_secs(mut self, secs: u64) -> Self { @@ -544,20 +410,6 @@ impl Config { self } - /// Set the Kubernetes secret name for sandbox client TLS materials. - #[must_use] - pub fn with_client_tls_secret_name(mut self, name: impl Into) -> Self { - self.client_tls_secret_name = name.into(); - self - } - - /// Set the host gateway IP for sandbox pod hostAliases. - #[must_use] - pub fn with_host_gateway_ip(mut self, ip: impl Into) -> Self { - self.host_gateway_ip = ip.into(); - self - } - /// Set the OIDC configuration for JWT-based authentication. #[must_use] pub fn with_oidc(mut self, oidc: OidcConfig) -> Self { @@ -662,30 +514,6 @@ fn default_log_level() -> String { "info".to_string() } -fn default_sandbox_namespace() -> String { - "default".to_string() -} - -fn default_sandbox_image() -> String { - format!("{}/base:latest", crate::image::DEFAULT_COMMUNITY_REGISTRY) -} - -fn default_ssh_gateway_host() -> String { - "127.0.0.1".to_string() -} - -const fn default_ssh_gateway_port() -> u16 { - DEFAULT_SERVER_PORT -} - -fn default_sandbox_ssh_socket_path() -> String { - "/run/openshell/ssh.sock".to_string() -} - -const fn default_sandbox_ssh_port() -> u16 { - DEFAULT_SSH_PORT -} - const fn default_ssh_session_ttl_secs() -> u64 { 86400 // 24 hours } diff --git a/crates/openshell-core/src/forward.rs b/crates/openshell-core/src/forward.rs index 82fe0114c..a5e373e61 100644 --- a/crates/openshell-core/src/forward.rs +++ b/crates/openshell-core/src/forward.rs @@ -462,7 +462,13 @@ pub fn resolve_ssh_gateway( // Remote cluster: use the remote host but keep the cluster URL 
port. return (host.to_string(), cluster_port); } - // Local cluster: both loopback — use cluster URL's port (Docker-mapped). + // Both endpoints loopback. The unspecified addresses (0.0.0.0 / ::) + // are bind-only — they aren't valid connect targets and aren't in TLS + // cert SANs, so fall back to the cluster URL's host (which the CLI + // is already using to reach the gateway). + if gateway_host == "0.0.0.0" || gateway_host == "::" { + return (host.to_string(), cluster_port); + } return (gateway_host.to_string(), cluster_port); } @@ -693,6 +699,16 @@ mod tests { assert_eq!(port, 8080); } + #[test] + fn resolve_ssh_gateway_swaps_zeros_for_loopback_cluster_host() { + // The gateway binds 0.0.0.0 but advertises that bind address via the + // SSH session response. 0.0.0.0 is not a valid connect target and is + // not in any TLS cert SAN; fall through to the cluster URL's host. + let (host, port) = resolve_ssh_gateway("0.0.0.0", 8080, "https://127.0.0.1:9000"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 9000); + } + #[test] fn resolve_ssh_gateway_handles_invalid_cluster_url() { let (host, port) = resolve_ssh_gateway("127.0.0.1", 8080, "not-a-url"); diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml index 79d4fb37d..e2c97532a 100644 --- a/crates/openshell-driver-docker/Cargo.toml +++ b/crates/openshell-driver-docker/Cargo.toml @@ -19,6 +19,7 @@ futures = { workspace = true } tokio-stream = { workspace = true } tracing = { workspace = true } bytes = { workspace = true } +serde = { workspace = true } bollard = { version = "0.20" } tar = "0.4" tempfile = "3" diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index 20cfe6a0f..434e70d13 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -39,10 +39,10 @@ The agent child process does not retain these supervisor privileges. 
The Docker driver bind-mounts a host-side Linux `openshell-sandbox` binary into each sandbox container. Resolution order is: -1. `--docker-supervisor-bin` / `OPENSHELL_DOCKER_SUPERVISOR_BIN`. +1. `supervisor_bin` in `[openshell.drivers.docker]`. 2. A sibling `openshell-sandbox` next to the running `openshell-gateway` binary. 3. A local Linux cargo target build for the Docker daemon architecture. -4. `--docker-supervisor-image` / `OPENSHELL_DOCKER_SUPERVISOR_IMAGE`, or the +4. `supervisor_image` in `[openshell.drivers.docker]`, or the release-matched default supervisor image, extracting `/openshell-sandbox`. Release and Docker-image gateway builds bake the matching supervisor image tag diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 7741e8393..30507422b 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -65,7 +65,7 @@ const DOCKER_NETWORK_DRIVER: &str = "bridge"; /// Default image holding the Linux `openshell-sandbox` binary. The gateway /// pulls this image and extracts the binary to a host-side cache when no -/// explicit `--docker-supervisor-bin` override or local build is available. +/// explicit `supervisor_bin` override or local build is available. const DEFAULT_DOCKER_SUPERVISOR_IMAGE_REPO: &str = "ghcr.io/nvidia/openshell/supervisor"; /// Path to the supervisor binary inside the `openshell/supervisor` image @@ -127,8 +127,21 @@ pub trait SupervisorReadiness: Send + Sync + 'static { } /// Gateway-local configuration for the Docker compute driver. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct DockerComputeConfig { + /// Default OCI image for sandboxes. + pub default_image: String, + + /// Image pull policy for sandbox images. + pub image_pull_policy: String, + + /// Namespace label applied to Docker sandboxes. 
+ pub sandbox_namespace: String, + + /// Gateway gRPC endpoint the sandbox connects back to. + pub grpc_endpoint: String, + /// Optional override for the Linux `openshell-sandbox` binary mounted into containers. pub supervisor_bin: Option, @@ -149,6 +162,38 @@ pub struct DockerComputeConfig { /// Docker bridge network that sandbox containers join. pub network_name: String, + + /// Host gateway IP used for sandbox host aliases. + pub host_gateway_ip: String, + + /// Unix socket path the in-container supervisor bridges relay traffic to. + pub ssh_socket_path: String, +} + +impl Default for DockerComputeConfig { + fn default() -> Self { + Self { + default_image: default_sandbox_image(), + image_pull_policy: String::new(), + sandbox_namespace: "default".to_string(), + grpc_endpoint: String::new(), + supervisor_bin: None, + supervisor_image: None, + guest_tls_ca: None, + guest_tls_cert: None, + guest_tls_key: None, + network_name: DEFAULT_DOCKER_NETWORK_NAME.to_string(), + host_gateway_ip: String::new(), + ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + } + } +} + +fn default_sandbox_image() -> String { + format!( + "{}/base:latest", + openshell_core::image::DEFAULT_COMMUNITY_REGISTRY + ) } #[derive(Debug, Clone, PartialEq, Eq)] @@ -207,7 +252,7 @@ impl DockerComputeDriver { docker_config: &DockerComputeConfig, supervisor_readiness: Arc, ) -> CoreResult { - if config.grpc_endpoint.trim().is_empty() { + if docker_config.grpc_endpoint.trim().is_empty() { return Err(Error::config( "grpc_endpoint is required when using the docker compute driver", )); @@ -233,28 +278,28 @@ impl DockerComputeDriver { } let network_name = docker_network_name(docker_config); let bridge_gateway_ip = ensure_bridge_network(&docker, &network_name).await?; - let host_gateway_ip = parse_optional_host_gateway_ip(&config.host_gateway_ip)?; + let host_gateway_ip = parse_optional_host_gateway_ip(&docker_config.host_gateway_ip)?; let gateway_route = docker_gateway_route(&info, bridge_gateway_ip, 
gateway_port, host_gateway_ip); let grpc_endpoint = docker_container_openshell_endpoint( - &config.grpc_endpoint, + &docker_config.grpc_endpoint, HOST_OPENSHELL_INTERNAL, gateway_port, ); let daemon_arch = normalize_docker_arch(version.arch.as_deref().unwrap_or_default()); let supervisor_bin = resolve_supervisor_bin(&docker, docker_config, &daemon_arch).await?; - let guest_tls = docker_guest_tls_paths(config, docker_config)?; + let guest_tls = docker_guest_tls_paths(docker_config)?; let driver = Self { docker: Arc::new(docker), config: DockerDriverRuntimeConfig { - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.clone(), - sandbox_namespace: config.sandbox_namespace.clone(), + default_image: docker_config.default_image.clone(), + image_pull_policy: docker_config.image_pull_policy.clone(), + sandbox_namespace: docker_config.sandbox_namespace.clone(), grpc_endpoint, network_name, gateway_route, - ssh_socket_path: config.sandbox_ssh_socket_path.clone(), + ssh_socket_path: docker_config.ssh_socket_path.clone(), stop_timeout_secs: DEFAULT_STOP_TIMEOUT_SECS, log_level: config.log_level.clone(), supervisor_bin, @@ -694,12 +739,12 @@ impl DockerComputeDriver { "never" => match self.docker.inspect_image(image).await { Ok(_) => Ok(()), Err(err) if is_not_found_error(&err) => Err(Status::failed_precondition(format!( - "docker image '{image}' is not present locally and sandbox_image_pull_policy=Never" + "docker image '{image}' is not present locally and image_pull_policy=Never" ))), Err(err) => Err(internal_status("inspect Docker image", err)), }, other => Err(Status::failed_precondition(format!( - "unsupported docker sandbox_image_pull_policy '{other}'; expected Always, IfNotPresent, or Never", + "unsupported docker image_pull_policy '{other}'; expected Always, IfNotPresent, or Never", ))), } } @@ -1079,11 +1124,10 @@ fn parse_optional_host_gateway_ip(value: &str) -> CoreResult> { return Ok(None); } - 
trimmed.parse().map(Some).map_err(|err| { - Error::config(format!( - "invalid OPENSHELL_HOST_GATEWAY_IP value '{trimmed}': {err}" - )) - }) + trimmed + .parse() + .map(Some) + .map_err(|err| Error::config(format!("invalid host_gateway_ip value '{trimmed}': {err}"))) } fn docker_gateway_route( @@ -1622,7 +1666,7 @@ pub(crate) async fn resolve_supervisor_bin( docker_config: &DockerComputeConfig, daemon_arch: &str, ) -> CoreResult { - // Tier 1: explicit --docker-supervisor-bin / OPENSHELL_DOCKER_SUPERVISOR_BIN. + // Tier 1: explicit supervisor_bin in [openshell.drivers.docker]. if let Some(path) = docker_config.supervisor_bin.clone() { let path = canonicalize_existing_file(&path, "docker supervisor binary")?; validate_linux_elf_binary(&path)?; @@ -1966,18 +2010,17 @@ pub(crate) fn validate_linux_elf_binary(path: &Path) -> CoreResult<()> { } pub(crate) fn docker_guest_tls_paths( - config: &Config, docker_config: &DockerComputeConfig, ) -> CoreResult> { let tls_flags_provided = docker_config.guest_tls_ca.is_some() || docker_config.guest_tls_cert.is_some() || docker_config.guest_tls_key.is_some(); - if !config.grpc_endpoint.starts_with("https://") { + if !docker_config.grpc_endpoint.starts_with("https://") { if tls_flags_provided { return Err(Error::config(format!( - "--docker-tls-ca/--docker-tls-cert/--docker-tls-key were provided but OPENSHELL_GRPC_ENDPOINT is '{}'; TLS materials require an https:// endpoint", - config.grpc_endpoint, + "guest_tls_ca/guest_tls_cert/guest_tls_key were provided but grpc_endpoint is '{}'; TLS materials require an https:// endpoint", + docker_config.grpc_endpoint, ))); } return Ok(None); @@ -1990,23 +2033,23 @@ pub(crate) fn docker_guest_tls_paths( ]; if provided.iter().all(Option::is_none) { return Err(Error::config( - "docker compute driver requires --docker-tls-ca, --docker-tls-cert, and --docker-tls-key when OPENSHELL_GRPC_ENDPOINT uses https://", + "docker compute driver requires guest_tls_ca, guest_tls_cert, and guest_tls_key when 
grpc_endpoint uses https://", )); } let Some(ca) = docker_config.guest_tls_ca.clone() else { return Err(Error::config( - "--docker-tls-ca is required when Docker sandbox TLS materials are configured", + "guest_tls_ca is required when Docker sandbox TLS materials are configured", )); }; let Some(cert) = docker_config.guest_tls_cert.clone() else { return Err(Error::config( - "--docker-tls-cert is required when Docker sandbox TLS materials are configured", + "guest_tls_cert is required when Docker sandbox TLS materials are configured", )); }; let Some(key) = docker_config.guest_tls_key.clone() else { return Err(Error::config( - "--docker-tls-key is required when Docker sandbox TLS materials are configured", + "guest_tls_key is required when Docker sandbox TLS materials are configured", )); }; diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index b3ea3b4a8..62a6b89e4 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -327,7 +327,7 @@ fn parse_optional_host_gateway_ip_rejects_invalid_values() { parse_optional_host_gateway_ip("not-an-ip") .unwrap_err() .to_string() - .contains("OPENSHELL_HOST_GATEWAY_IP") + .contains("host_gateway_ip") ); } @@ -708,20 +708,17 @@ fn validate_linux_elf_binary_rejects_non_elf_files() { #[test] fn docker_guest_tls_paths_require_all_files_for_https() { - let config = Config::new(None).with_grpc_endpoint("https://localhost:8443"); let tempdir = TempDir::new().unwrap(); let ca = tempdir.path().join("ca.crt"); fs::write(&ca, b"ca").unwrap(); - let err = docker_guest_tls_paths( - &config, - &DockerComputeConfig { - guest_tls_ca: Some(ca), - ..Default::default() - }, - ) + let err = docker_guest_tls_paths(&DockerComputeConfig { + grpc_endpoint: "https://localhost:8443".to_string(), + guest_tls_ca: Some(ca), + ..Default::default() + }) .unwrap_err(); - assert!(err.to_string().contains("--docker-tls-cert")); + 
assert!(err.to_string().contains("guest_tls_cert")); } #[test] @@ -798,26 +795,26 @@ fn trim_container_name_tail_strips_separators() { #[test] fn docker_guest_tls_paths_rejects_tls_flags_without_https() { - let config = Config::new(None).with_grpc_endpoint("http://localhost:8080"); let tempdir = TempDir::new().unwrap(); let ca = tempdir.path().join("ca.crt"); fs::write(&ca, b"ca").unwrap(); - let err = docker_guest_tls_paths( - &config, - &DockerComputeConfig { - guest_tls_ca: Some(ca), - ..Default::default() - }, - ) + let err = docker_guest_tls_paths(&DockerComputeConfig { + grpc_endpoint: "http://localhost:8080".to_string(), + guest_tls_ca: Some(ca), + ..Default::default() + }) .unwrap_err(); assert!(err.to_string().contains("https://")); } #[test] fn docker_guest_tls_paths_allows_plain_http_without_tls_flags() { - let config = Config::new(None).with_grpc_endpoint("http://localhost:8080"); - let result = docker_guest_tls_paths(&config, &DockerComputeConfig::default()).unwrap(); + let result = docker_guest_tls_paths(&DockerComputeConfig { + grpc_endpoint: "http://localhost:8080".to_string(), + ..Default::default() + }) + .unwrap(); assert!(result.is_none()); } diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index 5e247dc77..c222c9c31 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -26,6 +26,7 @@ tokio-stream = { workspace = true } kube = { workspace = true } kube-runtime = { workspace = true } k8s-openapi = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index c5ec9d5ae..28c04deb3 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -1,8 +1,15 @@ // 
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use openshell_core::config::DEFAULT_SUPERVISOR_IMAGE; +use serde::{Deserialize, Serialize}; + +/// Default Kubernetes namespace for sandbox resources. +pub const DEFAULT_K8S_NAMESPACE: &str = "openshell"; + /// How the supervisor binary is delivered into sandbox pods. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] pub enum SupervisorSideloadMethod { /// Mount the supervisor OCI image directly as a read-only volume /// (requires Kubernetes >= v1.33 with the `ImageVolume` feature gate, @@ -37,7 +44,8 @@ impl std::str::FromStr for SupervisorSideloadMethod { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct KubernetesComputeConfig { pub namespace: String, pub default_image: String, @@ -57,3 +65,32 @@ pub struct KubernetesComputeConfig { pub host_gateway_ip: String, pub enable_user_namespaces: bool, } + +impl Default for KubernetesComputeConfig { + fn default() -> Self { + Self { + namespace: DEFAULT_K8S_NAMESPACE.to_string(), + default_image: default_sandbox_image(), + // Default empty so the gateway omits `imagePullPolicy` from pod + // specs and Kubernetes applies its own default (Always for `latest`, + // IfNotPresent otherwise). `DEFAULT_IMAGE_PULL_POLICY` ("missing") + // is Podman vocabulary and is not a valid Kubernetes value. 
+ image_pull_policy: String::new(), + supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), + supervisor_image_pull_policy: String::new(), + supervisor_sideload_method: SupervisorSideloadMethod::default(), + grpc_endpoint: String::new(), + ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + client_tls_secret_name: String::new(), + host_gateway_ip: String::new(), + enable_user_namespaces: false, + } + } +} + +fn default_sandbox_image() -> String { + format!( + "{}/base:latest", + openshell_core::image::DEFAULT_COMMUNITY_REGISTRY + ) +} diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index 51bf5f4e3..1906bd912 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -291,8 +291,7 @@ Podman resources after out-of-band container removal or label drift. | `OPENSHELL_GRPC_ENDPOINT` | `--grpc-endpoint` | Auto-detected via `host.containers.internal` | Gateway gRPC endpoint for sandbox callbacks. | | `OPENSHELL_GATEWAY_PORT` | `--gateway-port` | `8080` | Gateway port used for endpoint auto-detection by the standalone binary. | | `OPENSHELL_NETWORK_NAME` | `--network-name` | `openshell` | Podman bridge network name. | -| `OPENSHELL_SANDBOX_SSH_PORT` | `--sandbox-ssh-port` | `2222` | SSH compatibility port inside the container. | -| `OPENSHELL_SANDBOX_SSH_SOCKET_PATH` | `--sandbox-ssh-socket-path` | `/run/openshell/ssh.sock` | Standalone driver only: supervisor Unix socket path in `PodmanComputeConfig`. In-gateway Podman uses server `config.sandbox_ssh_socket_path`. | +| `OPENSHELL_SANDBOX_SSH_SOCKET_PATH` | `--sandbox-ssh-socket-path` | `/run/openshell/ssh.sock` | Supervisor Unix socket path in `PodmanComputeConfig`. | | `OPENSHELL_STOP_TIMEOUT` | `--stop-timeout` | `10` | Container stop timeout in seconds. 
| | `OPENSHELL_SUPERVISOR_IMAGE` | `--supervisor-image` | `openshell/supervisor:latest` through the gateway, required standalone | OCI image containing the supervisor binary. | | `OPENSHELL_PODMAN_TLS_CA` | `--podman-tls-ca` | unset | Host path to the CA certificate mounted for sandbox mTLS. | diff --git a/crates/openshell-driver-podman/src/config.rs b/crates/openshell-driver-podman/src/config.rs index 43f7d1fd0..c78c2b12f 100644 --- a/crates/openshell-driver-podman/src/config.rs +++ b/crates/openshell-driver-podman/src/config.rs @@ -1,12 +1,13 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use openshell_core::config::{ - DEFAULT_NETWORK_NAME, DEFAULT_SSH_PORT, DEFAULT_STOP_TIMEOUT_SECS, DEFAULT_SUPERVISOR_IMAGE, -}; +use openshell_core::config::{DEFAULT_STOP_TIMEOUT_SECS, DEFAULT_SUPERVISOR_IMAGE}; use std::path::PathBuf; use std::str::FromStr; +/// Default Podman bridge network name. +pub const DEFAULT_NETWORK_NAME: &str = "openshell"; + /// Image pull policy for sandbox and supervisor images. /// /// Controls when the Podman driver fetches a newer copy of an OCI image @@ -60,7 +61,8 @@ impl FromStr for ImagePullPolicy { } } -#[derive(Clone)] +#[derive(Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct PodmanComputeConfig { /// Path to the Podman API Unix socket. /// Default: `$XDG_RUNTIME_DIR/podman/podman.sock` (Linux), @@ -87,8 +89,6 @@ pub struct PodmanComputeConfig { /// Name of the Podman bridge network. /// Created automatically if it does not exist. pub network_name: String, - /// SSH port inside the container. - pub ssh_port: u16, /// Container stop timeout in seconds (SIGTERM → SIGKILL). pub stop_timeout_secs: u32, /// OCI image containing the openshell-sandbox supervisor binary. 
@@ -180,13 +180,12 @@ impl Default for PodmanComputeConfig { fn default() -> Self { Self { socket_path: Self::default_socket_path(), - default_image: String::new(), + default_image: default_sandbox_image(), image_pull_policy: ImagePullPolicy::default(), grpc_endpoint: String::new(), gateway_port: openshell_core::config::DEFAULT_SERVER_PORT, sandbox_ssh_socket_path: "/run/openshell/ssh.sock".to_string(), network_name: DEFAULT_NETWORK_NAME.to_string(), - ssh_port: DEFAULT_SSH_PORT, stop_timeout_secs: DEFAULT_STOP_TIMEOUT_SECS, supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), guest_tls_ca: None, @@ -196,6 +195,13 @@ impl Default for PodmanComputeConfig { } } +fn default_sandbox_image() -> String { + format!( + "{}/base:latest", + openshell_core::image::DEFAULT_COMMUNITY_REGISTRY + ) +} + impl std::fmt::Debug for PodmanComputeConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("PodmanComputeConfig") @@ -206,7 +212,6 @@ impl std::fmt::Debug for PodmanComputeConfig { .field("gateway_port", &self.gateway_port) .field("sandbox_ssh_socket_path", &self.sandbox_ssh_socket_path) .field("network_name", &self.network_name) - .field("ssh_port", &self.ssh_port) .field("stop_timeout_secs", &self.stop_timeout_secs) .field("supervisor_image", &self.supervisor_image) .field("guest_tls_ca", &self.guest_tls_ca) diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index cd1baf7ec..1cb58e338 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -490,7 +490,8 @@ pub fn build_container_spec(sandbox: &DriverSandbox, config: &PodmanComputeConfi "CMD-SHELL".into(), format!( "test -e /var/run/openshell-ssh-ready || test -S {} || ss -tlnp | grep -q :{}", - config.sandbox_ssh_socket_path, config.ssh_port + config.sandbox_ssh_socket_path, + openshell_core::config::DEFAULT_SSH_PORT ), ], interval: 3_000_000_000, @@ -569,7 +570,7 @@ 
pub fn build_container_spec(sandbox: &DriverSandbox, config: &PodmanComputeConfi // the host, so we must use the published host port on 127.0.0.1 instead. portmappings: vec![PortMapping { host_port: 0, - container_port: config.ssh_port, + container_port: openshell_core::config::DEFAULT_SSH_PORT, protocol: "tcp".into(), }], }; diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index 04c360bb7..a2a1e15d6 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -256,7 +256,7 @@ impl PodmanComputeDriver { let image = container::resolve_image(sandbox, &self.config); if image.is_empty() { return Err(ComputeDriverError::Precondition( - "no sandbox image configured: set --sandbox-image on the server \ + "no sandbox image configured: set default_image in [openshell.drivers.podman] \ or provide an image in the sandbox template" .to_string(), )); diff --git a/crates/openshell-driver-podman/src/main.rs b/crates/openshell-driver-podman/src/main.rs index 9c31100ac..5a0227ef6 100644 --- a/crates/openshell-driver-podman/src/main.rs +++ b/crates/openshell-driver-podman/src/main.rs @@ -9,9 +9,9 @@ use tracing::info; use tracing_subscriber::EnvFilter; use openshell_core::VERSION; -use openshell_core::config::{DEFAULT_NETWORK_NAME, DEFAULT_SSH_PORT, DEFAULT_STOP_TIMEOUT_SECS}; +use openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS; use openshell_core::proto::compute::v1::compute_driver_server::ComputeDriverServer; -use openshell_driver_podman::config::ImagePullPolicy; +use openshell_driver_podman::config::{DEFAULT_NETWORK_NAME, ImagePullPolicy}; use openshell_driver_podman::{ComputeDriverService, PodmanComputeConfig, PodmanComputeDriver}; #[derive(Parser)] @@ -67,9 +67,6 @@ struct Args { #[arg(long, env = "OPENSHELL_NETWORK_NAME", default_value = DEFAULT_NETWORK_NAME)] network_name: String, - #[arg(long, env = "OPENSHELL_SANDBOX_SSH_PORT", default_value_t = DEFAULT_SSH_PORT)] - 
sandbox_ssh_port: u16, - /// Container stop timeout in seconds (SIGTERM → SIGKILL). #[arg(long, env = "OPENSHELL_STOP_TIMEOUT", default_value_t = DEFAULT_STOP_TIMEOUT_SECS)] stop_timeout: u32, @@ -112,7 +109,6 @@ async fn main() -> Result<()> { gateway_port: args.gateway_port, sandbox_ssh_socket_path: args.sandbox_ssh_socket_path, network_name: args.network_name, - ssh_port: args.sandbox_ssh_port, stop_timeout_secs: args.stop_timeout, supervisor_image: args.supervisor_image, guest_tls_ca: args.podman_tls_ca, diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index 371c6cefe..dc1b112dd 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -43,7 +43,7 @@ By default `mise run gateway:vm`: - Registers the CLI gateway `vm-dev` by writing `~/.config/openshell/gateways/vm-dev/metadata.json`. It does not modify the workspace `.env`. - Persists the gateway SQLite DB under `.cache/gateway-vm/gateway.db`. - Places the VM driver state (per-sandbox rootfs plus `run/compute-driver.sock`) under `/tmp/openshell-vm-driver-$USER-vm-dev/` so the AF_UNIX socket path stays under macOS `SUN_LEN`. -- Passes `--driver-dir $PWD/target/debug` so the freshly built `openshell-driver-vm` is used instead of an older installed copy from `~/.local/libexec/openshell`, `/usr/libexec/openshell`, or `/usr/local/libexec`. +- Writes `.cache/gateway-vm/gateway.toml` with `[openshell.drivers.vm].driver_dir = "$PWD/target/debug"` so the freshly built `openshell-driver-vm` is used instead of an older installed copy from `~/.local/libexec/openshell`, `/usr/libexec/openshell`, or `/usr/local/libexec`. For GPU passthrough (VFIO), pass `-- --gpu` and run with root privileges: @@ -104,36 +104,36 @@ codesign \ # 4. 
Start the gateway with the VM driver mkdir -p /tmp/openshell-vm-driver-$USER-vm-dev .cache/gateway-vm +cat > .cache/gateway-vm/gateway.toml < \ - --grpc-endpoint http://host.containers.internal:18081 \ - --port 18081 \ - --vm-driver-state-dir /tmp/openshell-vm-driver-$USER-vm-dev + --port 18081 ``` -The gateway resolves `openshell-driver-vm` in this order: `--driver-dir`, conventional install locations (`~/.local/libexec/openshell`, `/usr/libexec/openshell`, `/usr/local/libexec/openshell`, `/usr/local/libexec`), then a sibling of the gateway binary. +The gateway resolves `openshell-driver-vm` in this order: `[openshell.drivers.vm].driver_dir`, conventional install locations (`~/.local/libexec/openshell`, `/usr/libexec/openshell`, `/usr/local/libexec/openshell`, `/usr/local/libexec`), then a sibling of the gateway binary. -## Flags +## Gateway And Driver Configuration -| Flag | Env var | Default | Purpose | -|---|---|---|---| -| `--drivers vm` | `OPENSHELL_DRIVERS` | `kubernetes` | Select the VM compute driver. | -| `--grpc-endpoint URL` | `OPENSHELL_GRPC_ENDPOINT` | — | Required. URL the sandbox guest dials to reach the gateway. Use `http://host.containers.internal:` (or `host.docker.internal` / `host.openshell.internal`) so traffic flows through gvproxy's host-loopback NAT (HostIP `192.168.127.254` → host `127.0.0.1`). Loopback URLs like `http://127.0.0.1:` are rewritten automatically by the driver. The bare gateway IP (`192.168.127.1`) only carries gvproxy's own services and will not reach host-bound ports. | -| `--vm-driver-state-dir DIR` | `OPENSHELL_VM_DRIVER_STATE_DIR` | `target/openshell-vm-driver` | Per-sandbox rootfs, console logs, image cache, and private `run/compute-driver.sock` UDS. | -| `--driver-dir DIR` | `OPENSHELL_DRIVER_DIR` | unset | Override the directory searched for `openshell-driver-vm`. | -| `--vm-driver-vcpus N` | `OPENSHELL_VM_DRIVER_VCPUS` | `2` | vCPUs per sandbox. 
| -| `--vm-driver-mem-mib N` | `OPENSHELL_VM_DRIVER_MEM_MIB` | `2048` | Memory per sandbox, in MiB. | -| `--vm-krun-log-level N` | `OPENSHELL_VM_KRUN_LOG_LEVEL` | `1` | libkrun verbosity (0–5). | -| `--vm-tls-ca PATH` | `OPENSHELL_VM_TLS_CA` | — | CA cert for the guest's mTLS client bundle. Required when `--grpc-endpoint` uses `https://`. | -| `--vm-tls-cert PATH` | `OPENSHELL_VM_TLS_CERT` | — | Guest client certificate. | -| `--vm-tls-key PATH` | `OPENSHELL_VM_TLS_KEY` | — | Guest client private key. | +Select the VM driver with `--drivers vm` or `OPENSHELL_DRIVERS=vm`. Configure VM-specific settings in `[openshell.drivers.vm]`: `grpc_endpoint`, `state_dir`, `driver_dir`, `vcpus`, `mem_mib`, `krun_log_level`, and `guest_tls_*`. -See [`openshell-gateway --help`](../openshell-server/src/cli.rs) for the full flag surface shared with the Kubernetes driver. +See [`openshell-gateway --help`](../openshell-server/src/cli.rs) for the gateway process flag surface. ## Verifying the gateway diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 1592a4b22..4bbfe24fc 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -74,6 +74,7 @@ bytes = { workspace = true } pin-project-lite = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +toml = { workspace = true } tokio-stream = { workspace = true } sqlx = { workspace = true } reqwest = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index e5f902959..a2cfacde5 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -3,10 +3,11 @@ //! Shared CLI entrypoint for the gateway binaries. 
-use clap::{ArgAction, Command, CommandFactory, FromArgMatches, Parser}; +use clap::parser::ValueSource; +use clap::{ArgAction, ArgMatches, Command, CommandFactory, FromArgMatches, Parser}; use miette::{IntoDiagnostic, Result}; use openshell_core::ComputeDriverKind; -use openshell_core::config::{DEFAULT_DOCKER_NETWORK_NAME, DEFAULT_SERVER_PORT, DEFAULT_SSH_PORT}; +use openshell_core::config::DEFAULT_SERVER_PORT; use std::net::{IpAddr, SocketAddr}; use std::path::PathBuf; use tracing::{info, warn}; @@ -14,6 +15,7 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; +use crate::config_file::{self, ConfigFile, GatewayFileSection}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. @@ -41,6 +43,14 @@ enum Commands { #[derive(clap::Args, Debug)] #[allow(clippy::struct_excessive_bools)] struct RunArgs { + /// Path to a TOML configuration file (see RFC 0003). + /// + /// When set, gateway-wide settings and per-driver tables are read from + /// the file. Gateway command-line flags and `OPENSHELL_*` environment + /// variables continue to take precedence over gateway file values. + #[arg(long, env = "OPENSHELL_GATEWAY_CONFIG")] + config: Option, + /// IP address to bind the server, health, and metrics listeners to. #[arg(long, default_value = "127.0.0.1", env = "OPENSHELL_BIND_ADDRESS")] bind_address: IpAddr, @@ -100,138 +110,6 @@ struct RunArgs { )] drivers: Vec, - /// Kubernetes namespace for sandboxes. - #[arg(long, env = "OPENSHELL_SANDBOX_NAMESPACE", default_value = "default")] - sandbox_namespace: String, - - /// Default container image for sandboxes. - #[arg(long, env = "OPENSHELL_SANDBOX_IMAGE")] - sandbox_image: Option, - - /// Kubernetes `imagePullPolicy` for sandbox pods (Always, `IfNotPresent`, Never). 
- #[arg(long, env = "OPENSHELL_SANDBOX_IMAGE_PULL_POLICY")] - sandbox_image_pull_policy: Option, - - /// gRPC endpoint for sandboxes to callback to `OpenShell`. - /// This should be reachable from within the Kubernetes cluster. - #[arg(long, env = "OPENSHELL_GRPC_ENDPOINT")] - grpc_endpoint: Option, - - /// Public host for the SSH gateway. - #[arg(long, env = "OPENSHELL_SSH_GATEWAY_HOST", default_value = "127.0.0.1")] - ssh_gateway_host: String, - - /// Public port for the SSH gateway. - #[arg(long, env = "OPENSHELL_SSH_GATEWAY_PORT", default_value_t = DEFAULT_SERVER_PORT)] - ssh_gateway_port: u16, - - /// SSH port inside sandbox pods. - #[arg(long, env = "OPENSHELL_SANDBOX_SSH_PORT", default_value_t = DEFAULT_SSH_PORT)] - sandbox_ssh_port: u16, - - /// Kubernetes secret name containing client TLS materials for sandbox pods. - #[arg(long, env = "OPENSHELL_CLIENT_TLS_SECRET_NAME")] - client_tls_secret_name: Option, - - /// Host gateway IP for sandbox pod hostAliases. - /// When set, sandbox pods get hostAliases entries mapping - /// host.docker.internal and host.openshell.internal to this IP. - #[arg(long, env = "OPENSHELL_HOST_GATEWAY_IP")] - host_gateway_ip: Option, - - /// Working directory for VM driver sandbox state. - #[arg( - long, - env = "OPENSHELL_VM_DRIVER_STATE_DIR", - default_value_os_t = VmComputeConfig::default_state_dir() - )] - vm_driver_state_dir: PathBuf, - - /// Directory searched for compute-driver binaries (e.g. - /// `openshell-driver-vm`) when an explicit binary override isn't - /// configured. When unset, the gateway searches - /// `$HOME/.local/libexec/openshell`, `/usr/libexec/openshell`, - /// `/usr/local/libexec/openshell`, `/usr/local/libexec`, then a sibling - /// of the gateway binary. - #[arg(long, env = "OPENSHELL_DRIVER_DIR")] - driver_dir: Option, - - /// libkrun log level used by the VM helper. 
- #[arg( - long, - env = "OPENSHELL_VM_KRUN_LOG_LEVEL", - default_value_t = VmComputeConfig::default_krun_log_level() - )] - vm_krun_log_level: u32, - - /// Default vCPU count for VM sandboxes. - #[arg( - long, - env = "OPENSHELL_VM_DRIVER_VCPUS", - default_value_t = VmComputeConfig::default_vcpus() - )] - vm_vcpus: u8, - - /// Default memory allocation for VM sandboxes, in MiB. - #[arg( - long, - env = "OPENSHELL_VM_DRIVER_MEM_MIB", - default_value_t = VmComputeConfig::default_mem_mib() - )] - vm_mem_mib: u32, - - /// CA certificate installed into VM sandboxes for gateway mTLS. - #[arg(long, env = "OPENSHELL_VM_TLS_CA")] - vm_tls_ca: Option, - - /// Client certificate installed into VM sandboxes for gateway mTLS. - #[arg(long, env = "OPENSHELL_VM_TLS_CERT")] - vm_tls_cert: Option, - - /// Client private key installed into VM sandboxes for gateway mTLS. - #[arg(long, env = "OPENSHELL_VM_TLS_KEY")] - vm_tls_key: Option, - - /// Linux `openshell-sandbox` binary bind-mounted into Docker sandboxes. - /// - /// When unset the gateway falls back to (in order) a sibling - /// `openshell-sandbox` next to the gateway binary, a local cargo build, - /// or extracting the binary from `--docker-supervisor-image`. - #[arg(long, env = "OPENSHELL_DOCKER_SUPERVISOR_BIN")] - docker_supervisor_bin: Option, - - /// Image the Docker driver pulls to extract the Linux - /// `openshell-sandbox` binary when no explicit `--docker-supervisor-bin` - /// override or local build is available. Defaults to - /// `ghcr.io/nvidia/openshell/supervisor:`. - #[arg(long, env = "OPENSHELL_DOCKER_SUPERVISOR_IMAGE")] - docker_supervisor_image: Option, - - /// CA certificate bind-mounted into Docker sandboxes for gateway mTLS. - #[arg(long, env = "OPENSHELL_DOCKER_TLS_CA")] - docker_tls_ca: Option, - - /// Client certificate bind-mounted into Docker sandboxes for gateway mTLS. 
- #[arg(long, env = "OPENSHELL_DOCKER_TLS_CERT")] - docker_tls_cert: Option, - - /// Client private key bind-mounted into Docker sandboxes for gateway mTLS. - #[arg(long, env = "OPENSHELL_DOCKER_TLS_KEY")] - docker_tls_key: Option, - - /// Docker bridge network used for sandbox containers. - #[arg( - long, - env = "OPENSHELL_DOCKER_NETWORK_NAME", - default_value = DEFAULT_DOCKER_NETWORK_NAME - )] - docker_network_name: String, - - /// Enable Kubernetes user namespace isolation (hostUsers: false) for - /// sandbox pods. - #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] - enable_user_namespaces: bool, - /// Disable TLS entirely — listen on plaintext HTTP. /// Use this when the gateway sits behind a reverse proxy or tunnel /// (e.g. Cloudflare Tunnel) that terminates TLS at the edge. @@ -313,15 +191,28 @@ pub async fn run_cli() -> Result<()> { .install_default() .map_err(|e| miette::miette!("failed to install rustls crypto provider: {e:?}"))?; - let cli = Cli::from_arg_matches(&command().get_matches()).expect("clap validated args"); + let matches = command().get_matches(); + let cli = Cli::from_arg_matches(&matches).expect("clap validated args"); match cli.command { Some(Commands::GenerateCerts(args)) => certgen::run(args).await, - None => Box::pin(run_from_args(cli.run)).await, + None => Box::pin(run_from_args(cli.run, matches)).await, } } -async fn run_from_args(args: RunArgs) -> Result<()> { +async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { + // Load TOML file when --config / OPENSHELL_GATEWAY_CONFIG is set. + // File values are applied below for any argument that is still at its + // built-in default — CLI flags and OPENSHELL_* env vars always win. + let file: Option = if let Some(path) = args.config.clone() { + Some(config_file::load(&path).map_err(|e| miette::miette!("{e}"))?) 
+ } else { + None + }; + if let Some(file) = file.as_ref() { + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + } + let tracing_log_bus = TracingLogBus::new(); tracing_log_bus.install_subscriber( EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), @@ -341,122 +232,108 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let tls = if args.disable_tls { None } else { - let cert_path = args.tls_cert.ok_or_else(|| { + let cert_path = args.tls_cert.clone().ok_or_else(|| { miette::miette!( "--tls-cert is required when TLS is enabled (use --disable-tls to skip)" ) })?; - let key_path = args.tls_key.ok_or_else(|| { + let key_path = args.tls_key.clone().ok_or_else(|| { miette::miette!("--tls-key is required when TLS is enabled (use --disable-tls to skip)") })?; Some(openshell_core::TlsConfig { cert_path, key_path, require_client_auth: has_client_ca && !has_oidc, - client_ca_path: args.tls_client_ca, + client_ca_path: args.tls_client_ca.clone(), }) }; let db_url = args .db_url + .clone() .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?; let mut config = openshell_core::Config::new(tls) .with_bind_address(bind) .with_log_level(&args.log_level); - if args.health_port != 0 { - if args.port == args.health_port { + // Listener addresses for the health and metrics endpoints. The file may + // pin a different interface than the main listener (e.g. health on + // 127.0.0.1 while gRPC binds 0.0.0.0); the full `SocketAddr` from the + // file is preserved unless CLI/env supplied an explicit `--health-port` / + // `--metrics-port`, in which case the port overrides the file value + // while the IP defaults to `args.bind_address`. 
+ let file_gateway = file.as_ref().map(|f| &f.openshell.gateway); + let health_bind = resolve_aux_listener( + args.bind_address, + args.health_port, + &matches, + "health_port", + || file_gateway.and_then(|g| g.health_bind_address), + ); + let metrics_bind = resolve_aux_listener( + args.bind_address, + args.metrics_port, + &matches, + "metrics_port", + || file_gateway.and_then(|g| g.metrics_bind_address), + ); + + if let Some(addr) = health_bind { + if args.port == addr.port() { return Err(miette::miette!( "--port and --health-port must be different (both set to {})", args.port )); } - let health_bind = SocketAddr::new(args.bind_address, args.health_port); - config = config.with_health_bind_address(health_bind); + config = config.with_health_bind_address(addr); } - if args.metrics_port != 0 { - if args.port == args.metrics_port { + if let Some(addr) = metrics_bind { + if args.port == addr.port() { return Err(miette::miette!( "--port and --metrics-port must be different (both set to {})", args.port )); } - if args.health_port != 0 && args.health_port == args.metrics_port { + if let Some(health) = health_bind + && health.port() == addr.port() + { return Err(miette::miette!( "--health-port and --metrics-port must be different (both set to {})", - args.health_port + health.port() )); } - let metrics_bind = SocketAddr::new(args.bind_address, args.metrics_port); - config = config.with_metrics_bind_address(metrics_bind); + config = config.with_metrics_bind_address(addr); } config = config .with_database_url(db_url) - .with_compute_drivers(args.drivers) - .with_sandbox_namespace(args.sandbox_namespace) - .with_ssh_gateway_host(args.ssh_gateway_host) - .with_ssh_gateway_port(args.ssh_gateway_port) - .with_sandbox_ssh_port(args.sandbox_ssh_port) - .with_server_sans(args.server_sans) + .with_compute_drivers(args.drivers.clone()) + .with_server_sans(args.server_sans.clone()) .with_loopback_service_http(args.enable_loopback_service_http); - if let Some(image) = 
args.sandbox_image { - config = config.with_sandbox_image(image); + if let Some(ttl) = file + .as_ref() + .and_then(|f| f.openshell.gateway.ssh_session_ttl_secs) + { + config = config.with_ssh_session_ttl_secs(ttl); } - if let Some(policy) = args.sandbox_image_pull_policy { - config = config.with_sandbox_image_pull_policy(policy); - } - - if let Some(endpoint) = args.grpc_endpoint { - config = config.with_grpc_endpoint(endpoint); - } - - if let Some(name) = args.client_tls_secret_name { - config = config.with_client_tls_secret_name(name); - } - - if let Some(ip) = args.host_gateway_ip { - config = config.with_host_gateway_ip(ip); - } - - if let Some(issuer) = args.oidc_issuer { + if let Some(issuer) = args.oidc_issuer.clone() { config = config.with_oidc(openshell_core::OidcConfig { issuer, - audience: args.oidc_audience, + audience: args.oidc_audience.clone(), jwks_ttl_secs: args.oidc_jwks_ttl, - roles_claim: args.oidc_roles_claim, - admin_role: args.oidc_admin_role, - user_role: args.oidc_user_role, - scopes_claim: args.oidc_scopes_claim, + roles_claim: args.oidc_roles_claim.clone(), + admin_role: args.oidc_admin_role.clone(), + user_role: args.oidc_user_role.clone(), + scopes_claim: args.oidc_scopes_claim.clone(), }); } - config.enable_user_namespaces = args.enable_user_namespaces; - - let vm_config = VmComputeConfig { - state_dir: args.vm_driver_state_dir, - driver_dir: args.driver_dir, - default_image: config.sandbox_image.clone(), - krun_log_level: args.vm_krun_log_level, - vcpus: args.vm_vcpus, - mem_mib: args.vm_mem_mib, - guest_tls_ca: args.vm_tls_ca, - guest_tls_cert: args.vm_tls_cert, - guest_tls_key: args.vm_tls_key, - }; - - let docker_config = DockerComputeConfig { - supervisor_bin: args.docker_supervisor_bin, - supervisor_image: args.docker_supervisor_image, - guest_tls_ca: args.docker_tls_ca, - guest_tls_cert: args.docker_tls_cert, - guest_tls_key: args.docker_tls_key, - network_name: args.docker_network_name, - }; + let vm_config = 
build_vm_config(file.as_ref())?; + let docker_config = build_docker_config(file.as_ref())?; if args.disable_tls { warn!("TLS disabled — listening on plaintext HTTP"); @@ -480,15 +357,185 @@ async fn run_from_args(args: RunArgs) -> Result<()> { info!(bind = %config.bind_address, "Starting OpenShell server"); - run_server(config, vm_config, docker_config, tracing_log_bus) - .await - .into_diagnostic() + Box::pin(run_server( + config, + vm_config, + docker_config, + file, + tracing_log_bus, + )) + .await + .into_diagnostic() } fn parse_compute_driver(value: &str) -> std::result::Result { value.parse() } +/// Returns `true` when an argument's value came from clap's built-in default +/// (or was never supplied at all). When the predicate is `true`, the loader +/// is free to replace the value with one read from the TOML config file. +fn arg_defaulted(matches: &ArgMatches, id: &str) -> bool { + matches!( + matches.value_source(id), + None | Some(ValueSource::DefaultValue) + ) +} + +/// Resolve the bind address for an auxiliary listener (health / metrics). +/// +/// The precedence is: +/// 1. CLI flag or `OPENSHELL_*` env var explicitly set on the corresponding +/// port argument → `bind_address:port` (port from CLI, IP from the main +/// listener interface). +/// 2. Full `SocketAddr` from `[openshell.gateway].{health,metrics}_bind_address` +/// → used as-is (this is how operators pin a loopback-only health port +/// on a gateway whose gRPC listener is bound publicly). +/// 3. Otherwise the listener is disabled (returns `None`). 
+fn resolve_aux_listener( + bind_ip: IpAddr, + port_arg: u16, + matches: &ArgMatches, + port_id: &str, + file_addr: impl FnOnce() -> Option, +) -> Option { + if !arg_defaulted(matches, port_id) { + if port_arg == 0 { + return None; + } + return Some(SocketAddr::new(bind_ip, port_arg)); + } + if let Some(addr) = file_addr() { + return Some(addr); + } + if port_arg == 0 { + None + } else { + Some(SocketAddr::new(bind_ip, port_arg)) + } +} + +/// Apply gateway-wide values from `[openshell.gateway]` onto `RunArgs` for +/// every argument that is still sourced from clap's built-in default. +/// +/// The function intentionally does not touch `database_url` — that secret is +/// env-only and the loader already rejected it when it appears in the file. +fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: &ArgMatches) { + if let Some(addr) = file.bind_address { + if arg_defaulted(matches, "bind_address") { + args.bind_address = addr.ip(); + } + if arg_defaulted(matches, "port") { + args.port = addr.port(); + } + } + // Note: file's full health_bind_address / metrics_bind_address are + // consumed in `run_from_args`'s listener-resolution block so the IP + // half of the SocketAddr is preserved. Copying only the port here + // would silently relocate a loopback-intended listener onto the + // public bind address. 
+ if let Some(level) = &file.log_level + && arg_defaulted(matches, "log_level") + { + args.log_level.clone_from(level); + } + if let Some(drivers) = &file.compute_drivers + && arg_defaulted(matches, "drivers") + { + args.drivers.clone_from(drivers); + } + if let Some(sans) = &file.server_sans + && args.server_sans.is_empty() + && arg_defaulted(matches, "server_sans") + { + args.server_sans.clone_from(sans); + } + if let Some(enabled) = file.enable_loopback_service_http + && arg_defaulted(matches, "enable_loopback_service_http") + { + args.enable_loopback_service_http = enabled; + } + if let Some(disabled) = file.disable_tls + && arg_defaulted(matches, "disable_tls") + { + args.disable_tls = disabled; + } + // TLS gateway listener fields + if let Some(tls) = &file.tls { + if args.tls_cert.is_none() && arg_defaulted(matches, "tls_cert") { + args.tls_cert = Some(tls.cert_path.clone()); + } + if args.tls_key.is_none() && arg_defaulted(matches, "tls_key") { + args.tls_key = Some(tls.key_path.clone()); + } + if args.tls_client_ca.is_none() && arg_defaulted(matches, "tls_client_ca") { + args.tls_client_ca.clone_from(&tls.client_ca_path); + } + } + // OIDC fields + if let Some(oidc) = &file.oidc { + if args.oidc_issuer.is_none() && arg_defaulted(matches, "oidc_issuer") { + args.oidc_issuer = Some(oidc.issuer.clone()); + } + if arg_defaulted(matches, "oidc_audience") { + args.oidc_audience.clone_from(&oidc.audience); + } + if arg_defaulted(matches, "oidc_jwks_ttl") { + args.oidc_jwks_ttl = oidc.jwks_ttl_secs; + } + if arg_defaulted(matches, "oidc_roles_claim") { + args.oidc_roles_claim.clone_from(&oidc.roles_claim); + } + if arg_defaulted(matches, "oidc_admin_role") { + args.oidc_admin_role.clone_from(&oidc.admin_role); + } + if arg_defaulted(matches, "oidc_user_role") { + args.oidc_user_role.clone_from(&oidc.user_role); + } + if arg_defaulted(matches, "oidc_scopes_claim") { + args.oidc_scopes_claim.clone_from(&oidc.scopes_claim); + } + } +} + +/// Build [`VmComputeConfig`] 
from the `[openshell.drivers.vm]` table +/// inherited from `[openshell.gateway]`. +fn build_vm_config(file: Option<&ConfigFile>) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Vm, + &file.openshell.gateway, + file.openshell.drivers.get("vm"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.vm] table: {e}"))? + } else { + VmComputeConfig::default() + }; + + if cfg.state_dir.as_os_str().is_empty() { + cfg.state_dir = VmComputeConfig::default_state_dir(); + } + Ok(cfg) +} + +/// Build [`DockerComputeConfig`] using the same inheritance pattern as +/// [`build_vm_config`]. +fn build_docker_config(file: Option<&ConfigFile>) -> Result { + if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Docker, + &file.openshell.gateway, + file.openshell.drivers.get("docker"), + ); + return merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.docker] table: {e}")); + } + Ok(DockerComputeConfig::default()) +} + #[cfg(test)] mod tests { use super::{Cli, command}; @@ -646,6 +693,42 @@ mod tests { assert_eq!(cli.run.server_sans, vec!["*.apps.example.com".to_string()]); } + #[test] + fn command_rejects_removed_driver_flags() { + let err = command() + .try_get_matches_from([ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--sandbox-image", + "example/sandbox:latest", + ]) + .expect_err("driver implementation flags should not be accepted"); + + assert_eq!(err.kind(), clap::error::ErrorKind::UnknownArgument); + } + + #[test] + fn command_rejects_removed_ssh_endpoint_flags() { + for flag in [ + "--ssh-gateway-host", + "--ssh-gateway-port", + "--sandbox-ssh-port", + ] { + let err = command() + .try_get_matches_from([ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + flag, + "x", + ]) + .expect_err("SSH endpoint flags should not be accepted"); + + assert_eq!(err.kind(), 
clap::error::ErrorKind::UnknownArgument); + } + } + #[test] fn generate_certs_subcommand_parses_without_db_url() { let _lock = ENV_LOCK @@ -713,4 +796,308 @@ mod tests { assert!(cli.command.is_none()); assert!(cli.run.db_url.is_none()); } + + // ── Config-file merge tests ────────────────────────────────────────── + // + // `merge_file_into_args` is the bridge between `config_file::ConfigFile` + // and `RunArgs`. These cases lock in the precedence rule: + // + // CLI flag > OPENSHELL_* env var > TOML file > built-in default + // + // by exercising each combination on representative gateway fields. + + use super::{ConfigFile, merge_file_into_args}; + use clap::FromArgMatches; + + fn parse_with_args(argv: &[&str]) -> (super::RunArgs, clap::ArgMatches) { + let matches = command().try_get_matches_from(argv).expect("parses"); + let cli = Cli::from_arg_matches(&matches).expect("from arg matches"); + (cli.run, matches) + } + + fn config_file_from_toml(toml: &str) -> ConfigFile { + toml::from_str(toml).expect("valid TOML in test fixture") + } + + #[test] + fn file_value_applies_when_cli_uses_default() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_SERVER_PORT"); + let _g3 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +bind_address = "0.0.0.0:9090" +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.bind_address, IpAddr::V4(Ipv4Addr::UNSPECIFIED)); + assert_eq!(args.port, 9090); + assert_eq!(args.log_level, "debug"); + } + + #[test] + fn cli_flag_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = 
EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--log-level", + "warn", + ]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "warn", "CLI flag must win over file"); + } + + #[test] + fn env_var_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set("OPENSHELL_LOG_LEVEL", "trace"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "trace", "env var must win over file"); + } + + #[test] + fn file_oidc_block_populates_oidc_args() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_OIDC_ISSUER"); + let _g2 = EnvVarGuard::remove("OPENSHELL_OIDC_AUDIENCE"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway.oidc] +issuer = "https://idp.example.com" +audience = "openshell-cli" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.oidc_issuer.as_deref(), Some("https://idp.example.com")); + assert_eq!(args.oidc_audience, "openshell-cli"); + } + + #[test] + fn aux_listener_preserves_file_ip_against_public_bind() { + use std::net::SocketAddr; + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_HEALTH_PORT"); + + let (_args, 
matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file_addr: SocketAddr = "127.0.0.1:8081".parse().unwrap(); + let resolved = super::resolve_aux_listener( + IpAddr::V4(Ipv4Addr::UNSPECIFIED), + 0, + &matches, + "health_port", + || Some(file_addr), + ); + assert_eq!( + resolved, + Some(file_addr), + "TOML health_bind_address 127.0.0.1:8081 must not be relocated to 0.0.0.0:8081" + ); + } + + #[test] + fn aux_listener_cli_port_overrides_file_addr() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_HEALTH_PORT"); + + let (_args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--health-port", + "9999", + ]); + let file_addr: std::net::SocketAddr = "127.0.0.1:8081".parse().unwrap(); + let resolved = super::resolve_aux_listener( + IpAddr::V4(Ipv4Addr::UNSPECIFIED), + 9999, + &matches, + "health_port", + || Some(file_addr), + ); + assert_eq!( + resolved, + Some("0.0.0.0:9999".parse().unwrap()), + "CLI flag must win over file value" + ); + } + + #[test] + fn file_disable_tls_applies() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::remove("OPENSHELL_DISABLE_TLS"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r" +[openshell.gateway] +disable_tls = true +", + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert!(args.disable_tls); + } + + #[test] + fn file_ssh_session_ttl_secs_is_parsed() { + // The loader must accept and surface the documented key. The actual + // wiring into `Config` happens in `run_from_args` against the parsed + // file (not via `merge_file_into_args`, since there is no matching + // `RunArgs` field), so this test pins the schema half. 
+ let file = config_file_from_toml( + r" +[openshell.gateway] +ssh_session_ttl_secs = 1234 +", + ); + assert_eq!(file.openshell.gateway.ssh_session_ttl_secs, Some(1234)); + } + + #[test] + fn file_populates_service_routing_fields() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_SERVER_SAN"); + let _g2 = EnvVarGuard::remove("OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +server_sans = ["gateway.local", "*.dev.openshell.localhost"] +enable_loopback_service_http = false +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!( + args.server_sans, + vec![ + "gateway.local".to_string(), + "*.dev.openshell.localhost".to_string() + ] + ); + assert!(!args.enable_loopback_service_http); + } + + #[test] + fn env_var_overrides_file_loopback_service_http() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set("OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP", "true"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r" +[openshell.gateway] +enable_loopback_service_http = false +", + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert!( + args.enable_loopback_service_http, + "env var must win over file" + ); + } + + #[test] + fn driver_inherits_shared_image_from_gateway_section() { + // [openshell.gateway].default_image inherits into the K8s driver + // table when the driver-specific table does not set it. 
+ let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "ghcr.io/nvidia/openshell/sandbox:1.0" + +[openshell.drivers.kubernetes] +namespace = "agents" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("merged table deserializes"); + assert_eq!(parsed.default_image, "ghcr.io/nvidia/openshell/sandbox:1.0"); + assert_eq!(parsed.namespace, "agents"); + } + + #[test] + fn driver_specific_value_overrides_gateway_inheritance() { + let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "gateway-default:1.0" + +[openshell.drivers.kubernetes] +default_image = "k8s-specific:1.0" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("deserializes"); + assert_eq!(parsed.default_image, "k8s-specific:1.0"); + } } diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index a6b847bb3..14e518e7b 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -60,7 +60,8 @@ const COMPUTE_DRIVER_SOCKET_RUN_DIR: &str = "run"; const COMPUTE_DRIVER_SOCKET_NAME: &str = "compute-driver.sock"; /// Configuration for launching and talking to the VM compute driver. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct VmComputeConfig { /// Working directory for VM driver sandbox state. pub state_dir: PathBuf, @@ -72,6 +73,9 @@ pub struct VmComputeConfig { /// Default sandbox image the driver should use when a request omits one. pub default_image: String, + /// Gateway gRPC endpoint the sandbox guest connects back to. 
+ pub grpc_endpoint: String, + /// libkrun log level used by the VM driver helper. pub krun_log_level: u32, @@ -134,7 +138,8 @@ impl Default for VmComputeConfig { Self { state_dir: Self::default_state_dir(), driver_dir: None, - default_image: String::new(), + default_image: default_sandbox_image(), + grpc_endpoint: String::new(), krun_log_level: Self::default_krun_log_level(), vcpus: Self::default_vcpus(), mem_mib: Self::default_mem_mib(), @@ -145,6 +150,13 @@ impl Default for VmComputeConfig { } } +fn default_sandbox_image() -> String { + format!( + "{}/base:latest", + openshell_core::image::DEFAULT_COMMUNITY_REGISTRY + ) +} + #[cfg(unix)] #[derive(Debug, Clone, PartialEq, Eq)] pub struct VmGuestTlsPaths { @@ -157,7 +169,7 @@ pub struct VmGuestTlsPaths { /// /// Resolution order: /// 1. `{driver_dir}/openshell-driver-vm`, where `driver_dir` comes from -/// `--driver-dir` / `OPENSHELL_DRIVER_DIR`. +/// `[openshell.drivers.vm].driver_dir`. /// 2. Conventional install directories: /// `~/.local/libexec/openshell`, `/usr/libexec/openshell`, /// `/usr/local/libexec/openshell`, `/usr/local/libexec`. 
@@ -197,7 +209,7 @@ pub fn resolve_compute_driver_bin(vm_config: &VmComputeConfig) -> Result>() .join(", "); Err(Error::config(format!( - "vm compute driver binary not found (searched {searched_display}); install it under --driver-dir / OPENSHELL_DRIVER_DIR, a conventional libexec path such as ~/.local/libexec/openshell, /usr/libexec/openshell, or /usr/local/libexec{{,/openshell}}, or place it next to the gateway binary" + "vm compute driver binary not found (searched {searched_display}); install it under [openshell.drivers.vm].driver_dir, a conventional libexec path such as ~/.local/libexec/openshell, /usr/libexec/openshell, or /usr/local/libexec{{,/openshell}}, or place it next to the gateway binary" ))) } @@ -359,10 +371,9 @@ fn remove_stale_socket(socket_path: &Path, expected_uid: u32) -> Result<()> { #[cfg(unix)] pub fn compute_driver_guest_tls_paths( - config: &Config, vm_config: &VmComputeConfig, ) -> Result> { - if !config.grpc_endpoint.starts_with("https://") { + if !vm_config.grpc_endpoint.starts_with("https://") { return Ok(None); } @@ -373,23 +384,23 @@ pub fn compute_driver_guest_tls_paths( ]; if provided.iter().all(Option::is_none) { return Err(Error::config( - "vm compute driver requires --vm-tls-ca, --vm-tls-cert, and --vm-tls-key when OPENSHELL_GRPC_ENDPOINT uses https://", + "vm compute driver requires guest_tls_ca, guest_tls_cert, and guest_tls_key when grpc_endpoint uses https://", )); } let Some(ca) = vm_config.guest_tls_ca.clone() else { return Err(Error::config( - "--vm-tls-ca is required when VM guest TLS materials are configured", + "guest_tls_ca is required when VM guest TLS materials are configured", )); }; let Some(cert) = vm_config.guest_tls_cert.clone() else { return Err(Error::config( - "--vm-tls-cert is required when VM guest TLS materials are configured", + "guest_tls_cert is required when VM guest TLS materials are configured", )); }; let Some(key) = vm_config.guest_tls_key.clone() else { return Err(Error::config( - "--vm-tls-key 
is required when VM guest TLS materials are configured", + "guest_tls_key is required when VM guest TLS materials are configured", )); }; @@ -413,7 +424,7 @@ pub async fn spawn( config: &Config, vm_config: &VmComputeConfig, ) -> Result<(Channel, Arc)> { - if config.grpc_endpoint.trim().is_empty() { + if vm_config.grpc_endpoint.trim().is_empty() { return Err(Error::config( "grpc_endpoint is required when using the vm compute driver", )); @@ -421,7 +432,7 @@ pub async fn spawn( let driver_bin = resolve_compute_driver_bin(vm_config)?; let socket_path = compute_driver_socket_path(vm_config); - let guest_tls_paths = compute_driver_guest_tls_paths(config, vm_config)?; + let guest_tls_paths = compute_driver_guest_tls_paths(vm_config)?; prepare_compute_driver_socket_path(vm_config, &socket_path)?; let mut command = Command::new(&driver_bin); @@ -436,7 +447,7 @@ pub async fn spawn( command.arg("--log-level").arg(&config.log_level); command .arg("--openshell-endpoint") - .arg(&config.grpc_endpoint); + .arg(&vm_config.grpc_endpoint); command.arg("--state-dir").arg(&vm_config.state_dir); if !vm_config.default_image.trim().is_empty() { command.arg("--default-image").arg(&vm_config.default_image); @@ -538,7 +549,6 @@ mod tests { prepare_compute_driver_socket_path, prepare_vm_state_dir, resolve_compute_driver_bin, resolve_driver_search_dirs, }; - use openshell_core::{Config, TlsConfig}; use std::os::unix::fs::PermissionsExt; use std::os::unix::net::UnixListener as StdUnixListener; use std::path::PathBuf; @@ -569,8 +579,7 @@ mod tests { let err = resolve_compute_driver_bin(&vm_config) .unwrap_err() .to_string(); - assert!(err.contains("--driver-dir")); - assert!(err.contains("OPENSHELL_DRIVER_DIR")); + assert!(err.contains("[openshell.drivers.vm].driver_dir")); assert!(err.contains("openshell-driver-vm")); } @@ -588,27 +597,16 @@ mod tests { #[test] fn vm_compute_driver_tls_requires_explicit_guest_bundle() { - let dir = tempdir().unwrap(); - let server_cert = 
dir.path().join("server.crt"); - let server_key = dir.path().join("server.key"); - let server_ca = dir.path().join("client-ca.crt"); - std::fs::write(&server_cert, "server-cert").unwrap(); - std::fs::write(&server_key, "server-key").unwrap(); - std::fs::write(&server_ca, "client-ca").unwrap(); - - let config = Config::new(Some(TlsConfig { - cert_path: server_cert, - key_path: server_key, - client_ca_path: Some(server_ca), - require_client_auth: false, - })) - .with_grpc_endpoint("https://gateway.internal:8443"); + let vm_config = VmComputeConfig { + grpc_endpoint: "https://gateway.internal:8443".to_string(), + ..Default::default() + }; - let err = compute_driver_guest_tls_paths(&config, &VmComputeConfig::default()) + let err = compute_driver_guest_tls_paths(&vm_config) .expect_err("https vm endpoints should require an explicit guest client bundle"); assert!( err.to_string() - .contains("--vm-tls-ca, --vm-tls-cert, and --vm-tls-key") + .contains("guest_tls_ca, guest_tls_cert, and guest_tls_key") ); } @@ -617,14 +615,12 @@ mod tests { let dir = tempdir().unwrap(); let server_cert = dir.path().join("server.crt"); let server_key = dir.path().join("server.key"); - let server_ca = dir.path().join("client-ca.crt"); let guest_ca = dir.path().join("guest-ca.crt"); let guest_cert = dir.path().join("guest.crt"); let guest_key = dir.path().join("guest.key"); for path in [ &server_cert, &server_key, - &server_ca, &guest_ca, &guest_cert, &guest_key, @@ -632,21 +628,15 @@ mod tests { std::fs::write(path, path.display().to_string()).unwrap(); } - let config = Config::new(Some(TlsConfig { - cert_path: server_cert.clone(), - key_path: server_key.clone(), - client_ca_path: Some(server_ca), - require_client_auth: false, - })) - .with_grpc_endpoint("https://gateway.internal:8443"); let vm_config = VmComputeConfig { + grpc_endpoint: "https://gateway.internal:8443".to_string(), guest_tls_ca: Some(guest_ca.clone()), guest_tls_cert: Some(guest_cert.clone()), guest_tls_key: 
Some(guest_key.clone()), ..Default::default() }; - let guest_paths = compute_driver_guest_tls_paths(&config, &vm_config) + let guest_paths = compute_driver_guest_tls_paths(&vm_config) .unwrap() .expect("https vm endpoints should pass an explicit guest client bundle"); assert_eq!(guest_paths.ca, guest_ca); diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs new file mode 100644 index 000000000..2a1320a55 --- /dev/null +++ b/crates/openshell-server/src/config_file.rs @@ -0,0 +1,518 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! TOML configuration file loader for the gateway. +//! +//! See `rfc/0003-gateway-configuration/README.md` for the file format. This +//! module parses the file into [`ConfigFile`], rejects fields that must be +//! supplied via env/CLI (database URL), and provides +//! [`driver_table`] which overlays shared `[openshell.gateway]` defaults onto +//! a `[openshell.drivers.]` table so each driver crate's +//! `Deserialize` impl sees a fully-populated table. +//! +//! The merge precedence for gateway process settings is: +//! ```text +//! CLI flag > OPENSHELL_* env var > TOML file > built-in default +//! ``` +//! Driver implementation settings are configured in the TOML driver tables. +//! Per-field application of gateway file values happens in [`crate::cli`], +//! which uses clap's `ArgMatches::value_source` to detect arguments that fell +//! back to their default and are therefore eligible for replacement by file +//! values. + +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::path::{Path, PathBuf}; + +use openshell_core::config::ComputeDriverKind; +use openshell_core::{OidcConfig, TlsConfig}; +use serde::{Deserialize, Serialize}; + +/// Latest schema version this build understands. +pub const SCHEMA_VERSION: u32 = 1; + +/// Root of the gateway TOML config file. 
+/// +/// The file is rooted at `[openshell]` to reserve room for future components +/// (CLI, sandbox, router) to share a single config file without key +/// collisions. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct ConfigFile { + #[serde(default)] + pub openshell: OpenShellRoot, +} + +/// `[openshell]` table. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct OpenShellRoot { + /// Reserved for future schema migrations. Versions greater than + /// [`SCHEMA_VERSION`] are rejected at load time. + #[serde(default)] + pub version: Option, + + #[serde(default)] + pub gateway: GatewayFileSection, + + /// `[openshell.drivers.]` tables — passed verbatim to each driver + /// crate's `Deserialize` impl after the gateway-side inheritance merge. + /// Stored as raw [`toml::Value`] so each driver can evolve its schema + /// independently of this crate. + #[serde(default)] + pub drivers: BTreeMap, +} + +/// `[openshell.gateway]` section. +/// +/// All fields are `Option` so the loader can tell whether a key was set +/// in the file (`Some`) or not (`None` — value is taken from CLI/env/default). +/// +/// The fields under "Shared driver defaults" are inherited into +/// `[openshell.drivers.]` tables per [`inheritable_keys`]. 
+#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct GatewayFileSection { + // ── Listeners ──────────────────────────────────────────────────────── + #[serde(default)] + pub bind_address: Option, + #[serde(default)] + pub health_bind_address: Option, + #[serde(default)] + pub metrics_bind_address: Option, + + // ── Logging ────────────────────────────────────────────────────────── + #[serde(default)] + pub log_level: Option, + + // ── Drivers ────────────────────────────────────────────────────────── + #[serde(default)] + pub compute_drivers: Option>, + + // ── Sandbox / SSH ──────────────────────────────────────────────────── + #[serde(default)] + pub sandbox_namespace: Option, + #[serde(default)] + pub ssh_session_ttl_secs: Option, + + // ── Service routing ────────────────────────────────────────────────── + /// Subject Alternative Names configured on the gateway server certificate. + /// Wildcard DNS SANs also enable sandbox service URLs under that domain. + #[serde(default)] + pub server_sans: Option>, + /// Enable plaintext HTTP routing for loopback sandbox service URLs. + #[serde(default)] + pub enable_loopback_service_http: Option, + + // ── Shared driver defaults (inherited into [openshell.drivers.]) ─ + #[serde(default)] + pub default_image: Option, + #[serde(default)] + pub supervisor_image: Option, + #[serde(default)] + pub client_tls_secret_name: Option, + #[serde(default)] + pub host_gateway_ip: Option, + #[serde(default)] + pub enable_user_namespaces: Option, + #[serde(default)] + pub guest_tls_ca: Option, + #[serde(default)] + pub guest_tls_cert: Option, + #[serde(default)] + pub guest_tls_key: Option, + + // ── TLS toggle ─────────────────────────────────────────────────────── + /// When `true`, the gateway listens on plaintext HTTP and ignores any + /// `[openshell.gateway.tls]` table. Mirrors `--disable-tls`. 
+ #[serde(default)] + pub disable_tls: Option, + + // ── Nested tables ──────────────────────────────────────────────────── + #[serde(default)] + pub tls: Option, + #[serde(default)] + pub oidc: Option, + + // ── Disallowed-in-file fields ──────────────────────────────────────── + // + // Captured so we can produce a friendly "set this via env/CLI instead" + // error rather than a generic "unknown field" message. Validated and + // rejected in [`load`]. + #[serde(default)] + pub database_url: Option, +} + +#[derive(Debug, thiserror::Error)] +pub enum ConfigFileError { + #[error("failed to read gateway config file '{}': {source}", path.display())] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + #[error("failed to parse gateway config file '{}': {source}", path.display())] + Parse { + path: PathBuf, + #[source] + source: toml::de::Error, + }, + #[error( + "unsupported gateway config version {version}; this build only supports version {SCHEMA_VERSION}" + )] + UnsupportedVersion { version: u32 }, + #[error( + "`{field}` is not allowed in the gateway config file — set the {env} env var or pass {cli} on the command line" + )] + SecretInFile { + field: &'static str, + env: &'static str, + cli: &'static str, + }, +} + +/// Load and validate a TOML config file. +/// +/// Returns `Ok(ConfigFile::default())` for an empty file (the gateway then +/// falls back entirely to CLI/env/built-in defaults). 
+pub fn load(path: &Path) -> Result { + let contents = std::fs::read_to_string(path).map_err(|source| ConfigFileError::Io { + path: path.to_path_buf(), + source, + })?; + if contents.trim().is_empty() { + return Ok(ConfigFile::default()); + } + let file: ConfigFile = toml::from_str(&contents).map_err(|source| ConfigFileError::Parse { + path: path.to_path_buf(), + source, + })?; + + if let Some(version) = file.openshell.version + && version > SCHEMA_VERSION + { + return Err(ConfigFileError::UnsupportedVersion { version }); + } + + if file.openshell.gateway.database_url.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "database_url", + env: "OPENSHELL_DB_URL", + cli: "--db-url", + }); + } + + Ok(file) +} + +/// Build the merged TOML table for `driver` by overlaying inheritable +/// `[openshell.gateway]` defaults onto `[openshell.drivers.]`. +/// +/// The returned [`toml::Value`] is a Table ready to feed into the driver's +/// `Deserialize` impl — keys present in `raw` win over the gateway defaults. +/// Keys outside [`inheritable_keys`] for this driver are never copied from +/// the gateway section, which keeps each driver's `deny_unknown_fields` +/// invariant intact. +pub fn driver_table( + driver: ComputeDriverKind, + gateway: &GatewayFileSection, + raw: Option<&toml::Value>, +) -> toml::Value { + let mut merged = match raw { + Some(toml::Value::Table(table)) => table.clone(), + _ => toml::Table::new(), + }; + + for key in inheritable_keys(driver) { + if merged.contains_key(*key) { + continue; + } + if let Some(value) = gateway_inherited_value(gateway, key) { + merged.insert((*key).to_string(), value); + } + } + + toml::Value::Table(merged) +} + +/// Inheritance allowlist (the Q4 "high-overlap set"). Each driver opts in +/// to a specific subset so a gateway-wide default does not accidentally land +/// in a driver table that does not understand the field. 
+fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { + match driver { + ComputeDriverKind::Kubernetes => &[ + "namespace", + "default_image", + "supervisor_image", + "client_tls_secret_name", + "host_gateway_ip", + "enable_user_namespaces", + ], + ComputeDriverKind::Docker => &[ + "sandbox_namespace", + "default_image", + "supervisor_image", + "host_gateway_ip", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + ComputeDriverKind::Podman => &[ + "default_image", + "supervisor_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + ComputeDriverKind::Vm => &[ + "default_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + } +} + +fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option { + match key { + "namespace" | "sandbox_namespace" => g.sandbox_namespace.as_deref().map(string_value), + "default_image" => g.default_image.as_deref().map(string_value), + "supervisor_image" => g.supervisor_image.as_deref().map(string_value), + "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), + "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), + "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), + "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), + "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), + _ => None, + } +} + +fn string_value(s: &str) -> toml::Value { + toml::Value::String(s.to_owned()) +} + +fn path_value(p: &Path) -> toml::Value { + toml::Value::String(p.display().to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + fn write_tmp(contents: &str) -> tempfile::NamedTempFile { + let mut tmp = tempfile::Builder::new() + .suffix(".toml") + .tempfile() + .expect("tempfile"); + tmp.write_all(contents.as_bytes()).expect("write"); + tmp + } + + #[test] + fn empty_file_yields_default_config() 
{ + let tmp = write_tmp(""); + let file = load(tmp.path()).expect("empty file parses"); + assert!(file.openshell.version.is_none()); + assert!(file.openshell.gateway.bind_address.is_none()); + assert!(file.openshell.drivers.is_empty()); + } + + #[test] + fn parses_full_example() { + let toml = r#" +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +log_level = "info" +compute_drivers = ["kubernetes"] +sandbox_namespace = "agents" +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-sandbox-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +"#; + let tmp = write_tmp(toml); + let file = load(tmp.path()).expect("valid file parses"); + let gw = &file.openshell.gateway; + assert_eq!(gw.log_level.as_deref(), Some("info")); + assert_eq!( + gw.default_image.as_deref(), + Some("ghcr.io/nvidia/openshell/sandbox:latest") + ); + assert!(gw.tls.is_some()); + assert!(gw.oidc.is_some()); + assert!(file.openshell.drivers.contains_key("kubernetes")); + } + + #[test] + fn rejects_database_url_in_file() { + let toml = r#" +[openshell.gateway] +database_url = "sqlite::memory:" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("database_url must be rejected"); + assert!(matches!( + err, + ConfigFileError::SecretInFile { + field: "database_url", + .. 
+ } + )); + } + + #[test] + fn rejects_unknown_gateway_field() { + let toml = r" +[openshell.gateway] +nonsense = true +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("unknown field must be rejected"); + assert!(matches!(err, ConfigFileError::Parse { .. })); + } + + #[test] + fn rejects_removed_ssh_endpoint_fields() { + let toml = r" +[openshell.gateway] +ssh_gateway_port = 8080 +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("removed SSH endpoint keys must be rejected"); + assert!(matches!(err, ConfigFileError::Parse { .. })); + } + + #[test] + fn rejects_unsupported_version() { + let toml = r" +[openshell] +version = 2 +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("version > 1 must be rejected"); + assert!(matches!( + err, + ConfigFileError::UnsupportedVersion { version: 2 } + )); + } + + #[test] + fn driver_table_inherits_gateway_defaults() { + let gateway = GatewayFileSection { + default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), + supervisor_image: Some("ghcr.io/nvidia/openshell/supervisor:0.9".to_string()), + ..Default::default() + }; + let raw = toml::toml! 
{ + namespace = "agents" + }; + let merged = driver_table( + ComputeDriverKind::Kubernetes, + &gateway, + Some(&toml::Value::Table(raw)), + ); + let table = merged.as_table().expect("table"); + assert_eq!( + table.get("namespace").and_then(|v| v.as_str()), + Some("agents") + ); + assert_eq!( + table.get("default_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/sandbox:0.9") + ); + assert_eq!( + table.get("supervisor_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/supervisor:0.9") + ); + } + + #[test] + fn docker_driver_table_inherits_gateway_defaults() { + let gateway = GatewayFileSection { + sandbox_namespace: Some("agents".to_string()), + default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), + host_gateway_ip: Some("10.0.0.1".to_string()), + ..Default::default() + }; + let merged = driver_table(ComputeDriverKind::Docker, &gateway, None); + let table = merged.as_table().expect("table"); + assert_eq!( + table.get("sandbox_namespace").and_then(|v| v.as_str()), + Some("agents") + ); + assert_eq!( + table.get("default_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/sandbox:0.9") + ); + assert_eq!( + table.get("host_gateway_ip").and_then(|v| v.as_str()), + Some("10.0.0.1") + ); + } + + #[test] + fn driver_table_specific_value_overrides_gateway_default() { + let gateway = GatewayFileSection { + default_image: Some("gateway-default".to_string()), + ..Default::default() + }; + let raw = toml::toml! { + default_image = "driver-specific" + }; + let merged = driver_table( + ComputeDriverKind::Podman, + &gateway, + Some(&toml::Value::Table(raw)), + ); + assert_eq!( + merged + .as_table() + .unwrap() + .get("default_image") + .and_then(|v| v.as_str()), + Some("driver-specific") + ); + } + + #[test] + fn driver_table_does_not_leak_keys_outside_allowlist() { + // `client_tls_secret_name` is K8s-only; Docker must not receive it + // even when set at gateway scope. 
+ let gateway = GatewayFileSection { + client_tls_secret_name: Some("openshell-sandbox-tls".to_string()), + ..Default::default() + }; + let merged = driver_table(ComputeDriverKind::Docker, &gateway, None); + assert!( + !merged + .as_table() + .unwrap() + .contains_key("client_tls_secret_name") + ); + } + + #[test] + fn missing_path_is_io_error() { + let err = load(Path::new("/nonexistent/openshell-gateway.toml")) + .expect_err("missing file must be io error"); + assert!(matches!(err, ConfigFileError::Io { .. })); + } +} diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 831d46938..108de9c0e 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -1263,17 +1263,10 @@ pub(super) async fn handle_revoke_ssh_session( // --------------------------------------------------------------------------- fn resolve_gateway(config: &openshell_core::Config) -> (String, u16) { - let host = if config.ssh_gateway_host.is_empty() { - config.bind_address.ip().to_string() - } else { - config.ssh_gateway_host.clone() - }; - let port = if config.ssh_gateway_port == 0 { - config.bind_address.port() - } else { - config.ssh_gateway_port - }; - (host, port) + ( + config.bind_address.ip().to_string(), + config.bind_address.port(), + ) } /// Shell-escape a value for embedding in a POSIX shell command. 
diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 8a466a9e1..a6e337dec 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -23,6 +23,7 @@ mod auth; pub mod certgen; pub mod cli; mod compute; +pub mod config_file; mod grpc; mod http; mod inference; @@ -161,6 +162,7 @@ pub async fn run_server( config: Config, vm_config: VmComputeConfig, docker_config: DockerComputeConfig, + config_file: Option, tracing_log_bus: TracingLogBus, ) -> Result<()> { let database_url = config.database_url.trim(); @@ -194,6 +196,7 @@ pub async fn run_server( &config, &vm_config, &docker_config, + config_file.as_ref(), store.clone(), sandbox_index.clone(), sandbox_watch_bus.clone(), @@ -567,6 +570,7 @@ async fn build_compute_runtime( config: &Config, vm_config: &VmComputeConfig, docker_config: &DockerComputeConfig, + file: Option<&config_file::ConfigFile>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, @@ -578,35 +582,9 @@ async fn build_compute_runtime( match driver { ComputeDriverKind::Kubernetes => { - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - let supervisor_image_pull_policy = - std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_default(); + let k8s = kubernetes_config_from_file(file)?; ComputeRuntime::new_kubernetes( - KubernetesComputeConfig { - namespace: config.sandbox_namespace.clone(), - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.clone(), - supervisor_image, - supervisor_image_pull_policy, - supervisor_sideload_method: std::env::var( - "OPENSHELL_SUPERVISOR_SIDELOAD_METHOD", - ) - .ok() - .filter(|s| !s.is_empty()) - .and_then(|s| s.parse().ok()) - .unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - 
ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - client_tls_secret_name: config.client_tls_secret_name.clone(), - host_gateway_ip: config.host_gateway_ip.clone(), - enable_user_namespaces: config.enable_user_namespaces, - }, + k8s, store, sandbox_index, sandbox_watch_bus, @@ -642,61 +620,11 @@ async fn build_compute_runtime( .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) } ComputeDriverKind::Podman => { - let socket_path = std::env::var("OPENSHELL_PODMAN_SOCKET") - .ok() - .filter(|s| !s.is_empty()) - .map_or_else( - openshell_driver_podman::PodmanComputeConfig::default_socket_path, - std::path::PathBuf::from, - ); - - let network_name = std::env::var("OPENSHELL_NETWORK_NAME") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_NETWORK_NAME.to_string()); - - let stop_timeout_secs: u32 = std::env::var("OPENSHELL_STOP_TIMEOUT") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS); - - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - - // TLS client cert paths for sandbox mTLS. When all three are - // set, the Podman driver bind-mounts them into sandbox - // containers and switches the endpoint to https://. 
- let podman_tls_ca = std::env::var("OPENSHELL_PODMAN_TLS_CA") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_cert = std::env::var("OPENSHELL_PODMAN_TLS_CERT") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_key = std::env::var("OPENSHELL_PODMAN_TLS_KEY") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); + let mut podman = podman_config_from_file(file)?; + podman.gateway_port = config.bind_address.port(); ComputeRuntime::new_podman( - openshell_driver_podman::PodmanComputeConfig { - socket_path, - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.parse().unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - gateway_port: config.bind_address.port(), - sandbox_ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - network_name, - ssh_port: config.sandbox_ssh_port, - stop_timeout_secs, - supervisor_image, - guest_tls_ca: podman_tls_ca, - guest_tls_cert: podman_tls_cert, - guest_tls_key: podman_tls_key, - }, + podman, store, sandbox_index, sandbox_watch_bus, @@ -709,6 +637,43 @@ async fn build_compute_runtime( } } +/// Build a [`KubernetesComputeConfig`] from the file's +/// `[openshell.drivers.kubernetes]` table merged with inheritable +/// `[openshell.gateway]` defaults. Falls back to the driver's `Default` +/// when no file is present. +fn kubernetes_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(KubernetesComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.kubernetes] table: {e}"))) +} + +/// Same pattern as [`kubernetes_config_from_file`] but for Podman. 
+fn podman_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(openshell_driver_podman::PodmanComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Podman, + &file.openshell.gateway, + file.openshell.drivers.get("podman"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.podman] table: {e}"))) +} + fn configured_compute_driver(config: &Config) -> Result { match config.compute_drivers.as_slice() { [] => match openshell_core::config::detect_driver() { diff --git a/deploy/deb/init-gateway-config.sh b/deploy/deb/init-gateway-config.sh new file mode 100755 index 000000000..55b07f7e5 --- /dev/null +++ b/deploy/deb/init-gateway-config.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -eu + +CONFIG_FILE="${1:?Usage: init-gateway-config.sh }" +PKI_DIR="${2:?Usage: init-gateway-config.sh }" +DRIVER_DIR="${3:?Usage: init-gateway-config.sh }" +VM_STATE_DIR="${4:?Usage: init-gateway-config.sh }" + +if [ -f "$CONFIG_FILE" ]; then + exit 0 +fi + +mkdir -p "$(dirname "$CONFIG_FILE")" "$VM_STATE_DIR" + +port="${OPENSHELL_SERVER_PORT:-17670}" +scheme="https" +if [ "${OPENSHELL_DISABLE_TLS:-false}" = "true" ]; then + scheme="http" +fi + +tmp="${CONFIG_FILE}.tmp" +{ + cat < "$tmp" + +chmod 600 "$tmp" +mv "$tmp" "$CONFIG_FILE" diff --git a/deploy/deb/openshell-gateway.service b/deploy/deb/openshell-gateway.service index 9de94da22..1b57f3e48 100644 --- a/deploy/deb/openshell-gateway.service +++ b/deploy/deb/openshell-gateway.service @@ -13,21 +13,10 @@ Environment=OPENSHELL_TLS_CERT=%S/openshell/tls/server/tls.crt Environment=OPENSHELL_TLS_KEY=%S/openshell/tls/server/tls.key Environment=OPENSHELL_TLS_CLIENT_CA=%S/openshell/tls/ca.crt Environment=OPENSHELL_DB_URL=sqlite:%S/openshell/gateway/openshell.db 
-Environment=OPENSHELL_GRPC_ENDPOINT=https://127.0.0.1:17670 -Environment=OPENSHELL_SSH_GATEWAY_HOST=127.0.0.1 -Environment=OPENSHELL_SSH_GATEWAY_PORT=17670 -Environment=OPENSHELL_VM_DRIVER_STATE_DIR=%S/openshell/vm-driver -Environment=OPENSHELL_VM_TLS_CA=%S/openshell/tls/ca.crt -Environment=OPENSHELL_VM_TLS_CERT=%S/openshell/tls/client/tls.crt -Environment=OPENSHELL_VM_TLS_KEY=%S/openshell/tls/client/tls.key -Environment=OPENSHELL_DOCKER_TLS_CA=%S/openshell/tls/ca.crt -Environment=OPENSHELL_DOCKER_TLS_CERT=%S/openshell/tls/client/tls.crt -Environment=OPENSHELL_DOCKER_TLS_KEY=%S/openshell/tls/client/tls.key -Environment=OPENSHELL_PODMAN_TLS_CA=%S/openshell/tls/ca.crt -Environment=OPENSHELL_PODMAN_TLS_CERT=%S/openshell/tls/client/tls.crt -Environment=OPENSHELL_PODMAN_TLS_KEY=%S/openshell/tls/client/tls.key +Environment=OPENSHELL_GATEWAY_CONFIG=%S/openshell/gateway/config.toml EnvironmentFile=-%h/.config/openshell/gateway.env ExecStartPre=/usr/bin/openshell-gateway generate-certs --output-dir %S/openshell/tls --server-san host.openshell.internal +ExecStartPre=/usr/libexec/openshell/init-gateway-config.sh %S/openshell/gateway/config.toml %S/openshell/tls /usr/libexec/openshell %S/openshell/vm-driver ExecStart=/usr/bin/openshell-gateway Restart=on-failure RestartSec=5s diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml new file mode 100644 index 000000000..9d95e45c1 --- /dev/null +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -0,0 +1,95 @@ +{{/* +ConfigMap holding the gateway TOML config file (RFC 0003). + +The gateway reads `/etc/openshell/gateway.toml` (mounted from this ConfigMap) +at startup. CLI flags and OPENSHELL_* env vars on the StatefulSet container +still override anything in this file. 
+ +One value is intentionally NOT rendered here: + - server.dbUrl → passed via --db-url in the StatefulSet args +*/}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "openshell.fullname" . }}-config + labels: + {{- include "openshell.labels" . | nindent 4 }} +data: + gateway.toml: | + [openshell] + version = 1 + + [openshell.gateway] + bind_address = "0.0.0.0:{{ .Values.service.port }}" + {{- if .Values.service.healthPort }} + health_bind_address = "0.0.0.0:{{ .Values.service.healthPort }}" + {{- end }} + {{- if .Values.service.metricsPort }} + metrics_bind_address = "0.0.0.0:{{ .Values.service.metricsPort }}" + {{- end }} + log_level = {{ .Values.server.logLevel | quote }} + sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} + default_image = {{ .Values.server.sandboxImage | quote }} + supervisor_image = {{ include "openshell.supervisorImage" . | quote }} + {{- if .Values.server.hostGatewayIP }} + host_gateway_ip = {{ .Values.server.hostGatewayIP | quote }} + {{- end }} + {{- if .Values.server.enableUserNamespaces }} + enable_user_namespaces = true + {{- end }} + {{- if .Values.server.disableTls }} + disable_tls = true + {{- else }} + client_tls_secret_name = {{ .Values.server.tls.clientTlsSecretName | quote }} + {{- end }} + enable_loopback_service_http = {{ .Values.server.enableLoopbackServiceHttp }} + {{- $sans := list -}} + {{- if and .Values.certManager.enabled .Values.certManager.serverDnsNames }} + {{- $sans = .Values.certManager.serverDnsNames }} + {{- else if and .Values.pkiInitJob.enabled .Values.pkiInitJob.serverDnsNames }} + {{- $sans = .Values.pkiInitJob.serverDnsNames }} + {{- end }} + {{- if $sans }} + server_sans = [{{- range $i, $san := $sans }}{{ if $i }}, {{ end }}{{ $san | quote }}{{- end }}] + {{- end }} + + {{- if not .Values.server.disableTls }} + + [openshell.gateway.tls] + cert_path = "/etc/openshell-tls/server/tls.crt" + key_path = "/etc/openshell-tls/server/tls.key" + client_ca_path = 
"/etc/openshell-tls/client-ca/ca.crt" + {{- if .Values.server.disableGatewayAuth }} + allow_unauthenticated = true + {{- end }} + {{- end }} + + {{- if .Values.server.oidc.issuer }} + + [openshell.gateway.oidc] + issuer = {{ .Values.server.oidc.issuer | quote }} + audience = {{ .Values.server.oidc.audience | quote }} + jwks_ttl_secs = {{ .Values.server.oidc.jwksTtl }} + {{- if .Values.server.oidc.rolesClaim }} + roles_claim = {{ .Values.server.oidc.rolesClaim | quote }} + {{- end }} + {{- if .Values.server.oidc.adminRole }} + admin_role = {{ .Values.server.oidc.adminRole | quote }} + {{- end }} + {{- if .Values.server.oidc.userRole }} + user_role = {{ .Values.server.oidc.userRole | quote }} + {{- end }} + {{- if .Values.server.oidc.scopesClaim }} + scopes_claim = {{ .Values.server.oidc.scopesClaim | quote }} + {{- end }} + {{- end }} + + [openshell.drivers.kubernetes] + grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} + supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + {{- if .Values.server.sandboxImagePullPolicy }} + image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} + {{- end }} + {{- if .Values.supervisor.image.pullPolicy }} + supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} + {{- end }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 3c805f056..c6ff21491 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -15,10 +15,15 @@ spec: {{- include "openshell.selectorLabels" . | nindent 6 }} template: metadata: - {{- with .Values.podAnnotations }} annotations: + # Roll the StatefulSet when the rendered gateway TOML changes — the + # gateway only reads /etc/openshell/gateway.toml at startup, so + # without this annotation a `helm upgrade` that only mutates the + # ConfigMap would leave pods running with stale config. 
+ checksum/gateway-config: {{ include (print $.Template.BasePath "/gateway-config.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} {{- toYaml . | nindent 8 }} - {{- end }} + {{- end }} labels: {{- include "openshell.labels" . | nindent 8 }} {{- with .Values.podLabels }} @@ -47,110 +52,29 @@ spec: image: {{ include "openshell.image" . | quote }} imagePullPolicy: {{ .Values.image.pullPolicy }} args: - - --bind-address - - "0.0.0.0" - - --port - - {{ .Values.service.port | quote }} - - --health-port - - {{ .Values.service.healthPort | quote }} - {{- if .Values.service.metricsPort }} - - --metrics-port - - {{ .Values.service.metricsPort | quote }} - {{- end }} - - --log-level - - {{ .Values.server.logLevel }} + - --config + - /etc/openshell/gateway.toml - --db-url - {{ .Values.server.dbUrl | quote }} env: - - name: OPENSHELL_SANDBOX_NAMESPACE - value: {{ include "openshell.sandboxNamespace" . | quote }} - - name: OPENSHELL_SANDBOX_IMAGE - value: {{ .Values.server.sandboxImage | quote }} - {{- if .Values.server.sandboxImagePullPolicy }} - - name: OPENSHELL_SANDBOX_IMAGE_PULL_POLICY - value: {{ .Values.server.sandboxImagePullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_IMAGE - value: {{ include "openshell.supervisorImage" . | quote }} - {{- if .Values.supervisor.image.pullPolicy }} - - name: OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY - value: {{ .Values.supervisor.image.pullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_SIDELOAD_METHOD - value: {{ include "openshell.supervisorSideloadMethod" . | quote }} - - name: OPENSHELL_GRPC_ENDPOINT - value: {{ include "openshell.grpcEndpoint" . 
| quote }} - {{- if .Values.server.sshGatewayHost }} - - name: OPENSHELL_SSH_GATEWAY_HOST - value: {{ .Values.server.sshGatewayHost | quote }} - {{- end }} - {{- if .Values.server.sshGatewayPort }} - - name: OPENSHELL_SSH_GATEWAY_PORT - value: {{ .Values.server.sshGatewayPort | quote }} - {{- end }} - {{- if .Values.server.hostGatewayIP }} - - name: OPENSHELL_HOST_GATEWAY_IP - value: {{ .Values.server.hostGatewayIP | quote }} - {{- end }} - {{- if .Values.server.enableUserNamespaces }} - - name: OPENSHELL_ENABLE_USER_NAMESPACES - value: "true" - {{- end }} - {{- if and .Values.certManager.enabled .Values.certManager.serverDnsNames }} - - name: OPENSHELL_SERVER_SAN - value: {{ join "," .Values.certManager.serverDnsNames | quote }} - {{- else if and .Values.pkiInitJob.enabled .Values.pkiInitJob.serverDnsNames }} - - name: OPENSHELL_SERVER_SAN - value: {{ join "," .Values.pkiInitJob.serverDnsNames | quote }} - {{- end }} - - name: OPENSHELL_ENABLE_LOOPBACK_SERVICE_HTTP - value: {{ .Values.server.enableLoopbackServiceHttp | quote }} - {{- if .Values.server.disableTls }} - - name: OPENSHELL_DISABLE_TLS - value: "true" - {{- else }} - - name: OPENSHELL_TLS_CERT - value: /etc/openshell-tls/server/tls.crt - - name: OPENSHELL_TLS_KEY - value: /etc/openshell-tls/server/tls.key - {{- if or .Values.server.tls.clientCaSecretName .Values.pkiInitJob.enabled (and .Values.certManager.enabled .Values.certManager.clientCaFromServerTlsSecret) }} - - name: OPENSHELL_TLS_CLIENT_CA - value: /etc/openshell-tls/client-ca/ca.crt - {{- end }} - - name: OPENSHELL_CLIENT_TLS_SECRET_NAME - value: {{ .Values.server.tls.clientTlsSecretName | quote }} - {{- end }} - {{- if .Values.server.oidc.issuer }} - {{- if .Values.server.oidc.caConfigMapName }} + # All gateway settings live in the ConfigMap-backed TOML file + # mounted at /etc/openshell/gateway.toml. 
The only env var below + # is a process-level setting consumed by libraries outside + # gateway code (currently just SSL_CERT_FILE for OIDC issuer TLS). + {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} + # OIDC issuer custom-CA: rustls/reqwest read SSL_CERT_FILE for + # outbound TLS verification. This is a process-level env var + # consumed by the TLS stack itself, not by gateway code, so it + # cannot be represented in the gateway TOML schema. - name: SSL_CERT_FILE value: /etc/openshell-tls/oidc-ca/ca.crt {{- end }} - - name: OPENSHELL_OIDC_ISSUER - value: {{ .Values.server.oidc.issuer | quote }} - - name: OPENSHELL_OIDC_AUDIENCE - value: {{ .Values.server.oidc.audience | quote }} - - name: OPENSHELL_OIDC_JWKS_TTL - value: {{ .Values.server.oidc.jwksTtl | quote }} - {{- if .Values.server.oidc.rolesClaim }} - - name: OPENSHELL_OIDC_ROLES_CLAIM - value: {{ .Values.server.oidc.rolesClaim | quote }} - {{- end }} - {{- if .Values.server.oidc.adminRole }} - - name: OPENSHELL_OIDC_ADMIN_ROLE - value: {{ .Values.server.oidc.adminRole | quote }} - {{- end }} - {{- if .Values.server.oidc.userRole }} - - name: OPENSHELL_OIDC_USER_ROLE - value: {{ .Values.server.oidc.userRole | quote }} - {{- end }} - {{- if .Values.server.oidc.scopesClaim }} - - name: OPENSHELL_OIDC_SCOPES_CLAIM - value: {{ .Values.server.oidc.scopesClaim | quote }} - {{- end }} - {{- end }} volumeMounts: - name: openshell-data mountPath: /var/openshell + - name: gateway-config + mountPath: /etc/openshell + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -204,6 +128,9 @@ spec: resources: {{- toYaml .Values.resources | nindent 12 }} volumes: + - name: gateway-config + configMap: + name: {{ include "openshell.fullname" . 
}}-config {{- if not .Values.server.disableTls }} - name: tls-cert secret: diff --git a/deploy/helm/openshell/tests/gateway_config_test.yaml b/deploy/helm/openshell/tests/gateway_config_test.yaml new file mode 100644 index 000000000..2d464b8e6 --- /dev/null +++ b/deploy/helm/openshell/tests/gateway_config_test.yaml @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +suite: gateway TOML config shape +templates: + - templates/gateway-config.yaml + - templates/statefulset.yaml +release: + name: openshell + namespace: my-namespace + +tests: + # Regression: a ConfigMap-only mutation in `helm upgrade` must roll the + # StatefulSet, otherwise pods keep running with stale config. + - it: annotates the StatefulSet pod template with a ConfigMap checksum + template: templates/statefulset.yaml + asserts: + - exists: + path: spec.template.metadata.annotations["checksum/gateway-config"] + + + # Regression: grpc_endpoint MUST live in the Kubernetes driver table, not + # in [openshell.gateway]. The gateway-side schema has `deny_unknown_fields` + # and no `grpc_endpoint` field, so writing it at gateway scope makes + # `config_file::load` reject the default install.
+ - it: renders grpc_endpoint under [openshell.drivers.kubernetes], not [openshell.gateway] + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.drivers\.kubernetes\].*?grpc_endpoint' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '(?ms)\[openshell\.gateway\][^\[]*?grpc_endpoint' + + - it: omits server_sans when no DNS SANs are configured + template: templates/gateway-config.yaml + asserts: + - notMatchRegex: + path: data["gateway.toml"] + pattern: 'server_sans\s*=' + + - it: emits disable_tls=true and omits the [openshell.gateway.tls] section when disableTls is set + set: + server.disableTls: true + certManager.enabled: false + pkiInitJob.enabled: false + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: 'disable_tls\s*=\s*true' + - notMatchRegex: + path: data["gateway.toml"] + pattern: '\[openshell\.gateway\.tls\]' + + - it: renders server_sans from certManager.serverDnsNames + set: + certManager.enabled: true + certManager.serverDnsNames: + - openshell + - "*.dev.openshell.localhost" + pkiInitJob.enabled: false + template: templates/gateway-config.yaml + asserts: + - matchRegex: + path: data["gateway.toml"] + pattern: 'server_sans\s*=\s*\["openshell", "\*\.dev\.openshell\.localhost"\]' diff --git a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml index a128cd440..2d3461c6f 100644 --- a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml +++ b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml @@ -3,32 +3,28 @@ suite: sandboxNamespace defaulting templates: - - templates/statefulset.yaml + - templates/gateway-config.yaml - templates/networkpolicy.yaml release: name: openshell namespace: my-namespace tests: - - it: defaults OPENSHELL_SANDBOX_NAMESPACE to release namespace - template: templates/statefulset.yaml + - it: defaults sandbox_namespace to 
release namespace in the TOML config + template: templates/gateway-config.yaml asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "my-namespace" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"my-namespace"' - it: uses explicit sandboxNamespace when set - template: templates/statefulset.yaml + template: templates/gateway-config.yaml set: server.sandboxNamespace: other-ns asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "other-ns" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"other-ns"' - it: defaults NetworkPolicy namespace to release namespace template: templates/networkpolicy.yaml diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index b502fea9d..c7fa50296 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -101,11 +101,6 @@ server: # Override only when sandboxes must reach the gateway via a different # hostname (e.g. an external ingress or a host alias). grpcEndpoint: "" - # Public host/port returned to CLI clients for SSH proxy CONNECT requests. - # For local clusters the default 127.0.0.1:8080 is correct; for remote - # clusters these should be set to the externally reachable host and port. - sshGatewayHost: "" - sshGatewayPort: 0 # TLS configuration for the server. The server always terminates mTLS # directly and requires client certificates. # Host gateway IP for sandbox pod hostAliases. 
When set, sandbox pods get diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 40170d289..3ca6e3b90 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -33,8 +33,6 @@ spec: sandboxImagePullPolicy: __SANDBOX_IMAGE_PULL_POLICY__ supervisorImage: ghcr.io/nvidia/openshell/supervisor:latest dbUrl: __DB_URL__ - sshGatewayHost: __SSH_GATEWAY_HOST__ - sshGatewayPort: __SSH_GATEWAY_PORT__ hostGatewayIP: __HOST_GATEWAY_IP__ disableTls: __DISABLE_TLS__ oidc: diff --git a/deploy/man/openshell-gateway.8.md b/deploy/man/openshell-gateway.8.md index 14ca6a1ea..ee2ad8ed2 100644 --- a/deploy/man/openshell-gateway.8.md +++ b/deploy/man/openshell-gateway.8.md @@ -94,26 +94,9 @@ gRPC and HTTP, secured by mutual TLS (mTLS) by default. service URLs under that domain. Environment: **OPENSHELL_SERVER_SAN**. -**--sandbox-image** *IMAGE* -: Default container image for sandboxes. - Environment: **OPENSHELL_SANDBOX_IMAGE**. - -**--sandbox-image-pull-policy** *POLICY* -: Image pull policy: Always, IfNotPresent, Never. - Environment: **OPENSHELL_SANDBOX_IMAGE_PULL_POLICY**. - -**--ssh-gateway-host** *HOST* -: Public host for the SSH gateway endpoint. Default: **127.0.0.1**. - Environment: **OPENSHELL_SSH_GATEWAY_HOST**. - -**--ssh-gateway-port** *PORT* -: Public port for the SSH gateway endpoint. Default: **8080**. - Environment: **OPENSHELL_SSH_GATEWAY_PORT**. - -**--grpc-endpoint** *URL* -: gRPC endpoint for sandbox callbacks. Should be reachable from - within sandbox containers. - Environment: **OPENSHELL_GRPC_ENDPOINT**. +Compute driver settings such as sandbox image, callback endpoint, image +pull policy, network name, VM state directory, and guest TLS material are +configured in the TOML file passed with **--config**. 
# SYSTEMD INTEGRATION diff --git a/deploy/man/openshell-gateway.env.5.md b/deploy/man/openshell-gateway.env.5.md index 19da4cb4f..ec3f466a1 100644 --- a/deploy/man/openshell-gateway.env.5.md +++ b/deploy/man/openshell-gateway.env.5.md @@ -93,41 +93,12 @@ exist (the unit has built-in defaults for all required settings). Wildcard DNS SANs also enable sandbox service URLs under that domain. -**OPENSHELL_PODMAN_TLS_CA** (default: auto-generated path) -: CA certificate bind-mounted into sandbox containers. +## Driver Configuration -**OPENSHELL_PODMAN_TLS_CERT** (default: auto-generated path) -: Client certificate bind-mounted into sandbox containers. - -**OPENSHELL_PODMAN_TLS_KEY** (default: auto-generated path) -: Client private key bind-mounted into sandbox containers. - -## Images - -**OPENSHELL_SUPERVISOR_IMAGE** (default: ghcr.io/nvidia/openshell/supervisor:latest) -: OCI image containing the supervisor binary, mounted read-only - into sandbox containers. - -**OPENSHELL_SANDBOX_IMAGE** (default: ghcr.io/nvidia/openshell-community/sandboxes/base:latest) -: Default OCI image for sandbox containers. - -**OPENSHELL_SANDBOX_IMAGE_PULL_POLICY** (default: missing) -: When to pull sandbox images: **always** (every sandbox creation), - **missing** (only if not cached locally), **never** (use cached - only), **newer** (pull if a newer version exists). - -## Podman Driver - -**OPENSHELL_PODMAN_SOCKET** (default: $XDG_RUNTIME_DIR/podman/podman.sock) -: Path to the Podman API Unix socket. - -**OPENSHELL_NETWORK_NAME** (default: openshell) -: Name of the Podman bridge network for sandbox containers. Created - automatically if it does not exist. - -**OPENSHELL_STOP_TIMEOUT** (default: 10) -: Seconds to wait after SIGTERM before sending SIGKILL when stopping - a sandbox container. +Compute driver settings are configured in the TOML file referenced by +**OPENSHELL_GATEWAY_CONFIG** or **--config**. 
This includes sandbox +images, image pull policy, callback endpoints, Podman socket path, +Docker network name, VM state directory, and guest TLS material. # EXAMPLES @@ -135,15 +106,6 @@ Change the API port to 9090: OPENSHELL_SERVER_PORT=9090 -Pin sandbox images to a specific version: - - OPENSHELL_SUPERVISOR_IMAGE=ghcr.io/nvidia/openshell/supervisor:v0.0.37 - OPENSHELL_SANDBOX_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:v0.0.37 - -Air-gapped deployment (pre-loaded images, no registry access): - - OPENSHELL_SANDBOX_IMAGE_PULL_POLICY=never - Enable debug logging: OPENSHELL_LOG_LEVEL=debug diff --git a/deploy/rpm/CONFIGURATION.md b/deploy/rpm/CONFIGURATION.md index 9e228600d..2bf23fd1b 100644 --- a/deploy/rpm/CONFIGURATION.md +++ b/deploy/rpm/CONFIGURATION.md @@ -109,8 +109,8 @@ To disable TLS (not recommended for production): OPENSHELL_DISABLE_TLS=true ``` -1. Comment out the `OPENSHELL_TLS_*` and `OPENSHELL_PODMAN_TLS_*` - variables if they are set. +1. Remove or comment out the `guest_tls_*` entries in + `~/.config/openshell/gateway.toml` if they are set. 1. Restart the gateway. @@ -120,14 +120,15 @@ When mTLS is enabled, the Podman driver bind-mounts the client certificates into each sandbox container so the supervisor process can establish an mTLS connection back to the gateway. 
-The following environment variables control the host-side paths of the -client certificates that are mounted into sandbox containers: +The following TOML fields control the host-side paths of the client +certificates that are mounted into sandbox containers: -| Variable | Description | -|----------|-------------| -| `OPENSHELL_PODMAN_TLS_CA` | CA certificate (host path) | -| `OPENSHELL_PODMAN_TLS_CERT` | Client certificate (host path) | -| `OPENSHELL_PODMAN_TLS_KEY` | Client private key (host path) | +```toml +[openshell.gateway] +guest_tls_ca = "/home/user/.local/state/openshell/tls/ca.crt" +guest_tls_cert = "/home/user/.local/state/openshell/tls/client/tls.crt" +guest_tls_key = "/home/user/.local/state/openshell/tls/client/tls.key" +``` Inside the container, the supervisor reads them from: @@ -141,13 +142,14 @@ configuration is required. ## Configuration reference -All settings are controlled via environment variables. The user unit -reads from `~/.config/openshell/gateway.env` (generated on first start) -and from `Environment=` directives in the systemd unit. +Gateway process settings are controlled via environment variables. Driver +implementation settings live in `~/.config/openshell/gateway.toml`, which is +generated on first start and selected through `OPENSHELL_GATEWAY_CONFIG`. Values in `gateway.env` override the unit defaults. Use `systemctl --user edit openshell-gateway` to add overrides that persist -across package upgrades. +across package upgrades. Gateway CLI/env values override the gateway section +of the TOML file, while driver tables are read from TOML. ### Gateway settings @@ -158,7 +160,7 @@ across package upgrades. | `OPENSHELL_HEALTH_PORT` | `0` (disabled) | Port for unauthenticated health endpoints (`/healthz`, `/readyz`). Set to a non-zero value to enable. | | `OPENSHELL_METRICS_PORT` | `0` (disabled) | Port for Prometheus metrics (`/metrics`). Set to a non-zero value to enable. 
| | `OPENSHELL_LOG_LEVEL` | `info` | Log level: `trace`, `debug`, `info`, `warn`, `error` | -| `OPENSHELL_DRIVERS` | `podman` | Compute driver (`podman`, `docker`, `kubernetes`) | +| `OPENSHELL_DRIVERS` | `podman` | Compute driver (`podman`, `docker`, `kubernetes`, `vm`) | | `OPENSHELL_DB_URL` | `sqlite://$XDG_STATE_HOME/openshell/gateway.db` | SQLite database URL for state persistence | ### TLS settings @@ -169,25 +171,26 @@ across package upgrades. | `OPENSHELL_TLS_KEY` | (auto-generated path) | Server TLS private key | | `OPENSHELL_TLS_CLIENT_CA` | (auto-generated path) | CA for client certificate verification; requires mTLS unless OIDC is also configured | | `OPENSHELL_DISABLE_TLS` | (unset) | Set to `true` to disable TLS | -| `OPENSHELL_PODMAN_TLS_CA` | (auto-generated path) | CA cert mounted into sandbox containers | -| `OPENSHELL_PODMAN_TLS_CERT` | (auto-generated path) | Client cert mounted into sandbox containers | -| `OPENSHELL_PODMAN_TLS_KEY` | (auto-generated path) | Client key mounted into sandbox containers | -### Sandbox settings +### Driver TOML settings -| Variable | Default | Description | -|----------|---------|-------------| -| `OPENSHELL_SUPERVISOR_IMAGE` | `ghcr.io/nvidia/openshell/supervisor:latest` | Supervisor binary OCI image | -| `OPENSHELL_SANDBOX_IMAGE` | `ghcr.io/nvidia/openshell-community/sandboxes/base:latest` | Default sandbox base image | -| `OPENSHELL_SANDBOX_IMAGE_PULL_POLICY` | `missing` | Image pull policy: `always`, `missing`, `never`, `newer` | +The generated `gateway.toml` contains the RPM's Podman defaults: -### Podman driver settings +```toml +[openshell.gateway] +compute_drivers = ["podman"] +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/home/user/.local/state/openshell/tls/ca.crt" +guest_tls_cert = "/home/user/.local/state/openshell/tls/client/tls.crt" +guest_tls_key = 
"/home/user/.local/state/openshell/tls/client/tls.key" -| Variable | Default | Description | -|----------|---------|-------------| -| `OPENSHELL_PODMAN_SOCKET` | `$XDG_RUNTIME_DIR/podman/podman.sock` | Podman API Unix socket path | -| `OPENSHELL_NETWORK_NAME` | `openshell` | Podman bridge network name for sandbox containers | -| `OPENSHELL_STOP_TIMEOUT` | `10` | Container stop timeout in seconds (SIGTERM then SIGKILL) | +[openshell.drivers.podman] +socket_path = "/run/user/1000/podman/podman.sock" +image_pull_policy = "missing" +network_name = "openshell" +stop_timeout_secs = 10 +``` ### Image management @@ -202,14 +205,14 @@ podman pull ghcr.io/nvidia/openshell/supervisor:latest podman pull ghcr.io/nvidia/openshell-community/sandboxes/base:latest ``` -Or set `OPENSHELL_SANDBOX_IMAGE_PULL_POLICY=always` to pull on every -sandbox creation. +Or set `image_pull_policy = "always"` in +`[openshell.drivers.podman]` to pull on every sandbox creation. To pin specific image versions instead of `:latest`: -```shell -OPENSHELL_SUPERVISOR_IMAGE=ghcr.io/nvidia/openshell/supervisor:v0.0.37 -OPENSHELL_SANDBOX_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:v0.0.37 +```toml +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:v0.0.37" +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:v0.0.37" ``` For air-gapped environments: @@ -232,8 +235,9 @@ For air-gapped environments: 1. 
Set pull policy to `never`: - ```shell - OPENSHELL_SANDBOX_IMAGE_PULL_POLICY=never + ```toml + [openshell.drivers.podman] + image_pull_policy = "never" ``` ## File locations @@ -244,8 +248,9 @@ For air-gapped environments: | CLI binary | `/usr/bin/openshell` | | Systemd user unit | `/usr/lib/systemd/user/openshell-gateway.service` | | PKI bootstrap script | `/usr/libexec/openshell/init-pki.sh` | -| Env generator script | `/usr/libexec/openshell/init-gateway-env.sh` | +| Env/config generator script | `/usr/libexec/openshell/init-gateway-env.sh` | | TLS certificates | `~/.local/state/openshell/tls/` | | CLI client certs | `~/.config/openshell/gateways/openshell/mtls/` | | Gateway database | `~/.local/state/openshell/gateway.db` | -| Gateway configuration | `~/.config/openshell/gateway.env` | +| Gateway environment | `~/.config/openshell/gateway.env` | +| Gateway TOML configuration | `~/.config/openshell/gateway.toml` | diff --git a/deploy/rpm/QUICKSTART.md b/deploy/rpm/QUICKSTART.md index 4d6acb4a1..1f89bba00 100644 --- a/deploy/rpm/QUICKSTART.md +++ b/deploy/rpm/QUICKSTART.md @@ -51,8 +51,8 @@ The gateway pulls container images from ghcr.io on first sandbox creation. Ensure the host can reach ghcr.io over HTTPS (port 443). For air-gapped environments, pre-load images with `podman pull` and -set `OPENSHELL_SANDBOX_IMAGE_PULL_POLICY=never` in -`~/.config/openshell/gateway.env`. See CONFIGURATION.md for details. +set `image_pull_policy = "never"` in +`~/.config/openshell/gateway.toml`. See CONFIGURATION.md for details. ## Start the gateway @@ -64,6 +64,7 @@ On first start, the gateway automatically generates: - A self-signed PKI bundle (CA, server cert, client cert) for mTLS - A commented configuration file at `~/.config/openshell/gateway.env` +- A gateway TOML file at `~/.config/openshell/gateway.toml` > **Note:** The gateway binds to all interfaces (`0.0.0.0`) by default. 
> Mutual TLS (mTLS) is enabled automatically on first start, requiring a diff --git a/deploy/rpm/TROUBLESHOOTING.md b/deploy/rpm/TROUBLESHOOTING.md index 2c33e1a57..1cc39cd8d 100644 --- a/deploy/rpm/TROUBLESHOOTING.md +++ b/deploy/rpm/TROUBLESHOOTING.md @@ -186,8 +186,8 @@ podman pull ghcr.io/nvidia/openshell-community/sandboxes/base:latest podman pull ghcr.io/nvidia/openshell/supervisor:latest ``` -Or set `OPENSHELL_SANDBOX_IMAGE_PULL_POLICY=always` in -`~/.config/openshell/gateway.env` and restart the gateway. +Or set `image_pull_policy = "always"` in +`~/.config/openshell/gateway.toml` and restart the gateway. ### Gateway stops on logout @@ -216,11 +216,10 @@ systemctl --user restart openshell-gateway The SQLite database schema is auto-migrated on startup. Running sandboxes are stopped during the restart. -The `gateway.env` file is not overwritten during upgrades. The -`init-gateway-env.sh` script is idempotent and only generates the file -on first start. New configuration options from newer versions can be -added manually by referencing CONFIGURATION.md or running -`openshell-gateway --help`. +The `gateway.env` and `gateway.toml` files are not overwritten during +upgrades. The `init-gateway-env.sh` script is idempotent: on each start it only +creates missing files and appends a missing `OPENSHELL_GATEWAY_CONFIG` entry to `gateway.env`. New gateway process options can be added +manually by referencing CONFIGURATION.md or running `openshell-gateway --help`. To pick up new container images after an upgrade: diff --git a/deploy/rpm/init-gateway-env.sh b/deploy/rpm/init-gateway-env.sh index f041fa941..baf2f5564 100644 --- a/deploy/rpm/init-gateway-env.sh +++ b/deploy/rpm/init-gateway-env.sh @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Generate the gateway environment configuration file on first start. +# Generate the gateway environment and TOML configuration files on first start. 
# # Called from the systemd ExecStartPre directive to bootstrap the # gateway configuration. Idempotent: exits immediately if the file @@ -17,14 +17,65 @@ set -euo pipefail ENV_FILE="${1:?Usage: init-gateway-env.sh }" +CONFIG_DIR="$(dirname "${ENV_FILE}")" +CONFIG_FILE="${CONFIG_DIR}/gateway.toml" +STATE_HOME="${XDG_STATE_HOME:-${HOME}/.local/state}" +RUNTIME_HOME="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}" + +write_gateway_config() { + if [ -f "${CONFIG_FILE}" ]; then + return + fi + + mkdir -p "${CONFIG_DIR}" "${STATE_HOME}/openshell/vm-driver" + cat > "${CONFIG_FILE}" << EOF +[openshell] +version = 1 + +[openshell.gateway] +compute_drivers = ["podman"] +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "${STATE_HOME}/openshell/tls/ca.crt" +guest_tls_cert = "${STATE_HOME}/openshell/tls/client/tls.crt" +guest_tls_key = "${STATE_HOME}/openshell/tls/client/tls.key" + +[openshell.drivers.podman] +socket_path = "${RUNTIME_HOME}/podman/podman.sock" +image_pull_policy = "missing" +network_name = "openshell" +stop_timeout_secs = 10 + +[openshell.drivers.vm] +state_dir = "${STATE_HOME}/openshell/vm-driver" +driver_dir = "/usr/libexec/openshell" +grpc_endpoint = "https://127.0.0.1:8080" +EOF + chmod 600 "${CONFIG_FILE}" +} + +ensure_env_points_at_config() { + if grep -q '^OPENSHELL_GATEWAY_CONFIG=' "${ENV_FILE}"; then + return + fi + + cat >> "${ENV_FILE}" << EOF + +# Gateway TOML configuration. Driver implementation settings live here. 
+OPENSHELL_GATEWAY_CONFIG=${CONFIG_FILE} +EOF +} # ── Idempotent: skip if env file already exists ───────────────────── if [ -f "${ENV_FILE}" ]; then + write_gateway_config + ensure_env_points_at_config exit 0 fi # ── Create parent directory ───────────────────────────────────────── -mkdir -p "$(dirname "${ENV_FILE}")" +mkdir -p "${CONFIG_DIR}" +write_gateway_config # ── Write environment file ────────────────────────────────────────── cat > "${ENV_FILE}" << EOF @@ -34,13 +85,15 @@ cat > "${ENV_FILE}" << EOF # Run 'openshell-gateway --help' for the full list of options. # See /usr/share/doc/openshell-gateway/ for guides. +OPENSHELL_GATEWAY_CONFIG=${CONFIG_FILE} + # ---- Optional (uncomment to override defaults) ---- # Database URL for gateway state persistence. # Default for the user unit: sqlite://\$XDG_STATE_HOME/openshell/gateway.db #OPENSHELL_DB_URL=sqlite:///path/to/gateway.db -# Compute driver: podman (default for RPM), docker, kubernetes. +# Compute driver: podman (default for RPM), docker, kubernetes, vm. #OPENSHELL_DRIVERS=podman # Bind address. 0.0.0.0 listens on all interfaces; mTLS prevents @@ -53,18 +106,9 @@ cat > "${ENV_FILE}" << EOF # Log level: trace, debug, info, warn, error. #OPENSHELL_LOG_LEVEL=info -# ---- Images ---- - -# Supervisor binary OCI image (mounted read-only into sandboxes). -#OPENSHELL_SUPERVISOR_IMAGE=ghcr.io/nvidia/openshell/supervisor:latest - -# Default sandbox base image. -#OPENSHELL_SANDBOX_IMAGE=ghcr.io/nvidia/openshell-community/sandboxes/base:latest - -# Image pull policy: always, missing (default), never, newer. -# Use 'always' to pick up new tags automatically. -# Use 'never' for air-gapped environments with pre-loaded images. -#OPENSHELL_SANDBOX_IMAGE_PULL_POLICY=missing +# Driver implementation settings, including images, pull policy, Podman +# socket, TLS mounts, and VM paths, live in: +# ${CONFIG_FILE} # ---- TLS (mTLS enabled by default) ---- # PKI is auto-generated by init-pki.sh on first start. 
Client certs are @@ -89,21 +133,6 @@ cat > "${ENV_FILE}" << EOF # Example: OPENSHELL_SERVER_SAN=*.apps.example.com #OPENSHELL_SERVER_SAN= -# Podman driver: client certs bind-mounted into sandbox containers. -#OPENSHELL_PODMAN_TLS_CA=\$XDG_STATE_HOME/openshell/tls/ca.crt -#OPENSHELL_PODMAN_TLS_CERT=\$XDG_STATE_HOME/openshell/tls/client/tls.crt -#OPENSHELL_PODMAN_TLS_KEY=\$XDG_STATE_HOME/openshell/tls/client/tls.key - -# ---- Podman driver ---- - -# Podman API Unix socket path. -#OPENSHELL_PODMAN_SOCKET=\$XDG_RUNTIME_DIR/podman/podman.sock - -# Podman bridge network name for sandbox containers. -#OPENSHELL_NETWORK_NAME=openshell - -# Container stop timeout in seconds (SIGTERM then SIGKILL). -#OPENSHELL_STOP_TIMEOUT=10 EOF chmod 600 "${ENV_FILE}" diff --git a/deploy/snap/README.md b/deploy/snap/README.md index 4950f4921..ece73f680 100644 --- a/deploy/snap/README.md +++ b/deploy/snap/README.md @@ -97,9 +97,10 @@ it while sandboxes are active. Restart the service manually when you are ready to move the gateway to the refreshed snap revision. `openshell-sandbox` is staged next to `openshell-gateway` as the Docker -supervisor binary. The gateway app passes it to the in-process Docker driver -through `OPENSHELL_DOCKER_SUPERVISOR_BIN=$SNAP/bin/openshell-sandbox`. The -service stores its gateway database under `$SNAP_COMMON`. +supervisor binary. The gateway app starts through a small wrapper that writes +`$SNAP_COMMON/gateway.toml` on first start and points the in-process Docker +driver at `$SNAP/bin/openshell-sandbox`. The service stores its gateway +database under `$SNAP_COMMON`. 
## Interfaces @@ -147,16 +148,13 @@ openshell.gateway \ --disable-tls \ --port 17670 \ --db-url "sqlite:$SNAP_COMMON/gateway.db?mode=rwc" \ - --docker-supervisor-bin "$SNAP/bin/openshell-sandbox" \ - --docker-network-name openshell-snap \ - --sandbox-namespace docker-snap \ - --sandbox-image ghcr.io/nvidia/openshell-community/sandboxes/base:latest \ - --sandbox-image-pull-policy IfNotPresent \ - --grpc-endpoint http://host.openshell.internal:17670 + --config "$SNAP_COMMON/gateway.toml" ``` This stores the gateway SQLite database at -`/var/snap/openshell/common/gateway.db`. +`/var/snap/openshell/common/gateway.db`. The generated TOML stores Docker +driver settings such as the supervisor binary path, network name, sandbox +namespace, sandbox image, pull policy, and callback endpoint. ## Connect with the OpenShell CLI diff --git a/deploy/snap/bin/openshell-gateway-wrapper b/deploy/snap/bin/openshell-gateway-wrapper new file mode 100755 index 000000000..19e24b52b --- /dev/null +++ b/deploy/snap/bin/openshell-gateway-wrapper @@ -0,0 +1,27 @@ +#!/bin/sh +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -eu + +CONFIG_FILE="${OPENSHELL_GATEWAY_CONFIG:-${SNAP_COMMON}/gateway.toml}" + +if [ ! 
-f "$CONFIG_FILE" ]; then + mkdir -p "$(dirname "$CONFIG_FILE")" + cat > "$CONFIG_FILE" << EOF +[openshell] +version = 1 + +[openshell.drivers.docker] +default_image = "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" +image_pull_policy = "IfNotPresent" +sandbox_namespace = "docker-snap" +grpc_endpoint = "http://host.openshell.internal:17670" +supervisor_bin = "${SNAP}/bin/openshell-sandbox" +network_name = "openshell-snap" +EOF + chmod 600 "$CONFIG_FILE" +fi + +export OPENSHELL_GATEWAY_CONFIG="$CONFIG_FILE" +exec "${SNAP}/bin/openshell-gateway" "$@" diff --git a/deploy/snap/meta/snap.yaml.in b/deploy/snap/meta/snap.yaml.in index 9444fbbf7..4175da0ac 100644 --- a/deploy/snap/meta/snap.yaml.in +++ b/deploy/snap/meta/snap.yaml.in @@ -31,23 +31,16 @@ apps: - ssh-keys - system-observe gateway: - command: bin/openshell-gateway + command: bin/openshell-gateway-wrapper daemon: simple refresh-mode: endure environment: OPENSHELL_BIND_ADDRESS: 127.0.0.1 OPENSHELL_SERVER_PORT: 17670 OPENSHELL_DB_URL: "sqlite:$SNAP_COMMON/gateway.db?mode=rwc" - OPENSHELL_GRPC_ENDPOINT: http://host.openshell.internal:17670 OPENSHELL_DISABLE_TLS: true OPENSHELL_DRIVERS: docker - OPENSHELL_DOCKER_SUPERVISOR_BIN: "$SNAP/bin/openshell-sandbox" - OPENSHELL_DOCKER_NETWORK_NAME: openshell-snap - OPENSHELL_SANDBOX_IMAGE: ghcr.io/nvidia/openshell-community/sandboxes/base:latest - OPENSHELL_SANDBOX_IMAGE_PULL_POLICY: IfNotPresent - OPENSHELL_SANDBOX_SSH_PORT: 2222 - OPENSHELL_SSH_GATEWAY_HOST: 127.0.0.1 - OPENSHELL_SSH_GATEWAY_PORT: 8080 + OPENSHELL_GATEWAY_CONFIG: "$SNAP_COMMON/gateway.toml" XDG_DATA_HOME: "$SNAP_COMMON" # Used for creating and locating certain sockets. 
XDG_RUNTIME_DIR: "$SNAP_COMMON" diff --git a/docs/kubernetes/ingress.mdx b/docs/kubernetes/ingress.mdx index e4b23101f..3ed9a4cd5 100644 --- a/docs/kubernetes/ingress.mdx +++ b/docs/kubernetes/ingress.mdx @@ -62,21 +62,9 @@ openshell gateway add http:// --name production openshell status ``` -## Configure SSH relay +## SSH Relay -For sandbox SSH connections to work through the external address, set `server.sshGatewayHost` and `server.sshGatewayPort` to the hostname and port that CLI clients can reach: - -```shell -helm upgrade openshell \ - oci://ghcr.io/nvidia/openshell/helm-chart \ - --version \ - --namespace openshell \ - --set grpcRoute.enabled=true \ - --set grpcRoute.gateway.create=true \ - --set grpcRoute.gateway.className=eg \ - --set server.sshGatewayHost= \ - --set server.sshGatewayPort= -``` +Sandbox SSH uses the gateway endpoint registered with the CLI. No separate Helm SSH host or port values are required. ## Next Steps diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 5d9e2c089..bb9997305 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -134,7 +134,6 @@ The most commonly changed values are: | `server.sandboxNamespace` | Namespace where sandbox pods are created. Defaults to the Helm release namespace when left empty. | | `server.sandboxImage` | Default sandbox image used when a sandbox does not specify one. | | `server.grpcEndpoint` | Endpoint that sandbox supervisors use to call back to the gateway. Must be reachable from inside the cluster. | -| `server.sshGatewayHost` / `server.sshGatewayPort` | Public host and port returned to CLI clients for SSH proxy connections. Required when the gateway is exposed externally. | | `server.disableTls` | Run the gateway over plaintext HTTP. Use only behind a trusted transport. | | `server.enableLoopbackServiceHttp` | Enable local plaintext HTTP for loopback sandbox service URLs. Defaults to `true`. 
| | `pkiInitJob.serverDnsNames` / `certManager.serverDnsNames` | Additional gateway server DNS SANs. Wildcard SANs also enable sandbox service URLs under that domain. | diff --git a/docs/reference/gateway-config.mdx b/docs/reference/gateway-config.mdx new file mode 100644 index 000000000..ffb932f63 --- /dev/null +++ b/docs/reference/gateway-config.mdx @@ -0,0 +1,214 @@ +--- +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +title: "Gateway Configuration File" +sidebar-title: "Gateway Config" +description: "Reference for the OpenShell gateway TOML configuration file (RFC 0003)." +keywords: "Generative AI, Cybersecurity, AI Agents, Sandboxing, Gateway, Configuration, TOML, Reference" +position: 5 +--- + +The OpenShell gateway reads its configuration from a TOML file when `--config` or `OPENSHELL_GATEWAY_CONFIG` is set. Gateway process flags and gateway `OPENSHELL_*` environment variables override the file. Compute driver settings live in the driver TOML tables. See [RFC 0003](https://github.com/NVIDIA/OpenShell/blob/main/rfc/0003-gateway-configuration/README.md) for the full schema. + +## Source Precedence + +```text +Gateway CLI flag > gateway OPENSHELL_* env var > TOML file > built-in default +``` + +`database_url` cannot be set in the file; supply it through `OPENSHELL_DB_URL` or `--db-url`. The loader rejects it when it appears in the file. + +## Layout + +The file is rooted at `[openshell]`. Gateway-wide settings live under `[openshell.gateway]`. Each compute driver owns its own `[openshell.drivers.<name>]` table. Shared keys set at gateway scope are inherited into driver tables when not overridden. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +# ... gateway-wide settings ... + +[openshell.gateway.tls] +# ... gateway listener TLS ... + +[openshell.gateway.oidc] +# ... JWT bearer auth ... + +[openshell.drivers.kubernetes] +# ... driver-specific settings ... 
+``` + +## Full Example + +A complete gateway configuration covering every section. Trim to the fields you need. + +```toml +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" + +log_level = "info" + +# When empty the gateway auto-detects (Kubernetes -> Podman -> Docker). VM is +# never auto-detected and requires an explicit entry here. +compute_drivers = ["kubernetes"] + +sandbox_namespace = "openshell" + +# Subject Alternative Names baked into the gateway server certificate. +# Wildcard DNS SANs (e.g. "*.dev.openshell.localhost") also enable sandbox +# service URLs under that domain. +server_sans = ["openshell", "*.dev.openshell.localhost"] +# Allow plaintext HTTP routing for loopback sandbox service URLs. +enable_loopback_service_http = true + +# Shared driver defaults — inherited into [openshell.drivers.] tables +# when the driver-specific table does not override them. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-client-tls" + +# Gateway listener TLS (distinct from the per-driver guest_tls_*). +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" +allow_unauthenticated = false + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" +jwks_ttl_secs = 3600 +roles_claim = "realm_access.roles" +admin_role = "openshell-admin" +user_role = "openshell-user" +``` + +`image_pull_policy` is intentionally not a shared gateway key. Kubernetes uses `Always | IfNotPresent | Never` while Podman uses `always | missing | never | newer`. 
Set it inside the relevant driver table. + +## Per-Driver Examples + +### Kubernetes + +The gateway runs as a Pod and creates sandbox Pods in another namespace. mTLS material for sandboxes is delivered via a Kubernetes Secret rather than host-side file paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" +log_level = "info" +compute_drivers = ["kubernetes"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +client_tls_secret_name = "openshell-client-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell-tls/server/tls.crt" +key_path = "/etc/openshell-tls/server/tls.key" +client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" +# Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" +# on older clusters or where the ImageVolume feature gate is off. +supervisor_sideload_method = "image-volume" +``` + +### Docker + +Sandboxes run as containers on a local bridge network. The supervisor binary is bind-mounted from the host (no in-cluster image pull required); guest mTLS material is supplied as host paths. 
+ +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["docker"] + +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.docker] +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +image_pull_policy = "IfNotPresent" +sandbox_namespace = "docker-dev" +grpc_endpoint = "https://host.openshell.internal:8080" +network_name = "openshell-docker" +# Skip the image-pull-and-extract step by pointing at a locally built binary. +supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" +``` + +### Podman + +Sandboxes run as Podman containers on a user-mode bridge network. The supervisor image is mounted read-only via Podman's `type=image` mount; guest mTLS material is supplied as host paths. + +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["podman"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.podman] +# Rootless socket path. For root Podman use /run/podman/podman.sock. +socket_path = "/run/user/1000/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer +``` + +### MicroVM + +Each sandbox runs inside its own libkrun microVM managed by the standalone `openshell-driver-vm` subprocess. Use this driver when you want stronger isolation than container namespaces alone. 
+ +```toml +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +# VM is never auto-detected; an explicit entry here is required. +compute_drivers = ["vm"] + +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" +guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" +guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" + +[openshell.drivers.vm] +grpc_endpoint = "https://host.containers.internal:8080" +state_dir = "/var/lib/openshell/vm" +# Where the gateway looks for the openshell-driver-vm subprocess binary. +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 +``` diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index 88d2da3f8..9521353c0 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -29,8 +29,8 @@ Common gateway options: | Option | Environment variable | Description | |---|---|---| | `--drivers ` | `OPENSHELL_DRIVERS` | Select the compute driver. Supported values are `docker`, `podman`, `kubernetes`, and `vm`. | -| `--sandbox-image ` | `OPENSHELL_SANDBOX_IMAGE` | Set the default sandbox image used when a sandbox create request does not specify one. | -| `--grpc-endpoint ` | `OPENSHELL_GRPC_ENDPOINT` | Set the gateway callback endpoint that sandbox workloads use to connect back to OpenShell. | + +Set driver-specific values such as sandbox images, callback endpoints, network names, TLS material, and VM sizing in the gateway TOML file. See the [Gateway Configuration File](./gateway-config) reference for the full `[openshell.drivers.]` schema. Sandbox create supports `--cpu` and `--memory` for per-sandbox compute sizing. Docker and Podman apply them as runtime limits. Kubernetes applies them as both @@ -45,13 +45,7 @@ The gateway talks to the Docker daemon to create sandbox containers. 
Docker is a For maintainer-level implementation details, refer to the [Docker driver README](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-docker/README.md). -| Option | Environment variable | Description | -|---|---|---| -| `--drivers docker` | `OPENSHELL_DRIVERS=docker` | Select the Docker compute driver. | -| `--docker-network-name ` | `OPENSHELL_DOCKER_NETWORK_NAME` | Override the bridge network used by Docker sandbox containers. | -| `--docker-supervisor-bin ` | `OPENSHELL_DOCKER_SUPERVISOR_BIN` | Use a local Linux `openshell-sandbox` binary instead of resolving or extracting one automatically. | -| `--docker-supervisor-image ` | `OPENSHELL_DOCKER_SUPERVISOR_IMAGE` | Override the image used to extract the Linux `openshell-sandbox` binary. | -| `--docker-tls-ca`, `--docker-tls-cert`, `--docker-tls-key` | `OPENSHELL_DOCKER_TLS_CA`, `OPENSHELL_DOCKER_TLS_CERT`, `OPENSHELL_DOCKER_TLS_KEY` | Mount sandbox client TLS materials into Docker containers for mTLS callback to the gateway. | +Select Docker with `--drivers docker` or `OPENSHELL_DRIVERS=docker`. Configure Docker driver values such as `grpc_endpoint`, `network_name`, `supervisor_bin`, `supervisor_image`, `image_pull_policy`, and `guest_tls_*` in `[openshell.drivers.docker]`. For GPU-backed Docker sandboxes, configure Docker CDI before starting the gateway so OpenShell can detect the daemon capability. @@ -63,14 +57,7 @@ The gateway talks to the Podman API socket. The Podman driver requires Podman 5. For maintainer-level implementation details, refer to the [Podman driver README](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-podman/README.md) and [Podman networking notes](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-podman/NETWORKING.md). -| Option | Environment variable | Description | -|---|---|---| -| `--drivers podman` | `OPENSHELL_DRIVERS=podman` | Select the Podman compute driver. 
| -| None | `OPENSHELL_PODMAN_SOCKET` | Override the Podman API socket path. | -| None | `OPENSHELL_NETWORK_NAME` | Override the Podman bridge network. | -| None | `OPENSHELL_SUPERVISOR_IMAGE` | Override the image containing the `openshell-sandbox` supervisor binary. | -| None | `OPENSHELL_STOP_TIMEOUT` | Set the container stop timeout in seconds. | -| None | `OPENSHELL_PODMAN_TLS_CA`, `OPENSHELL_PODMAN_TLS_CERT`, `OPENSHELL_PODMAN_TLS_KEY` | Mount sandbox client TLS materials into Podman containers for mTLS callback to the gateway. | +Select Podman with `--drivers podman` or `OPENSHELL_DRIVERS=podman`. Configure Podman driver values such as `socket_path`, `network_name`, `supervisor_image`, `stop_timeout_secs`, `image_pull_policy`, `grpc_endpoint`, and `guest_tls_*` in `[openshell.drivers.podman]`. ## MicroVM Driver @@ -92,15 +79,7 @@ openshell-gateway --drivers vm For a service, set `OPENSHELL_DRIVERS=vm` in the service environment file and restart the service. Homebrew creates `$(brew --prefix)/var/openshell/gateway.env` with a commented `OPENSHELL_DRIVERS=vm` entry. Debian and RPM user services read `~/.config/openshell/gateway.env`. -| Option | Environment variable | Description | -|---|---|---| -| `--drivers vm` | `OPENSHELL_DRIVERS=vm` | Select the VM compute driver. VM is never auto-detected. | -| `--driver-dir ` | `OPENSHELL_DRIVER_DIR` | Search a custom directory for `openshell-driver-vm`. | -| `--vm-driver-state-dir ` | `OPENSHELL_VM_DRIVER_STATE_DIR` | Store VM rootfs, console logs, runtime state, image-rootfs cache, and the private `run/compute-driver.sock` socket under this directory. | -| `--vm-driver-vcpus ` | `OPENSHELL_VM_DRIVER_VCPUS` | Set the default vCPU count for VM sandboxes. | -| `--vm-driver-mem-mib ` | `OPENSHELL_VM_DRIVER_MEM_MIB` | Set the default memory allocation for VM sandboxes in MiB. | -| `--vm-krun-log-level ` | `OPENSHELL_VM_KRUN_LOG_LEVEL` | Set the libkrun log level for VM helper processes. 
| -| `--vm-tls-ca`, `--vm-tls-cert`, `--vm-tls-key` | `OPENSHELL_VM_TLS_CA`, `OPENSHELL_VM_TLS_CERT`, `OPENSHELL_VM_TLS_KEY` | Copy sandbox client TLS materials into VM guests for mTLS callback to the gateway. | +Select VM with `--drivers vm` or `OPENSHELL_DRIVERS=vm`. Configure VM driver values such as `grpc_endpoint`, `driver_dir`, `state_dir`, `vcpus`, `mem_mib`, `krun_log_level`, and `guest_tls_*` in `[openshell.drivers.vm]`. The gateway starts `openshell-driver-vm` over a private Unix socket and passes its process ID so the driver can reject unexpected local clients. The driver's standalone TCP listener is disabled unless `--allow-unauthenticated-tcp` is set for local development. @@ -120,14 +99,14 @@ Helm deployments set Kubernetes driver values through the chart. For maintainer-level implementation details, refer to the [Kubernetes driver README](https://github.com/NVIDIA/OpenShell/blob/main/crates/openshell-driver-kubernetes/README.md). -| Gateway option | Environment variable | Helm value | Description | -|---|---|---|---| -| `--drivers kubernetes` | `OPENSHELL_DRIVERS=kubernetes` | Not applicable | Select the Kubernetes compute driver. | -| `--sandbox-namespace ` | `OPENSHELL_SANDBOX_NAMESPACE` | `server.sandboxNamespace` | Set the namespace for sandbox resources. The Helm chart defaults to the release namespace when left empty. | -| `--sandbox-image ` | `OPENSHELL_SANDBOX_IMAGE` | `server.sandboxImage` | Set the default sandbox image. | -| `--sandbox-image-pull-policy ` | `OPENSHELL_SANDBOX_IMAGE_PULL_POLICY` | `server.sandboxImagePullPolicy` | Set the Kubernetes image pull policy for sandbox pods. | -| `--grpc-endpoint ` | `OPENSHELL_GRPC_ENDPOINT` | `server.grpcEndpoint` | Set the gateway callback endpoint reachable from sandbox pods. | -| `--client-tls-secret-name ` | `OPENSHELL_CLIENT_TLS_SECRET_NAME` | `server.tls.clientTlsSecretName` | Mount sandbox client TLS materials from a Kubernetes secret. 
| -| Not applicable | Not applicable | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect from cluster version. Set to `image-volume` to mount the supervisor OCI image directly as a volume (requires Kubernetes 1.33+ with the ImageVolume feature gate; GA in 1.36), or `init-container` to copy it through an init container on older clusters. | +| Gateway configuration | Helm value | Description | +|---|---|---| +| `compute_drivers = ["kubernetes"]` or `--drivers kubernetes` | Not applicable | Select the Kubernetes compute driver. | +| `[openshell.drivers.kubernetes].namespace` | `server.sandboxNamespace` | Set the namespace for sandbox resources. The Helm chart defaults to the release namespace when left empty. | +| `default_image` | `server.sandboxImage` | Set the default sandbox image. | +| `image_pull_policy` | `server.sandboxImagePullPolicy` | Set the Kubernetes image pull policy for sandbox pods. | +| `grpc_endpoint` | `server.grpcEndpoint` | Set the gateway callback endpoint reachable from sandbox pods. | +| `client_tls_secret_name` | `server.tls.clientTlsSecretName` | Mount sandbox client TLS materials from a Kubernetes secret. | +| `supervisor_sideload_method` | `supervisor.sideloadMethod` | How the supervisor binary is delivered into sandbox pods. Leave empty to auto-detect from cluster version. Set to `image-volume` to mount the supervisor OCI image directly as a volume (requires Kubernetes 1.33+ with the ImageVolume feature gate; GA in 1.36), or `init-container` to copy it through an init container on older clusters. | The Kubernetes driver creates namespaced `agents.x-k8s.io/v1alpha1` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. The Agent Sandbox controller turns those resources into sandbox pods and related storage. 
diff --git a/docs/security/best-practices.mdx b/docs/security/best-practices.mdx index 5123c8c12..0c86069e1 100644 --- a/docs/security/best-practices.mdx +++ b/docs/security/best-practices.mdx @@ -71,8 +71,8 @@ This provides defense-in-depth: even if a container escape vulnerability exists, | Aspect | Detail | |---|---| -| Default | Disabled. Set `server.enableUserNamespaces: true` in the Helm values or `OPENSHELL_ENABLE_USER_NAMESPACES=true` as an environment variable to enable cluster-wide. | -| What you can change | Enable cluster-wide through Helm or environment variable. Override per-sandbox through the `user_namespaces` field on `SandboxTemplate` in the API. | +| Default | Disabled. Set `server.enableUserNamespaces: true` in Helm values or `enable_user_namespaces = true` in the gateway config to enable cluster-wide. | +| What you can change | Enable cluster-wide through Helm or gateway config. Override per-sandbox through the `user_namespaces` field on `SandboxTemplate` in the API. | | Prerequisites | Kubernetes 1.33+ with user namespace support available (beta through 1.35, GA in 1.36+), a container runtime that supports user namespaces (containerd 2.0+, CRI-O 1.25+), and Linux 5.12+ for ID-mapped mounts. | | Risk if enabled with GPU | NVIDIA device plugin compatibility with user namespaces is unverified. OpenShell logs a warning when both GPU and user namespaces are active on the same sandbox. | | Recommendation | Enable on non-GPU clusters running Kubernetes with user namespace support available (1.33+ beta, 1.36+ GA) for stronger host isolation. Test GPU workloads separately before enabling on GPU clusters. | diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index f6f34fbb5..83e4185f2 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -448,27 +448,61 @@ else fi echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." 
+ +# Driver-specific options moved from CLI flags into a TOML config table +# (commit 560550d2). Synthesize a minimal config here and pass --config. +# Quote a value as a TOML basic string: wrap in double quotes and escape +# any embedded backslashes / double quotes. Adequate for paths, image +# refs, and namespace identifiers — none of which contain TOML special +# characters in practice. +toml_string() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '"%s"' "${value}" +} + +GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" +{ + printf '[openshell]\nversion = 1\n\n' + printf '[openshell.gateway]\nlog_level = "info"\n\n' + printf '[openshell.drivers.docker]\n' + printf 'sandbox_namespace = %s\n' "$(toml_string "${E2E_NAMESPACE}")" + printf 'network_name = %s\n' "$(toml_string "${DOCKER_NETWORK_NAME}")" + printf 'grpc_endpoint = %s\n' "$(toml_string "${GATEWAY_ENDPOINT}")" + printf 'default_image = %s\n' "$(toml_string "${SANDBOX_IMAGE}")" + printf 'image_pull_policy = "IfNotPresent"\n' + printf 'guest_tls_ca = %s\n' "$(toml_string "${PKI_DIR}/ca.crt")" + printf 'guest_tls_cert = %s\n' "$(toml_string "${PKI_DIR}/client.crt")" + printf 'guest_tls_key = %s\n' "$(toml_string "${PKI_DIR}/client.key")" + # DOCKER_SUPERVISOR_ARGS holds either ("--docker-supervisor-bin" "") + # or ("--docker-supervisor-image" ""); both map to TOML keys on + # the docker driver config. 
+ for ((i=0; i<${#DOCKER_SUPERVISOR_ARGS[@]}; i+=2)); do + case "${DOCKER_SUPERVISOR_ARGS[$i]}" in + --docker-supervisor-bin) + printf 'supervisor_bin = %s\n' "$(toml_string "${DOCKER_SUPERVISOR_ARGS[$((i+1))]}")" + ;; + --docker-supervisor-image) + printf 'supervisor_image = %s\n' "$(toml_string "${DOCKER_SUPERVISOR_ARGS[$((i+1))]}")" + ;; + esac + done + if [ -n "${GATEWAY_HOST_ALIAS_IP}" ]; then + printf 'host_gateway_ip = %s\n' "$(toml_string "${GATEWAY_HOST_ALIAS_IP}")" + fi +} > "${GATEWAY_CONFIG}" + GATEWAY_ARGS=( - --bind-address 0.0.0.0 \ - --port "${HOST_PORT}" \ - --drivers docker \ - --sandbox-namespace "${E2E_NAMESPACE}" \ - --docker-network-name "${DOCKER_NETWORK_NAME}" \ - --tls-cert "${PKI_DIR}/server.crt" \ - --tls-key "${PKI_DIR}/server.key" \ - --tls-client-ca "${PKI_DIR}/ca.crt" \ - --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" \ - --grpc-endpoint "${GATEWAY_ENDPOINT}" \ - "${DOCKER_SUPERVISOR_ARGS[@]}" \ - --docker-tls-ca "${PKI_DIR}/ca.crt" \ - --docker-tls-cert "${PKI_DIR}/client.crt" \ - --docker-tls-key "${PKI_DIR}/client.key" \ - --sandbox-image "${SANDBOX_IMAGE}" \ - --sandbox-image-pull-policy IfNotPresent + --config "${GATEWAY_CONFIG}" + --bind-address 0.0.0.0 + --port "${HOST_PORT}" + --drivers docker + --tls-cert "${PKI_DIR}/server.crt" + --tls-key "${PKI_DIR}/server.key" + --tls-client-ca "${PKI_DIR}/ca.crt" + --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" ) -if [ -n "${GATEWAY_HOST_ALIAS_IP}" ]; then - GATEWAY_ARGS+=(--host-gateway-ip "${GATEWAY_HOST_ALIAS_IP}") -fi e2e_write_gateway_args_file "${GATEWAY_ARGS_FILE}" "${GATEWAY_ARGS[@]}" e2e_export_gateway_restart_metadata \ diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index c7b5ceff7..727737d25 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -144,7 +144,12 @@ ensure_e2e_podman_network() { default_podman_socket_path() { case "$(uname -s)" in Darwin) - printf '%s\n' 
"${HOME}/.local/share/containers/podman/machine/podman.sock" + # On macOS the podman client talks to a VM; the API socket path is + # per-launch (under $TMPDIR) and reported by `podman machine inspect`. + # The legacy ~/.local/share/containers/podman/machine/podman.sock path + # is not created by podman >= 5.x with the applehv/libkrun providers. + podman_cmd machine inspect --format '{{.ConnectionInfo.PodmanSocket.Path}}' 2>/dev/null \ + | awk 'NF { print; exit }' ;; Linux) if [ -n "${XDG_RUNTIME_DIR:-}" ]; then @@ -165,13 +170,26 @@ ensure_podman_api_socket() { fi local default_socket - if default_socket="$(default_podman_socket_path)" \ + default_socket="$(default_podman_socket_path || true)" + if [ -n "${default_socket}" ] \ && [ -S "${default_socket}" ] \ && podman_cmd --url "unix://${default_socket}" info >/dev/null 2>&1; then export OPENSHELL_PODMAN_SOCKET="${default_socket}" return 0 fi + # `podman system service` is a Linux-only subcommand — the macOS client + # delegates the API service to the VM, so we can't spin one up locally. + # If we got here on Darwin, the user's `podman machine` is either not + # running or its socket isn't reachable; surface that directly. + if [ "$(uname -s)" = "Darwin" ]; then + echo "ERROR: could not reach the Podman API socket on macOS." >&2 + echo " Expected socket from 'podman machine inspect': ${default_socket:-}" >&2 + echo " Ensure 'podman machine start' has been run, or set" >&2 + echo " OPENSHELL_PODMAN_SOCKET to a reachable unix socket path." >&2 + exit 2 + fi + PODMAN_SOCKET="${WORKDIR}/podman/podman.sock" mkdir -p "$(dirname "${PODMAN_SOCKET}")" @@ -325,17 +343,47 @@ export OPENSHELL_E2E_NETWORK_NAME="${PODMAN_NETWORK_NAME}" export OPENSHELL_E2E_SANDBOX_NAMESPACE="${E2E_NAMESPACE}" echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." + +# Driver-specific options moved from CLI flags into a TOML config table +# (commit 560550d2). Synthesize a minimal config here and pass --config. 
+# Quote a value as a TOML basic string: see with-docker-gateway.sh for +# the same helper (kept duplicated to avoid sourcing across e2e scripts). +toml_string() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '"%s"' "${value}" +} + +GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" +{ + printf '[openshell]\nversion = 1\n\n' + printf '[openshell.gateway]\nlog_level = "info"\n\n' + printf '[openshell.drivers.podman]\n' + # The Podman driver scopes isolation by network rather than namespace. + printf 'network_name = %s\n' "$(toml_string "${PODMAN_NETWORK_NAME}")" + printf 'gateway_port = %s\n' "${HOST_PORT}" + printf 'default_image = %s\n' "$(toml_string "${SANDBOX_IMAGE}")" + printf 'image_pull_policy = "missing"\n' + printf 'supervisor_image = %s\n' "$(toml_string "${SUPERVISOR_IMAGE}")" + # The in-process Podman driver reads `socket_path` from TOML only — the + # OPENSHELL_PODMAN_SOCKET env var is honoured by the standalone driver + # binary, not the in-process driver used here. Pin the socket to the one + # the harness discovered (e.g. via `podman machine inspect` on macOS) so + # we don't fall back to the driver's stale macOS default. 
+ if [ -n "${OPENSHELL_PODMAN_SOCKET:-}" ]; then + printf 'socket_path = %s\n' "$(toml_string "${OPENSHELL_PODMAN_SOCKET}")" + fi +} > "${GATEWAY_CONFIG}" + GATEWAY_ARGS=( + --config "${GATEWAY_CONFIG}" --bind-address 0.0.0.0 --port "${HOST_PORT}" --health-port "${HEALTH_PORT}" - --ssh-gateway-port "${HOST_PORT}" --drivers podman --disable-tls --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" - --sandbox-namespace "${E2E_NAMESPACE}" - --sandbox-image "${SANDBOX_IMAGE}" - --sandbox-image-pull-policy missing --log-level info ) diff --git a/python/openshell/release_formula_test.py b/python/openshell/release_formula_test.py index 1f005f97e..f2f7bf787 100644 --- a/python/openshell/release_formula_test.py +++ b/python/openshell/release_formula_test.py @@ -53,17 +53,18 @@ def test_generate_homebrew_formula_uses_tagged_macos_driver_asset_without_defaul assert 'sha256 "' + "b" * 64 + '"' in formula assert "OPENSHELL_DRIVERS:" not in formula assert "#OPENSHELL_DRIVERS=vm" in formula - assert 'OPENSHELL_DRIVER_DIR: "#{opt_libexec}"' in formula - assert ( - 'OPENSHELL_DOCKER_SUPERVISOR_IMAGE: "ghcr.io/nvidia/openshell/supervisor:0.0.10"' - ) in formula + assert 'OPENSHELL_GATEWAY_CONFIG: "#{var}/openshell/gateway.toml"' in formula + assert 'driver_dir = "#{opt_libexec}"' in formula + assert 'supervisor_image = "ghcr.io/nvidia/openshell/supervisor:0.0.10"' in formula assert 'run opt_libexec/"openshell-gateway-homebrew-service"' in formula assert ( 'docker_tls_dir="${OPENSHELL_DOCKER_TLS_DIR:-${HOME}/.local/state/openshell/homebrew/tls}"' ) in formula - assert 'export OPENSHELL_DOCKER_TLS_CA="${docker_tls_dir}/ca.crt"' in formula + assert 'guest_tls_ca = "${docker_tls_dir}/ca.crt"' in formula assert 'gateway_env="#{var}/openshell/gateway.env"' in formula assert '. 
"${gateway_env}"' in formula + assert "OPENSHELL_DRIVER_DIR:" not in formula + assert "OPENSHELL_DOCKER_SUPERVISOR_IMAGE:" not in formula assert 'OPENSHELL_DOCKER_TLS_CA: "#{var}/openshell/tls/ca.crt"' not in formula assert "entitlements.atomic_write" in formula assert "brew services restart openshell" in formula diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index dd831e228..028536e9e 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -37,7 +37,7 @@ Three sources are merged at startup, in descending priority: CLI flags > OPENSHELL_* environment variables > TOML config file > built-in defaults ``` -The TOML file is optional. If neither `--config` nor `OPENSHELL_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. +The TOML file is optional. If neither `--config` nor `OPENSHELL_GATEWAY_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. ### Loading the file @@ -78,18 +78,23 @@ log_level = "info" # (kubernetes → podman → docker). VM is never auto-detected. compute_drivers = ["kubernetes"] -# SSH proxy (gateway-side; driver-side equivalents live under each driver). # Note: database_url is a secret and must be supplied via OPENSHELL_DB_URL # (or --db-url) — it is NOT permitted in the file. ssh_session_ttl_secs = 86400 -ssh_gateway_host = "127.0.0.1" -ssh_gateway_port = 8080 -ssh_connect_path = "/connect/ssh" -sandbox_ssh_port = 2222 + +# Service routing — wildcard DNS SANs in `server_sans` also enable sandbox +# service URLs under that domain. `enable_loopback_service_http` toggles +# plaintext HTTP routing for loopback service URLs. 
+server_sans = ["openshell", "*.dev.openshell.localhost"] +enable_loopback_service_http = true # ────────────────────────────────────────────────────────────────────────────── # TLS / mTLS — when omitted, the gateway listens plaintext (sets --disable-tls) # ────────────────────────────────────────────────────────────────────────────── +# Mirrors --disable-tls / OPENSHELL_DISABLE_TLS. When true, the gateway +# ignores the [openshell.gateway.tls] table below. +disable_tls = false + [openshell.gateway.tls] cert_path = "/etc/openshell/certs/gateway.pem" key_path = "/etc/openshell/certs/gateway-key.pem" @@ -124,6 +129,10 @@ host_gateway_ip = "10.0.0.1" ssh_socket_path = "/run/openshell/ssh.sock" [openshell.drivers.docker] +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +image_pull_policy = "IfNotPresent" +sandbox_namespace = "docker-dev" +grpc_endpoint = "https://host.openshell.internal:8080" network_name = "openshell" supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" # optional override supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" # used to extract bin @@ -134,7 +143,7 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" [openshell.drivers.podman] socket_path = "/run/podman/podman.sock" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -image_pull_policy = "IfNotPresent" +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" network_name = "openshell" stop_timeout_secs = 10 @@ -145,6 +154,7 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" [openshell.drivers.vm] state_dir = "/var/lib/openshell/vm" driver_dir = "/usr/local/libexec/openshell" +grpc_endpoint = "https://host.containers.internal:8080" vcpus = 2 mem_mib = 2048 krun_log_level = 1 @@ -256,11 +266,11 @@ The chart owners can migrate one section at a time: `OPENSHELL_*` env vars and t No part of this RFC has shipped yet. The work breaks down as: 1. 
**Add a config-file loader to `openshell-server`** — define a `GatewayConfigFile` struct that mirrors the schema above, parse it with `serde` + `toml`, and merge it into `openshell_core::Config` plus the per-driver structs in `compute/`. -2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. +2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_GATEWAY_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. 3. **Per-driver deserialization** — give each driver crate (`openshell-driver-{kubernetes,docker,podman,vm}`) a `from_toml` (or `serde::Deserialize`) entry point so the gateway can hand each driver its own table. 4. **Test coverage** — file parsing, env-overrides-file, CLI-overrides-env, partial TLS error, port-collision error, unknown-field rejection, missing driver table fallback. 5. **Helm chart migration** — add `gateway.config` value tree, render the `ConfigMap`, mount it, switch the gateway container to `--config`. Keep the `OPENSHELL_*` env names available as opt-in overrides for secrets. -6. **Example file** — ship `examples/gateway/gateway.example.toml` and link it from the docs reference. +6. **Example file** — ship the per-driver examples on the published docs reference at `docs/reference/gateway-config.mdx`. 7. **Architecture doc update** — reflect the new config sources and precedence in `architecture/gateway.md`. 
## Risks diff --git a/snapcraft.yaml b/snapcraft.yaml index 9ac0a14c7..5f27ead1a 100644 --- a/snapcraft.yaml +++ b/snapcraft.yaml @@ -37,23 +37,16 @@ apps: - ssh-keys - system-observe gateway: - command: bin/openshell-gateway + command: bin/openshell-gateway-wrapper daemon: simple refresh-mode: endure environment: OPENSHELL_BIND_ADDRESS: 127.0.0.1 OPENSHELL_SERVER_PORT: 17670 OPENSHELL_DB_URL: "sqlite:$SNAP_COMMON/gateway.db?mode=rwc" - OPENSHELL_GRPC_ENDPOINT: http://host.openshell.internal:17670 OPENSHELL_DISABLE_TLS: "true" OPENSHELL_DRIVERS: docker - OPENSHELL_DOCKER_SUPERVISOR_BIN: "$SNAP/bin/openshell-sandbox" - OPENSHELL_DOCKER_NETWORK_NAME: openshell-snap - OPENSHELL_SANDBOX_IMAGE: ghcr.io/nvidia/openshell-community/sandboxes/base:latest - OPENSHELL_SANDBOX_IMAGE_PULL_POLICY: IfNotPresent - OPENSHELL_SANDBOX_SSH_PORT: 2222 - OPENSHELL_SSH_GATEWAY_HOST: 127.0.0.1 - OPENSHELL_SSH_GATEWAY_PORT: 8080 + OPENSHELL_GATEWAY_CONFIG: "$SNAP_COMMON/gateway.toml" XDG_DATA_HOME: "$SNAP_COMMON" XDG_RUNTIME_DIR: "$SNAP_COMMON" plugs: @@ -96,6 +89,8 @@ parts: "$CRAFT_PART_INSTALL/bin/openshell-gateway" install -D -m 0755 "$CRAFT_PROJECT_DIR/target/release/openshell-sandbox" \ "$CRAFT_PART_INSTALL/bin/openshell-sandbox" + install -D -m 0755 "$CRAFT_PROJECT_DIR/deploy/snap/bin/openshell-gateway-wrapper" \ + "$CRAFT_PART_INSTALL/bin/openshell-gateway-wrapper" install -D -m 0644 "$CRAFT_PROJECT_DIR/LICENSE" \ "$CRAFT_PART_INSTALL/usr/share/doc/openshell/LICENSE" install -D -m 0644 "$CRAFT_PROJECT_DIR/README.md" \ diff --git a/tasks/scripts/gateway-docker.sh b/tasks/scripts/gateway-docker.sh index c5b8d37dd..fd9ba0f6e 100644 --- a/tasks/scripts/gateway-docker.sh +++ b/tasks/scripts/gateway-docker.sh @@ -156,6 +156,22 @@ fi chmod +x "${SUPERVISOR_BIN}" mkdir -p "${STATE_DIR}" +CONFIG_PATH="${STATE_DIR}/gateway.toml" +cat >"${CONFIG_PATH}" <"${CONFIG_PATH}" <&2 + exit 2 + ;; + esac +} + explicit_driver="" while [[ "$#" -gt 0 ]]; do case "$1" in @@ -250,6 +263,7 @@ 
STATE_DIR="${OPENSHELL_GATEWAY_STATE_DIR:-${ROOT}/.cache/gateway-${DRIVER}}" SANDBOX_NAMESPACE="${OPENSHELL_SANDBOX_NAMESPACE:-${DRIVER}-dev}" SANDBOX_IMAGE="${OPENSHELL_SANDBOX_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base:latest}" SANDBOX_IMAGE_PULL_POLICY="${OPENSHELL_SANDBOX_IMAGE_PULL_POLICY:-IfNotPresent}" +GRPC_ENDPOINT="${OPENSHELL_GRPC_ENDPOINT:-}" LOG_LEVEL="${OPENSHELL_LOG_LEVEL:-info}" if [[ "${DRIVER}" == "podman" ]]; then @@ -278,6 +292,42 @@ if [[ ! -x "${GATEWAY_BIN}" ]]; then fi mkdir -p "${STATE_DIR}" +CONFIG_PATH="${STATE_DIR}/gateway.toml" +cat >"${CONFIG_PATH}" <>"${CONFIG_PATH}" <>"${CONFIG_PATH}" + fi + ;; + podman) + cat >>"${CONFIG_PATH}" <>"${CONFIG_PATH}" + fi + ;; +esac GATEWAY_ENDPOINT="http://127.0.0.1:${PORT}" register_gateway_metadata "${GATEWAY_NAME}" "${GATEWAY_ENDPOINT}" "${PORT}" @@ -295,11 +345,9 @@ echo "Active gateway set to '${GATEWAY_NAME}'. The CLI now targets this gateway echo exec "${GATEWAY_BIN}" \ + --config "${CONFIG_PATH}" \ --port "${PORT}" \ --log-level "${LOG_LEVEL}" \ --drivers "${DRIVER}" \ --disable-tls \ - --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" \ - --sandbox-namespace "${SANDBOX_NAMESPACE}" \ - --sandbox-image "${SANDBOX_IMAGE}" \ - --sandbox-image-pull-policy "${SANDBOX_IMAGE_PULL_POLICY}" + --db-url "sqlite:${STATE_DIR}/gateway.db?mode=rwc" diff --git a/tasks/scripts/package-deb.sh b/tasks/scripts/package-deb.sh index 9d7e3d328..5705e3385 100755 --- a/tasks/scripts/package-deb.sh +++ b/tasks/scripts/package-deb.sh @@ -115,6 +115,8 @@ stage_binary "$OPENSHELL_DRIVER_VM_BINARY" "$pkgroot/usr/libexec/openshell/opens # Per-user systemd unit. Each user enables it via `systemctl --user`. 
install -D -m 0644 "$src_dir/openshell-gateway.service" \ "$pkgroot/usr/lib/systemd/user/openshell-gateway.service" +install -D -m 0755 "$src_dir/init-gateway-config.sh" \ + "$pkgroot/usr/libexec/openshell/init-gateway-config.sh" # --------------------------------------------------------------------------- # DEBIAN/ control directory diff --git a/tasks/scripts/package-snap.sh b/tasks/scripts/package-snap.sh index 4aafa4ca4..8c299d352 100755 --- a/tasks/scripts/package-snap.sh +++ b/tasks/scripts/package-snap.sh @@ -182,6 +182,8 @@ fi stage_binary "$OPENSHELL_CLI_BINARY" "$snap_root/bin/openshell" stage_binary "$OPENSHELL_GATEWAY_BINARY" "$snap_root/bin/openshell-gateway" stage_binary "$OPENSHELL_DOCKER_SUPERVISOR_BINARY" "$snap_root/bin/openshell-sandbox" +install -D -m 0755 "${repo_root}/deploy/snap/bin/openshell-gateway-wrapper" \ + "$snap_root/bin/openshell-gateway-wrapper" install -D -m 0644 "${repo_root}/LICENSE" "$snap_root/usr/share/doc/openshell/LICENSE" install -D -m 0644 "${repo_root}/README.md" "$snap_root/usr/share/doc/openshell/README.md" diff --git a/tasks/scripts/release.py b/tasks/scripts/release.py index 7406e1adb..df61e0907 100644 --- a/tasks/scripts/release.py +++ b/tasks/scripts/release.py @@ -289,6 +289,13 @@ def install exit 1 fi + gateway_env="#{{var}}/openshell/gateway.env" + if [ -f "${{gateway_env}}" ]; then + set -a + . 
"${{gateway_env}}" + set +a + fi + docker_tls_dir="${{OPENSHELL_DOCKER_TLS_DIR:-${{HOME}}/.local/state/openshell/homebrew/tls}}" mkdir -p "${{docker_tls_dir}}/client" chmod 700 "${{docker_tls_dir}}" "${{docker_tls_dir}}/client" @@ -296,17 +303,36 @@ def install /usr/bin/install -m 0644 "#{{var}}/openshell/tls/client/tls.crt" "${{docker_tls_dir}}/client/tls.crt" /usr/bin/install -m 0600 "#{{var}}/openshell/tls/client/tls.key" "${{docker_tls_dir}}/client/tls.key" - export OPENSHELL_DOCKER_TLS_CA="${{docker_tls_dir}}/ca.crt" - export OPENSHELL_DOCKER_TLS_CERT="${{docker_tls_dir}}/client/tls.crt" - export OPENSHELL_DOCKER_TLS_KEY="${{docker_tls_dir}}/client/tls.key" - - gateway_env="#{{var}}/openshell/gateway.env" - if [ -f "${{gateway_env}}" ]; then - set -a - . "${{gateway_env}}" - set +a + gateway_config="${{OPENSHELL_GATEWAY_CONFIG:-#{{var}}/openshell/gateway.toml}}" + if [ ! -f "${{gateway_config}}" ]; then + mkdir -p "$(dirname "${{gateway_config}}")" "#{{var}}/openshell/vm-driver" + cat > "${{gateway_config}}" < Starting gateway on port $PORT (state=$STATE_DIR, health=$health_port)" mkdir -p "$STATE_DIR" + cat >"$config" < "$LOG" 2>&1 &