From 5eba482b01c5ca6701f0274f15e80d1805f3a131 Mon Sep 17 00:00:00 2001 From: Bill Minckler Date: Mon, 29 Jun 2026 17:34:38 +0000 Subject: [PATCH 1/3] fix: clarify missing-vendor exploration error and add BMC connection circuit breaker Two related reliability fixes for unreachable BMCs. site-explorer: when a BMC's Redfish ServiceRoot does not yield a recognized vendor, the recorded exploration error now reports the raw Vendor/Oem string we actually read (and where from), distinguishing a BMC that reported nothing (commonly transient while it is still initializing) from one that reported a vendor we don't support yet. The exploration metric label is split accordingly (vendor_not_reported vs vendor_unrecognized) so the two cases can be alerted on separately. health: add a per-endpoint connection circuit breaker on the shared BmcClient. Previously, when a BMC stopped answering at the network level, every collector sharing the endpoint kept firing requests that each blocked for a full TCP connect timeout -- hundreds per sensor sweep, each logging a warning. The breaker classifies connect/timeout failures (not 404s/auth/decode errors), opens after the first one, and fast-fails subsequent requests with exponential backoff and a single half-open probe to self-heal. The healthy path stays lock-free via an atomic hint. The sensor sweep skips its fan-out while the circuit is open and treats breaker fast-fails as skips rather than per-sensor failures, so a dead endpoint no longer produces a log flood. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/api-model/src/site_explorer/mod.rs | 15 +- crates/health/src/bmc.rs | 501 ++++++++++++++++++++-- crates/health/src/collectors/sensors.rs | 35 ++ crates/site-explorer/src/metrics.rs | 6 +- crates/site-explorer/src/redfish.rs | 23 +- 5 files changed, 543 insertions(+), 37 deletions(-) diff --git a/crates/api-model/src/site_explorer/mod.rs b/crates/api-model/src/site_explorer/mod.rs index 68fc57858f..0fe7b5567b 100644 --- a/crates/api-model/src/site_explorer/mod.rs +++ b/crates/api-model/src/site_explorer/mod.rs @@ -1196,8 +1196,19 @@ pub enum EndpointExplorationError { /// This field just exists here until site-explorer updates existing records #[error("Endpoint is not a BMC with Redfish support at the specified URI")] MissingRedfish { uri: Option }, - #[error("BMC vendor field is not populated. Unsupported BMC.")] - MissingVendor, + /// The BMC's Redfish ServiceRoot (`/redfish/v1`) did not yield a vendor we + /// recognize. `observed` is the raw vendor string we read from the root — + /// the `Vendor` field, falling back to the first `Oem` key. `None` means the + /// BMC reported neither, which is commonly transient while the BMC is still + /// initializing/syncing (exploration will retry). `Some(value)` means the BMC + /// reported a vendor we don't support yet — `value` is what it sent. + #[error( + "BMC ServiceRoot (/redfish/v1) did not report a recognized vendor (observed Vendor/Oem = {observed:?}); an empty value usually means the BMC is still initializing and exploration will retry" + )] + MissingVendor { + #[serde(default)] + observed: Option, + }, #[error( "Site explorer will not explore this endpoint to avoid lockout: it could not login previously" )] diff --git a/crates/health/src/bmc.rs b/crates/health/src/bmc.rs index 9cef572a76..44cd85b200 100644 --- a/crates/health/src/bmc.rs +++ b/crates/health/src/bmc.rs @@ -16,9 +16,9 @@ */ use std::pin::Pin; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::Duration; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex as StdMutex}; +use std::time::{Duration, Instant}; use futures::TryStreamExt; use http::HeaderMap; @@ -40,6 +40,40 @@ use crate::endpoint::{BmcAddr, BmcCredentials}; pub(crate) const CREDENTIAL_REFRESH_TIMEOUT: Duration = Duration::from_secs(30); +/// How long the per-endpoint circuit stays open after the first connect-level +/// failure. Subsequent failed probes double this up to [`CIRCUIT_MAX_BACKOFF`]. +const CIRCUIT_INITIAL_BACKOFF: Duration = Duration::from_secs(5); +/// Upper bound on the circuit backoff window. +const CIRCUIT_MAX_BACKOFF: Duration = Duration::from_secs(300); +/// How long a single half-open probe is allowed to run before the circuit lets +/// another caller probe. This only matters if a probe is lost (e.g. its future +/// is cancelled) — it stops the circuit from latching half-open forever. It is +/// deliberately longer than a BMC connect timeout. +const CIRCUIT_PROBE_TIMEOUT: Duration = Duration::from_secs(60); + +/// Per-endpoint connection circuit breaker state. +/// +/// When a BMC stops answering at the network level, every collector sharing the +/// endpoint's [`BmcClient`] would otherwise keep firing requests that each block +/// for a full TCP connect timeout — hundreds of them per sensor sweep — and log +/// a warning apiece. The breaker short-circuits those requests after the first +/// connect-level failure so a dead endpoint costs one failed probe per backoff +/// window instead of a flood. See NVBug 6036327. +#[derive(Debug)] +enum CircuitState { + /// Requests flow normally. + Closed, + /// Requests fast-fail until `until`; `backoff` is the window that was applied. + Open { until: Instant, backoff: Duration }, + /// A single probe has been let through and is in flight until `deadline`; + /// other callers fast-fail. `backoff` is the window to escalate from if the + /// probe fails. + Probing { + deadline: Instant, + backoff: Duration, + }, +} + pub type BoxFuture<'a, T> = Pin + Send + 'a>>; pub trait CredentialProvider: Send + Sync { @@ -77,6 +111,16 @@ pub struct BmcClient { credential_generation: AtomicU64, init: OnceCell<()>, refresh_lock: Mutex<()>, + circuit: StdMutex, + /// Lock-free fast-path hint mirroring `circuit`: `false` iff the circuit is + /// `Closed`. Lets the healthy request path (the overwhelmingly common case) + /// skip the mutex entirely. It is only ever set to `true` while holding the + /// `circuit` lock as the state leaves `Closed`, and cleared while holding it + /// as the state returns to `Closed`, so the invariant "circuit is blocking + /// ⟹ `circuit_tripped` is `true`" always holds. A stale `false` read just + /// means a request that was already racing an open-transition proceeds — + /// harmless, identical to a request already in flight when the circuit trips. + circuit_tripped: AtomicBool, } impl BmcClient { @@ -108,6 +152,8 @@ impl BmcClient { credential_generation: AtomicU64::new(0), init: OnceCell::new(), refresh_lock: Mutex::new(()), + circuit: StdMutex::new(CircuitState::Closed), + circuit_tripped: AtomicBool::new(false), }) } @@ -196,6 +242,160 @@ impl BmcClient { error } + + /// Run a BMC operation through the connection circuit breaker. + /// + /// Fast-fails (without touching the network) while the circuit is open, and + /// updates the breaker based on the outcome: a success closes the circuit, + /// while a connect-level failure trips it. Non-connection failures (auth, + /// 404s, decode errors, …) pass through untouched — they are not the BMC + /// being unreachable, so they must not open the circuit. + async fn guarded( + &self, + op: impl std::future::Future>, + ) -> Result { + self.check_circuit()?; + match op.await { + Ok(value) => { + self.note_reachable(); + Ok(value) + } + Err(error) => { + if is_connection_error(&error) { + self.trip_circuit(&error); + } + Err(error) + } + } + } + + /// Gate an attempt against the circuit. Returns the fast-fail error while the + /// circuit is open, and otherwise lets the caller proceed — promoting an + /// expired `Open` (or a lost `Probing`) circuit to a fresh half-open probe. + fn check_circuit(&self) -> Result<(), HealthError> { + // Fast path: a healthy (closed) circuit never touches the mutex. + if !self.circuit_tripped.load(Ordering::Acquire) { + return Ok(()); + } + let mut state = self.circuit.lock().expect("circuit mutex poisoned"); + match *state { + CircuitState::Closed => Ok(()), + CircuitState::Open { until, backoff } => { + if Instant::now() >= until { + *state = CircuitState::Probing { + deadline: Instant::now() + CIRCUIT_PROBE_TIMEOUT, + backoff, + }; + Ok(()) + } else { + Err(self.circuit_open_error()) + } + } + CircuitState::Probing { deadline, backoff } => { + // A probe is already in flight; everyone else waits. If the probe + // was lost (deadline passed without a result), let a new one run. + if Instant::now() >= deadline { + *state = CircuitState::Probing { + deadline: Instant::now() + CIRCUIT_PROBE_TIMEOUT, + backoff, + }; + Ok(()) + } else { + Err(self.circuit_open_error()) + } + } + } + } + + /// Record that the BMC answered, closing the circuit if it was open. + fn note_reachable(&self) { + // Fast path: already closed — nothing to do, no lock. + if !self.circuit_tripped.load(Ordering::Acquire) { + return; + } + let mut state = self.circuit.lock().expect("circuit mutex poisoned"); + if !matches!(*state, CircuitState::Closed) { + tracing::info!(endpoint = ?self.addr, "BMC is reachable again; closing connection circuit"); + *state = CircuitState::Closed; + } + // Clear the hint while still holding the lock so it can never lag the + // state into a `Closed`-but-`tripped` window that the fast path would + // wrongly treat as blocking. + self.circuit_tripped.store(false, Ordering::Release); + } + + /// Open (or escalate) the circuit after a connect-level failure. + fn trip_circuit(&self, error: &HealthError) { + let mut state = self.circuit.lock().expect("circuit mutex poisoned"); + match *state { + CircuitState::Closed => { + *state = CircuitState::Open { + until: Instant::now() + CIRCUIT_INITIAL_BACKOFF, + backoff: CIRCUIT_INITIAL_BACKOFF, + }; + tracing::warn!( + endpoint = ?self.addr, + backoff_secs = CIRCUIT_INITIAL_BACKOFF.as_secs(), + error = ?error, + "BMC connect failure; opening connection circuit to stop request flood" + ); + } + CircuitState::Probing { backoff, .. } => { + // The half-open probe failed: keep the circuit open and back off + // further before the next probe. + let next = (backoff * 2).min(CIRCUIT_MAX_BACKOFF); + *state = CircuitState::Open { + until: Instant::now() + next, + backoff: next, + }; + tracing::debug!( + endpoint = ?self.addr, + backoff_secs = next.as_secs(), + "BMC still unreachable; extending connection circuit backoff" + ); + } + // Already open: this is a request that was in flight before the + // circuit opened. Leave the existing window untouched. + CircuitState::Open { .. } => {} + } + // Every branch above leaves the circuit non-`Closed`; publish the hint + // while still holding the lock so the fast path observes it. + self.circuit_tripped.store(true, Ordering::Release); + } + + /// Whether the circuit is currently blocking requests. Lets callers (e.g. the + /// sensor sweep) skip a whole batch of work — and its log spam — instead of + /// fast-failing each request individually. Returns `false` when the open + /// window has elapsed so the next sweep can probe and self-heal. + pub fn circuit_blocks_requests(&self) -> bool { + // Fast path: a closed circuit blocks nothing. + if !self.circuit_tripped.load(Ordering::Acquire) { + return false; + } + let state = self.circuit.lock().expect("circuit mutex poisoned"); + match *state { + CircuitState::Closed => false, + CircuitState::Open { until, .. } => Instant::now() < until, + CircuitState::Probing { deadline, .. } => Instant::now() < deadline, + } + } + + fn circuit_open_error(&self) -> HealthError { + HealthError::GenericError(format!( + "BMC {} is unreachable; request skipped while the connection circuit breaker is open", + self.addr.ip + )) + } + + /// Seed the circuit state and its fast-path hint coherently. Tests use this + /// instead of writing the mutex directly so the `circuit_tripped` invariant + /// is never violated. + #[cfg(test)] + fn set_circuit_for_test(&self, state: CircuitState) { + let tripped = !matches!(state, CircuitState::Closed); + *self.circuit.lock().expect("circuit mutex poisoned") = state; + self.circuit_tripped.store(tripped, Ordering::Release); + } } fn bmc_url(addr: &BmcAddr, proxy_url: Option<&Url>) -> Result { @@ -231,10 +431,13 @@ impl Bmc for BmcClient { self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); match self - .inner - .expand(id, query) + .guarded(async { + self.inner + .expand(id, query) + .await + .map_err(HealthError::from) + }) .await - .map_err(HealthError::from) { Ok(value) => Ok(value), Err(error) => Err(self @@ -249,7 +452,10 @@ impl Bmc for BmcClient { ) -> Result, Self::Error> { self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); - match self.inner.get(id).await.map_err(HealthError::from) { + match self + .guarded(async { self.inner.get(id).await.map_err(HealthError::from) }) + .await + { Ok(value) => Ok(value), Err(error) => Err(self .refresh_auth_if_needed(error, credential_generation) @@ -265,10 +471,13 @@ impl Bmc for BmcClient { self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); match self - .inner - .filter(id, query) + .guarded(async { + self.inner + .filter(id, query) + .await + .map_err(HealthError::from) + }) .await - .map_err(HealthError::from) { Ok(value) => Ok(value), Err(error) => Err(self @@ -283,10 +492,13 @@ impl Bmc for BmcClient { query: &V, ) -> Result, Self::Error> { self.ensure_credentials().await?; - self.inner - .create(id, query) - .await - .map_err(HealthError::from) + self.guarded(async { + self.inner + .create(id, query) + .await + .map_err(HealthError::from) + }) + .await } async fn update< @@ -299,10 +511,13 @@ impl Bmc for BmcClient { update: &V, ) -> Result, Self::Error> { self.ensure_credentials().await?; - self.inner - .update(id, etag, update) - .await - .map_err(HealthError::from) + self.guarded(async { + self.inner + .update(id, etag, update) + .await + .map_err(HealthError::from) + }) + .await } async fn multipart_update( @@ -316,10 +531,13 @@ impl Bmc for BmcClient { V: Send + Sync + Serialize, { self.ensure_credentials().await?; - self.inner - .multipart_update(uri, request) - .await - .map_err(HealthError::from) + self.guarded(async { + self.inner + .multipart_update(uri, request) + .await + .map_err(HealthError::from) + }) + .await } async fn delete Deserialize<'de>>( @@ -327,7 +545,8 @@ impl Bmc for BmcClient { id: &ODataId, ) -> Result, Self::Error> { self.ensure_credentials().await?; - self.inner.delete(id).await.map_err(HealthError::from) + self.guarded(async { self.inner.delete(id).await.map_err(HealthError::from) }) + .await } async fn action< @@ -339,10 +558,13 @@ impl Bmc for BmcClient { params: &T, ) -> Result, Self::Error> { self.ensure_credentials().await?; - self.inner - .action(action, params) - .await - .map_err(HealthError::from) + self.guarded(async { + self.inner + .action(action, params) + .await + .map_err(HealthError::from) + }) + .await } async fn stream Deserialize<'de> + Send + 'static>( @@ -351,7 +573,10 @@ impl Bmc for BmcClient { ) -> Result, Self::Error> { self.ensure_credentials().await?; let credential_generation = self.credential_generation.load(Ordering::Acquire); - match self.inner.stream(uri).await.map_err(HealthError::from) { + match self + .guarded(async { self.inner.stream(uri).await.map_err(HealthError::from) }) + .await + { Ok(stream) => Ok(Box::pin(stream.map_err(HealthError::from))), Err(error) => Err(self .refresh_auth_if_needed(error, credential_generation) @@ -368,10 +593,13 @@ impl Bmc for BmcClient { query: &V, ) -> Result, Self::Error> { self.ensure_credentials().await?; - self.inner - .create_session(id, query) - .await - .map_err(HealthError::from) + self.guarded(async { + self.inner + .create_session(id, query) + .await + .map_err(HealthError::from) + }) + .await } } @@ -402,6 +630,30 @@ fn is_auth_bmc_error(error: &BmcError) -> bool { ) } +/// Whether an error represents the BMC being unreachable at the transport layer +/// (TCP connect refused/timed out, or a request that timed out) — as opposed to +/// the BMC answering with an error. Only these trip the connection circuit +/// breaker; an HTTP 404 or a decode error means the BMC is alive and talking. +pub(crate) fn is_connection_error(error: &HealthError) -> bool { + match error { + HealthError::BmcError(inner) => is_connection_bmc_source_error(inner.as_ref()), + _ => false, + } +} + +fn is_connection_bmc_source_error(error: &(dyn std::error::Error + 'static)) -> bool { + error + .downcast_ref::() + .is_some_and(is_connection_bmc_error) + || error + .downcast_ref::() + .is_some_and(is_connection_error) +} + +fn is_connection_bmc_error(error: &BmcError) -> bool { + matches!(error, BmcError::ReqwestError(e) if e.is_connect() || e.is_timeout()) +} + #[cfg(test)] mod tests { use std::str::FromStr; @@ -474,6 +726,191 @@ mod tests { } } + fn test_client() -> BmcClient { + let (provider, _) = CountingProvider::new( + BmcCredentials::SessionToken { + token: "t".to_string(), + }, + None, + ); + BmcClient::new(reqwest(), test_addr(), provider, None, 10).expect("constructor ok") + } + + fn dummy_error() -> HealthError { + HealthError::GenericError("boom".to_string()) + } + + #[test] + fn non_connection_errors_do_not_trip_the_circuit() { + // 401/403, 404, and generic errors mean the BMC answered (or the failure + // is unrelated to reachability) — they must not open the breaker. + assert!(!is_connection_error(&HealthError::BmcError(Box::new( + bmc_status_error(http::StatusCode::UNAUTHORIZED) + )))); + assert!(!is_connection_error(&HealthError::BmcError(Box::new( + bmc_status_error(http::StatusCode::NOT_FOUND) + )))); + assert!(!is_connection_error(&HealthError::HttpError( + "HTTP 404".to_string() + ))); + assert!(!is_connection_error(&dummy_error())); + } + + #[tokio::test] + async fn real_connect_failure_is_classified_and_trips_the_circuit() { + // Reserve an ephemeral port, then release it, so connecting to it is + // refused — a real, deterministic transport-level failure with no + // fixed-port collision and no waiting on a timeout. This exercises the + // whole chain end to end (reqwest error -> BmcError -> HealthError -> + // is_connection_error -> trip_circuit), guarding the assumption that a + // genuine connect failure — the production flood was `Connect, TimedOut` + // — is actually classified as a connection error. See NVBug 6036327. + let listener = std::net::TcpListener::bind("127.0.0.1:0").expect("bind ephemeral port"); + let port = listener.local_addr().expect("local addr").port(); + drop(listener); + + let (provider, _) = CountingProvider::new( + BmcCredentials::SessionToken { + token: "t".to_string(), + }, + None, + ); + let addr = BmcAddr { + ip: "127.0.0.1".parse().expect("loopback ip"), + port: Some(port), + mac: MacAddress::from_str("00:11:22:33:44:55").expect("mac"), + }; + let client = + Arc::new(BmcClient::new(reqwest(), addr, provider, None, 10).expect("constructor ok")); + + assert!(!client.circuit_blocks_requests(), "circuit starts closed"); + + // Any real Redfish read against the closed port fails to connect. + let result = nv_redfish::ServiceRoot::new(client.clone()).await; + assert!(result.is_err(), "connecting to a closed port must fail"); + + assert!( + client.circuit_blocks_requests(), + "a genuine connect failure must be classified as a connection error and open the breaker" + ); + } + + #[test] + fn circuit_opens_on_failure_then_closes_on_success() { + let client = test_client(); + + // Starts closed: requests flow. + assert!(!client.circuit_blocks_requests()); + assert!(client.check_circuit().is_ok()); + + // A connect-level failure opens the circuit. + client.trip_circuit(&dummy_error()); + assert!(client.circuit_blocks_requests()); + assert!( + client.check_circuit().is_err(), + "open circuit must fast-fail" + ); + + // A success closes it again. + client.note_reachable(); + assert!(!client.circuit_blocks_requests()); + assert!(client.check_circuit().is_ok()); + } + + #[test] + fn fast_path_hint_tracks_circuit_state() { + let client = test_client(); + + // Closed: hint clear. + assert!(!client.circuit_tripped.load(Ordering::Acquire)); + + // Tripped: hint set so the lock-free fast path consults the lock. + client.trip_circuit(&dummy_error()); + assert!(client.circuit_tripped.load(Ordering::Acquire)); + + // Reachable again: hint cleared so the fast path stays lock-free. + client.note_reachable(); + assert!(!client.circuit_tripped.load(Ordering::Acquire)); + + // Promoting an expired Open to a half-open probe keeps the hint set + // (still non-closed). + client.set_circuit_for_test(CircuitState::Open { + until: Instant::now() - Duration::from_secs(1), + backoff: CIRCUIT_INITIAL_BACKOFF, + }); + assert!(client.check_circuit().is_ok()); + assert!(client.circuit_tripped.load(Ordering::Acquire)); + } + + #[test] + fn expired_open_circuit_admits_exactly_one_probe() { + let client = test_client(); + + // Simulate an open window that has already elapsed. + client.set_circuit_for_test(CircuitState::Open { + until: Instant::now() - Duration::from_secs(1), + backoff: CIRCUIT_INITIAL_BACKOFF, + }); + + // The first caller is let through as the probe... + assert!(client.check_circuit().is_ok(), "probe should be admitted"); + assert!( + matches!( + *client.circuit.lock().unwrap(), + CircuitState::Probing { .. } + ), + "circuit should be half-open after admitting a probe" + ); + // ...and everyone else keeps fast-failing while the probe is in flight. + assert!(client.check_circuit().is_err()); + assert!(client.circuit_blocks_requests()); + } + + #[test] + fn failed_probe_escalates_backoff() { + let client = test_client(); + + client.set_circuit_for_test(CircuitState::Probing { + deadline: Instant::now() + CIRCUIT_PROBE_TIMEOUT, + backoff: CIRCUIT_INITIAL_BACKOFF, + }); + + client.trip_circuit(&dummy_error()); + + match *client.circuit.lock().unwrap() { + CircuitState::Open { backoff, .. } => assert_eq!( + backoff, + (CIRCUIT_INITIAL_BACKOFF * 2).min(CIRCUIT_MAX_BACKOFF), + "a failed probe must double the backoff window" + ), + ref other => panic!("expected Open after failed probe, got {other:?}"), + } + } + + #[test] + fn stale_failure_while_open_does_not_extend_backoff() { + let client = test_client(); + + client.set_circuit_for_test(CircuitState::Open { + until: Instant::now() + CIRCUIT_INITIAL_BACKOFF, + backoff: CIRCUIT_INITIAL_BACKOFF, + }); + + // A request that was already in flight when the circuit opened fails. It + // must not push the backoff window out further. + client.trip_circuit(&dummy_error()); + + match *client.circuit.lock().unwrap() { + CircuitState::Open { backoff, .. } => { + assert_eq!( + backoff, CIRCUIT_INITIAL_BACKOFF, + "backoff must be unchanged" + ) + } + ref other => panic!("expected Open, got {other:?}"), + } + } + #[test] fn detects_auth_bmc_errors() { assert!(is_auth_bmc_error(&bmc_status_error( diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index 0b4926a5a6..38e33e5f31 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -102,6 +102,24 @@ impl PeriodicCollector for SensorCollector { }); }; + // If the endpoint's connection circuit breaker is open, the BMC is known + // to be unreachable. Skip the whole sensor fan-out rather than firing one + // request per sensor (each blocking for a connect timeout and logging a + // warning). When the backoff window elapses the breaker stops blocking, + // so the next sweep runs and one fetch probes for recovery. See NVBug + // 6036327. + if self.endpoint.bmc.circuit_blocks_requests() { + tracing::debug!( + bmc_addr = ?self.endpoint.addr, + "BMC connection circuit is open; skipping sensor iteration" + ); + return Ok(IterationResult { + refresh_triggered: false, + entity_count: None, + fetch_failures: 0, + }); + } + tracing::debug!( bmc_addr = ?self.endpoint.addr, generation = inventory.generation, @@ -199,6 +217,23 @@ impl SensorCollector { let sensor = match sensor_link.fetch().await { Ok(s) => s, Err(e) => { + // When the endpoint's connection circuit breaker is blocking, this + // failure is a symptom of the BMC being unreachable — either a + // breaker fast-fail or the single recovery probe timing out — not a + // problem with this sensor. Log it quietly and don't inflate the + // fetch-failure count; otherwise a dead BMC emits a warning per + // sensor every time the backoff window elapses and a sweep probes. + // The breaker logs one warning of its own when it opens. See NVBug + // 6036327. + if self.endpoint.bmc.circuit_blocks_requests() { + tracing::debug!( + sensor_id = %sensor_link.odata_id(), + entity_type = entity.entity_type(), + error = ?e, + "Skipping sensor fetch failure; BMC connection circuit is open" + ); + return 0; + } fetch_failures.fetch_add(1, Ordering::Relaxed); tracing::warn!( sensor_id = %sensor_link.odata_id(), diff --git a/crates/site-explorer/src/metrics.rs b/crates/site-explorer/src/metrics.rs index a3cbeeb214..faee96957f 100644 --- a/crates/site-explorer/src/metrics.rs +++ b/crates/site-explorer/src/metrics.rs @@ -922,7 +922,11 @@ pub fn exploration_error_to_metric_label(error: &EndpointExplorationError) -> St EndpointExplorationError::MissingCredentials { .. } => "missing_credentials", EndpointExplorationError::SetCredentials { .. } => "set_credentials", EndpointExplorationError::MissingRedfish { .. } => "missing_redfish", - EndpointExplorationError::MissingVendor => "missing_vendor", + // Distinguish a BMC that reported no vendor at all (commonly transient, + // still initializing) from one that reported a vendor we don't support, + // so the two failure modes can be alerted on separately. See NVBug 6036327. + EndpointExplorationError::MissingVendor { observed: None } => "vendor_not_reported", + EndpointExplorationError::MissingVendor { observed: Some(_) } => "vendor_unrecognized", EndpointExplorationError::AvoidLockout => "avoid_lockout", EndpointExplorationError::Other { .. } => "other", EndpointExplorationError::VikingFWInventoryForbiddenError { .. } => { diff --git a/crates/site-explorer/src/redfish.rs b/crates/site-explorer/src/redfish.rs index d29a33e56a..b16ad3b196 100644 --- a/crates/site-explorer/src/redfish.rs +++ b/crates/site-explorer/src/redfish.rs @@ -139,8 +139,19 @@ impl RedfishClient { match service_root.vendor() { Some(vendor) if vendor != RedfishVendor::Unknown => Ok(vendor), _ => { - tracing::info!("No recognized vendor for BMC at {bmc_ip_address}"); - Err(EndpointExplorationError::MissingVendor) + // Capture the raw vendor string the ServiceRoot actually reported + // (the `Vendor` field, falling back to the first `Oem` key) so the + // recorded exploration error says *what* we read and *where* from. + // `None` here means the BMC reported neither — usually transient + // while it is still initializing; `Some(_)` means a vendor we don't + // recognize yet. See NVBug 6036327. + let observed = service_root.vendor_string(); + tracing::info!( + %bmc_ip_address, + observed_vendor = ?observed, + "BMC ServiceRoot did not report a recognized vendor" + ); + Err(EndpointExplorationError::MissingVendor { observed }) } } } @@ -273,6 +284,14 @@ impl RedfishClient { .map_err(map_redfish_error)?; } RedfishVendor::Unknown => { + // Defensive guard: callers in the explorer resolve the vendor via + // `get_redfish_vendor`, which rejects `Unknown` (as `MissingVendor`) + // before we ever get here, so this arm is not reachable from the + // live exploration path. Note that an unrecognized *raw* vendor + // string is already collapsed to `Unknown` by libredfish, so the + // original name is unavailable at this point — the meaningful + // capture happens in `get_redfish_vendor`. If this ever fires it + // signals an internal logic error rather than a parsed vendor. return Err(EndpointExplorationError::UnsupportedVendor { vendor: vendor.to_string(), }); From 97f9c4d1fa0b647d635349ec51f2961e4b059edd Mon Sep 17 00:00:00 2001 From: Bill Minckler Date: Mon, 29 Jun 2026 19:04:14 +0000 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20address=20review=20=E2=80=94=20close?= =?UTF-8?q?=20breaker=20on=20probe=20response,=20single-probe=20sensor=20s?= =?UTF-8?q?weep?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the connection circuit breaker, resolving review feedback: - guarded(): a non-connection error (404/auth/decode) now closes the circuit too, not just a success. The BMC responded, so it is reachable -- without this a half-open probe that got a real error would stay fast-failed until the probe deadline. - Sensor sweep: replace the post-call breaker-state suppression (fragile under concurrency -- a probe success could flip the state mid-sweep and re-surface the fast-fails) with an explicit three-way decision (Full/Probe/Skip). When the backoff window elapses the sweep sends a single probe instead of fanning out every sensor, so a dead BMC costs one request rather than a fast-fail burst, and per-sensor failures are logged/counted normally again. - Document that only stream *establishment* runs through the breaker; long-lived SSE item errors are handled by the streaming collector's own reconnect backoff. - Add tests: legacy MissingVendor unit-variant still deserializes (observed: None) and the new form round-trips; non-connection error during a probe closes the circuit; collector_sweep returns Probe once the window elapses. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/api-model/src/site_explorer/mod.rs | 30 +++++ crates/health/src/bmc.rs | 130 ++++++++++++++++++---- crates/health/src/collectors/sensors.rs | 70 ++++++------ 3 files changed, 171 insertions(+), 59 deletions(-) diff --git a/crates/api-model/src/site_explorer/mod.rs b/crates/api-model/src/site_explorer/mod.rs index 0fe7b5567b..0a0d07458e 100644 --- a/crates/api-model/src/site_explorer/mod.rs +++ b/crates/api-model/src/site_explorer/mod.rs @@ -2046,6 +2046,36 @@ mod explored_mlx_device_tests { } } + #[test] + fn missing_vendor_decodes_legacy_unit_variant() { + // Records written before `observed` was added are stored as the bare + // internally-tagged unit form. They must still deserialize, defaulting + // `observed` to None. + let legacy: EndpointExplorationError = + serde_json::from_str(r#"{"Type":"MissingVendor"}"#).expect("legacy form must decode"); + assert_eq!( + legacy, + EndpointExplorationError::MissingVendor { observed: None } + ); + } + + #[test] + fn missing_vendor_round_trips_with_observed() { + // New records carry the observed Vendor/Oem string and round-trip. + let with_observed = EndpointExplorationError::MissingVendor { + observed: Some("SomeNewVendor".to_string()), + }; + let json = serde_json::to_string(&with_observed).expect("serialize"); + let decoded: EndpointExplorationError = serde_json::from_str(&json).expect("deserialize"); + assert_eq!(decoded, with_observed); + + // And the absent case round-trips too. + let absent = EndpointExplorationError::MissingVendor { observed: None }; + let json = serde_json::to_string(&absent).expect("serialize"); + let decoded: EndpointExplorationError = serde_json::from_str(&json).expect("deserialize"); + assert_eq!(decoded, absent); + } + #[test] fn dpu_part_number_reads_card1_part_number() { assert_eq!( diff --git a/crates/health/src/bmc.rs b/crates/health/src/bmc.rs index 44cd85b200..ae907cd823 100644 --- a/crates/health/src/bmc.rs +++ b/crates/health/src/bmc.rs @@ -74,6 +74,19 @@ enum CircuitState { }, } +/// What a batch-oriented collector should do this iteration, derived from the +/// endpoint's circuit state via [`BmcClient::collector_sweep`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CollectorSweep { + /// Circuit closed — run the full batch as normal. + Full, + /// Backoff window elapsed — send a single probe to test reachability instead + /// of the full fan-out, so a still-dead BMC costs one request, not hundreds. + Probe, + /// Circuit open within the backoff window — skip entirely. + Skip, +} + pub type BoxFuture<'a, T> = Pin + Send + 'a>>; pub trait CredentialProvider: Send + Sync { @@ -246,10 +259,12 @@ impl BmcClient { /// Run a BMC operation through the connection circuit breaker. /// /// Fast-fails (without touching the network) while the circuit is open, and - /// updates the breaker based on the outcome: a success closes the circuit, - /// while a connect-level failure trips it. Non-connection failures (auth, - /// 404s, decode errors, …) pass through untouched — they are not the BMC - /// being unreachable, so they must not open the circuit. + /// updates the breaker based on the outcome. A connect-level failure trips + /// it. Any other outcome — a success, or a non-connection error such as a + /// 404/auth/decode — means the BMC actually answered, so it closes the + /// circuit. Closing on a non-connection error matters for the half-open + /// probe: without it a reachable-but-erroring BMC would stay fast-failed + /// until the probe deadline. async fn guarded( &self, op: impl std::future::Future>, @@ -263,6 +278,9 @@ impl BmcClient { Err(error) => { if is_connection_error(&error) { self.trip_circuit(&error); + } else { + // The BMC responded (just not happily); it is reachable. + self.note_reachable(); } Err(error) } @@ -363,20 +381,34 @@ impl BmcClient { self.circuit_tripped.store(true, Ordering::Release); } - /// Whether the circuit is currently blocking requests. Lets callers (e.g. the - /// sensor sweep) skip a whole batch of work — and its log spam — instead of - /// fast-failing each request individually. Returns `false` when the open - /// window has elapsed so the next sweep can probe and self-heal. - pub fn circuit_blocks_requests(&self) -> bool { - // Fast path: a closed circuit blocks nothing. + /// What a batch-oriented caller (e.g. the sensor sweep) should do this + /// iteration, so it can avoid both the request flood and its log spam: + /// run the full batch, send a single probe, or skip entirely. Reading this + /// once up front — rather than letting each request fast-fail individually — + /// keeps a dead endpoint from re-emitting a per-request burst every time the + /// backoff window elapses. + pub fn collector_sweep(&self) -> CollectorSweep { + // Fast path: a closed circuit runs normally and never takes the lock. if !self.circuit_tripped.load(Ordering::Acquire) { - return false; + return CollectorSweep::Full; } let state = self.circuit.lock().expect("circuit mutex poisoned"); match *state { - CircuitState::Closed => false, - CircuitState::Open { until, .. } => Instant::now() < until, - CircuitState::Probing { deadline, .. } => Instant::now() < deadline, + CircuitState::Closed => CollectorSweep::Full, + CircuitState::Open { until, .. } => { + if Instant::now() < until { + CollectorSweep::Skip + } else { + CollectorSweep::Probe + } + } + CircuitState::Probing { deadline, .. } => { + if Instant::now() < deadline { + CollectorSweep::Skip + } else { + CollectorSweep::Probe + } + } } } @@ -577,6 +609,14 @@ impl Bmc for BmcClient { .guarded(async { self.inner.stream(uri).await.map_err(HealthError::from) }) .await { + // Only stream *establishment* runs through the breaker. Per-item + // errors on the returned long-lived stream (e.g. a mid-stream SSE + // disconnect) are intentionally not fed back into it: streaming + // collectors own a reconnect loop with their own exponential backoff, + // and the breaker is scoped to the periodic-collector request flood — + // many short requests against a dead endpoint — not a single + // long-lived connection. Routing item errors here would also couple + // log-stream health to sensor/discovery collection. Ok(stream) => Ok(Box::pin(stream.map_err(HealthError::from))), Err(error) => Err(self .refresh_auth_if_needed(error, credential_generation) @@ -783,14 +823,19 @@ mod tests { let client = Arc::new(BmcClient::new(reqwest(), addr, provider, None, 10).expect("constructor ok")); - assert!(!client.circuit_blocks_requests(), "circuit starts closed"); + assert_eq!( + client.collector_sweep(), + CollectorSweep::Full, + "circuit starts closed" + ); // Any real Redfish read against the closed port fails to connect. let result = nv_redfish::ServiceRoot::new(client.clone()).await; assert!(result.is_err(), "connecting to a closed port must fail"); - assert!( - client.circuit_blocks_requests(), + assert_eq!( + client.collector_sweep(), + CollectorSweep::Skip, "a genuine connect failure must be classified as a connection error and open the breaker" ); } @@ -800,12 +845,12 @@ mod tests { let client = test_client(); // Starts closed: requests flow. - assert!(!client.circuit_blocks_requests()); + assert_eq!(client.collector_sweep(), CollectorSweep::Full); assert!(client.check_circuit().is_ok()); // A connect-level failure opens the circuit. client.trip_circuit(&dummy_error()); - assert!(client.circuit_blocks_requests()); + assert_eq!(client.collector_sweep(), CollectorSweep::Skip); assert!( client.check_circuit().is_err(), "open circuit must fast-fail" @@ -813,10 +858,53 @@ mod tests { // A success closes it again. client.note_reachable(); - assert!(!client.circuit_blocks_requests()); + assert_eq!(client.collector_sweep(), CollectorSweep::Full); assert!(client.check_circuit().is_ok()); } + #[tokio::test] + async fn non_connection_error_during_probe_closes_circuit() { + let client = test_client(); + + // An open window that has elapsed: the next caller through `guarded` + // becomes the half-open probe. + client.set_circuit_for_test(CircuitState::Open { + until: Instant::now() - Duration::from_secs(1), + backoff: CIRCUIT_INITIAL_BACKOFF, + }); + + // The probe reaches the BMC and gets a real (non-connection) error. + let result: Result<(), HealthError> = client + .guarded(async { + Err(HealthError::BmcError(Box::new(bmc_status_error( + http::StatusCode::NOT_FOUND, + )))) + }) + .await; + assert!(result.is_err()); + + assert_eq!( + client.collector_sweep(), + CollectorSweep::Full, + "a non-connection response proves reachability and must close the circuit, \ + not leave it half-open until the probe deadline" + ); + } + + #[test] + fn collector_sweep_probes_once_window_elapses() { + let client = test_client(); + client.set_circuit_for_test(CircuitState::Open { + until: Instant::now() - Duration::from_secs(1), + backoff: CIRCUIT_INITIAL_BACKOFF, + }); + assert_eq!( + client.collector_sweep(), + CollectorSweep::Probe, + "an elapsed backoff window should admit a single probe, not a full sweep" + ); + } + #[test] fn fast_path_hint_tracks_circuit_state() { let client = test_client(); @@ -863,7 +951,7 @@ mod tests { ); // ...and everyone else keeps fast-failing while the probe is in flight. assert!(client.check_circuit().is_err()); - assert!(client.circuit_blocks_requests()); + assert_eq!(client.collector_sweep(), CollectorSweep::Skip); } #[test] diff --git a/crates/health/src/collectors/sensors.rs b/crates/health/src/collectors/sensors.rs index 38e33e5f31..f3b5b3a5ee 100644 --- a/crates/health/src/collectors/sensors.rs +++ b/crates/health/src/collectors/sensors.rs @@ -24,6 +24,7 @@ use nv_redfish::core::{Bmc, EntityTypeRef, ToSnakeCase}; use nv_redfish::sensor::SensorLink; use crate::HealthError; +use crate::bmc::CollectorSweep; use crate::collectors::inventory::{DiscoveredEntity, SharedInventory}; use crate::collectors::runtime::{IterationResult, PeriodicCollector}; use crate::endpoint::BmcEndpoint; @@ -102,13 +103,15 @@ impl PeriodicCollector for SensorCollector { }); }; - // If the endpoint's connection circuit breaker is open, the BMC is known - // to be unreachable. Skip the whole sensor fan-out rather than firing one - // request per sensor (each blocking for a connect timeout and logging a - // warning). When the backoff window elapses the breaker stops blocking, - // so the next sweep runs and one fetch probes for recovery. See NVBug - // 6036327. - if self.endpoint.bmc.circuit_blocks_requests() { + // Consult the endpoint's connection circuit breaker. When the BMC is + // unreachable, firing one request per sensor would block on a connect + // timeout apiece and log a warning apiece. So: skip entirely while the + // backoff window is open, and once it elapses send a *single* probe + // instead of the full fan-out — a still-dead BMC then costs one request, + // not hundreds, and one fetch is enough to let the breaker self-heal. + // See NVBug 6036327. + let sweep = self.endpoint.bmc.collector_sweep(); + if sweep == CollectorSweep::Skip { tracing::debug!( bmc_addr = ?self.endpoint.addr, "BMC connection circuit is open; skipping sensor iteration" @@ -119,12 +122,14 @@ impl PeriodicCollector for SensorCollector { fetch_failures: 0, }); } + let probe_only = sweep == CollectorSweep::Probe; tracing::debug!( bmc_addr = ?self.endpoint.addr, generation = inventory.generation, inventory_age_secs = inventory.discovered_at.elapsed().as_secs(), entity_count = inventory.entities.len(), + probe_only, "Reading entity inventory snapshot for sensor iteration" ); @@ -132,26 +137,32 @@ impl PeriodicCollector for SensorCollector { self.emit_event(CollectorEvent::MetricCollectionStart); // Entity-level derived metrics (drive media life, PSU capacity), once - // per entity. - for entity in &inventory.entities { - self.emit_derived_metrics(entity); + // per entity. Skipped while probing — they would emit metrics from stale + // inventory for a BMC we already believe is down. + if !probe_only { + for entity in &inventory.entities { + self.emit_derived_metrics(entity); + } } // Build the fetch futures borrowing from the shared snapshot, then // drive them concurrently. Each future borrows `&self`, the entity, and - // its sensor (all alive for as long as `inventory` is held here). + // its sensor (all alive for as long as `inventory` is held here). When + // probing, take just the first sensor: one fetch is enough to test + // reachability and re-arm or clear the breaker. let this = &*self; let failures = &fetch_failures; - let futures: Vec<_> = inventory - .entities - .iter() - .flat_map(|entity| { - entity - .sensors() - .iter() - .map(move |sensor| this.update_sensor(entity, sensor, failures)) - }) - .collect(); + let fetches = inventory.entities.iter().flat_map(|entity| { + entity + .sensors() + .iter() + .map(move |sensor| this.update_sensor(entity, sensor, failures)) + }); + let futures: Vec<_> = if probe_only { + fetches.take(1).collect() + } else { + fetches.collect() + }; let processed: usize = stream::iter(futures) .buffer_unordered(self.sensor_fetch_concurrency) @@ -217,23 +228,6 @@ impl SensorCollector { let sensor = match sensor_link.fetch().await { Ok(s) => s, Err(e) => { - // When the endpoint's connection circuit breaker is blocking, this - // failure is a symptom of the BMC being unreachable — either a - // breaker fast-fail or the single recovery probe timing out — not a - // problem with this sensor. Log it quietly and don't inflate the - // fetch-failure count; otherwise a dead BMC emits a warning per - // sensor every time the backoff window elapses and a sweep probes. - // The breaker logs one warning of its own when it opens. See NVBug - // 6036327. - if self.endpoint.bmc.circuit_blocks_requests() { - tracing::debug!( - sensor_id = %sensor_link.odata_id(), - entity_type = entity.entity_type(), - error = ?e, - "Skipping sensor fetch failure; BMC connection circuit is open" - ); - return 0; - } fetch_failures.fetch_add(1, Ordering::Relaxed); tracing::warn!( sensor_id = %sensor_link.odata_id(), From 01eec589f1a07da05e73249ec25d166d01fdda94 Mon Sep 17 00:00:00 2001 From: Bill Minckler Date: Tue, 30 Jun 2026 19:11:30 +0000 Subject: [PATCH 3/3] fix: update MissingVendor usages for struct-variant after merge Latest main added new `MissingVendor` references (the error_code and operator_mitigation match arms, plus rpc/api-model tests) that still treated it as a unit variant. Update them for the `MissingVendor { observed }` struct variant introduced earlier on this branch. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/api-model/src/site_explorer/mod.rs | 6 +++--- crates/rpc/src/model/site_explorer.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/api-model/src/site_explorer/mod.rs b/crates/api-model/src/site_explorer/mod.rs index 0a0d07458e..1b125e5a6d 100644 --- a/crates/api-model/src/site_explorer/mod.rs +++ b/crates/api-model/src/site_explorer/mod.rs @@ -1317,7 +1317,7 @@ impl OperatorError for EndpointExplorationError { ErrorCode::nico(SiteExplorer, 120) } EndpointExplorationError::MissingRedfish { .. } => ErrorCode::nico(SiteExplorer, 121), - EndpointExplorationError::MissingVendor => ErrorCode::nico(SiteExplorer, 122), + EndpointExplorationError::MissingVendor { .. } => ErrorCode::nico(SiteExplorer, 122), EndpointExplorationError::RedfishError { .. } => ErrorCode::nico(SiteExplorer, 130), EndpointExplorationError::VikingFWInventoryForbiddenError { .. } => { ErrorCode::nico(SiteExplorer, 131) @@ -1349,7 +1349,7 @@ impl OperatorError for EndpointExplorationError { "Verify endpoint network reachability and that the BMC Redfish service is listening.", ), EndpointExplorationError::UnsupportedVendor { .. } - | EndpointExplorationError::MissingVendor => Some( + | EndpointExplorationError::MissingVendor { .. } => Some( "Confirm the endpoint's BMC vendor and model are listed in the NICo Hardware \ Compatibility List \ (https://docs.nvidia.com/infra-controller/documentation/reference/hardware-compatibility-list); \ @@ -2468,7 +2468,7 @@ mod tests { EndpointExplorationError::UnsupportedVendor { vendor: "unknown".to_string(), } => true, - EndpointExplorationError::MissingVendor => true, + EndpointExplorationError::MissingVendor { observed: None } => true, } ); } diff --git a/crates/rpc/src/model/site_explorer.rs b/crates/rpc/src/model/site_explorer.rs index 5b24b0fe4c..47e7343308 100644 --- a/crates/rpc/src/model/site_explorer.rs +++ b/crates/rpc/src/model/site_explorer.rs @@ -426,7 +426,7 @@ mod tests { #[test] fn endpoint_report_propagates_operator_error_schema_to_rpc() { - let error = EndpointExplorationError::MissingVendor; + let error = EndpointExplorationError::MissingVendor { observed: None }; let expected_schema = error.operator_error_schema(); let report =